From d1429ac4a55a2f6cbaeaf1cca572601e5d344667 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Thu, 15 Nov 2018 19:46:22 +0800
Subject: [PATCH 001/113] add recordio support

---
 CMakeLists.txt                                |  6 +-
 cmake/external/eigen.cmake                    | 10 +--
 cmake/external/gflags.cmake                   |  5 +-
 cmake/external/glog.cmake                     |  3 +-
 cmake/external/gtest.cmake                    |  5 +-
 cmake/external/protobuf.cmake                 |  5 +-
 cmake/external/snappy.cmake                   | 12 +++-
 cmake/external/snappystream.cmake             | 61 +++++++++++--------
 cmake/external/zlib.cmake                     |  5 +-
 paddle/fluid/CMakeLists.txt                   |  6 +-
 paddle/fluid/framework/CMakeLists.txt         |  6 +-
 paddle/fluid/operators/CMakeLists.txt         |  8 +--
 .../operators/reader/create_py_reader_op.cc   |  2 +-
 paddle/fluid/operators/roi_align_op.cc        |  6 +-
 paddle/fluid/operators/roi_pool_op.cc         |  6 +-
 paddle/fluid/operators/space_to_depth_op.cc   |  2 +-
 paddle/fluid/platform/port.h                  | 10 +--
 paddle/fluid/pybind/CMakeLists.txt            |  5 +-
 paddle/fluid/pybind/pybind.cc                 |  9 +--
 python/paddle/fluid/layers/nn.py              |  6 ++
 20 files changed, 97 insertions(+), 81 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 996a79fbbc..d6e7b88f86 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -190,11 +190,11 @@ include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/cub)
 include(external/xxhash)    # download xxhash
-
-if (NOT WIN32)
-# there is no official support of snappystream, warpctc, nccl, cupti in windows
 include(external/snappy)    # download snappy
 include(external/snappystream) # download snappystream
+
+if (NOT WIN32)
+# there is no official support of warpctc, nccl, cupti in windows
 include(external/warpctc)   # download, build, install warpctc
 include(cupti)
 endif (NOT WIN32)
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 573ad5e5f0..98079678ae 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -16,8 +16,9 @@ if(WITH_AMD_GPU)
     ExternalProject_Add(
         extern_eigen3
         ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY  "https://github.com/sabreshao/hipeigen.git"
-        GIT_TAG         0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
+#        GIT_REPOSITORY  "https://github.com/sabreshao/hipeigen.git"
+#        GIT_TAG         0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
+            GIT_REPOSITORY  "http://admin@172.20.90.14:8080/r/eigen3.git"
         PREFIX          ${EIGEN_SOURCE_DIR}
         UPDATE_COMMAND  ""
         CONFIGURE_COMMAND ""
@@ -29,10 +30,11 @@ else()
     ExternalProject_Add(
         extern_eigen3
         ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY  "https://github.com/eigenteam/eigen-git-mirror"
+#            GIT_REPOSITORY  "https://github.com/eigenteam/eigen-git-mirror"
+            GIT_REPOSITORY  "http://admin@172.20.90.14:8080/r/eigen3.git"
         # eigen on cuda9.1 missing header of math_funtions.hpp
         # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
-        GIT_TAG         917060c364181f33a735dc023818d5a54f60e54c
+#        GIT_TAG         917060c364181f33a735dc023818d5a54f60e54c
         PREFIX          ${EIGEN_SOURCE_DIR}
         DOWNLOAD_NAME   "eigen"
         UPDATE_COMMAND  ""
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index 4e98e4bf88..7c062d682c 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -28,8 +28,9 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
 ExternalProject_Add(
     extern_gflags
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
-    GIT_TAG         77592648e3f3be87d6c7123eb81cbad75f9aef5a
+#    GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
+    GIT_REPOSITORY  "http://admin@172.20.90.14:8080/r/gflags.git"
+#    GIT_TAG         77592648e3f3be87d6c7123eb81cbad75f9aef5a
     PREFIX          ${GFLAGS_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index 8cd0455c16..a3f3c6adf3 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -34,13 +34,14 @@ ELSE()
   SET(GLOG_REPOSITORY "https://github.com/google/glog.git")
   SET(GLOG_TAG "v0.3.5")
 ENDIF()
+  SET(GLOG_REPOSITORY "http://admin@172.20.90.14:8080/r/glog.git")
 
 ExternalProject_Add(
     extern_glog
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS gflags
     GIT_REPOSITORY  ${GLOG_REPOSITORY}
-    GIT_TAG         ${GLOG_TAG}
+   # GIT_TAG         ${GLOG_TAG}
     PREFIX          ${GLOG_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index d335298742..da539d52bd 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -43,8 +43,9 @@ IF(WITH_TESTING)
         extern_gtest
         ${EXTERNAL_PROJECT_LOG_ARGS}
         DEPENDS         ${GTEST_DEPENDS}
-        GIT_REPOSITORY  "https://github.com/google/googletest.git"
-        GIT_TAG         "release-1.8.0"
+            #        GIT_REPOSITORY  "https://github.com/google/googletest.git"
+                    GIT_REPOSITORY  "http://admin@172.20.90.14:8080/r/gtest.git"
+#        GIT_TAG         "release-1.8.0"
         PREFIX          ${GTEST_SOURCES_DIR}
         UPDATE_COMMAND  ""
         CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index e1e619e572..94d8ac30cc 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -202,8 +202,9 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
         SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64")
     ENDIF()
 
-    SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
-    SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
+   # SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
+  #  SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
+    SET(PROTOBUF_REPO http://admin@172.20.90.14:8080/r/protobuf.git)
     IF(MOBILE_INFERENCE)
         # The reason why the official version is not used is described in
         # https://github.com/PaddlePaddle/Paddle/issues/6114
diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake
index af09ed4d5d..b30403d2d8 100644
--- a/cmake/external/snappy.cmake
+++ b/cmake/external/snappy.cmake
@@ -24,7 +24,11 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
 set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
 set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
 
-set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
+if (WIN32)
+    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib")
+else(WIN32)
+    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
+endif (WIN32)
 
 ExternalProject_Add(
     extern_snappy
@@ -34,8 +38,12 @@ ExternalProject_Add(
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                     -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
                     -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                     -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
                     -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
                     -DCMAKE_POSITION_INDEPENDENT_CODE=ON
diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake
index 6df636d7fa..1ec79462c1 100644
--- a/cmake/external/snappystream.cmake
+++ b/cmake/external/snappystream.cmake
@@ -18,36 +18,45 @@ ENDIF()
 
 include (ExternalProject)
 
-# NOTE: snappy is needed when linking with recordio
-
 set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
 set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
 set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE)
 
-set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
-
-ExternalProject_Add(
-        extern_snappystream
-        GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git"
-        GIT_TAG "0.2.8"
-        PREFIX          ${SNAPPYSTREAM_SOURCES_DIR}
-        UPDATE_COMMAND  ""
-        CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                        -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-                        -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                        -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-                        -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                        -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
-                        ${EXTERNAL_OPTIONAL_ARGS}
-                        CMAKE_CACHE_ARGS
-                        -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
-                        -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
-                        -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-        DEPENDS snappy
-)
+if(WIN32)
+    # Fix me, VS2015 come without VLA support
+    set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/snappystream.lib")
+    MESSAGE(WARNING, "In windows, snappystream has no compile support for windows,
+    please build it manually and put it at " ${SNAPPYSTREAM_INSTALL_DIR})
+else(WIN32)
+    set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
+
+    ExternalProject_Add(
+            extern_snappystream
+            GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git"
+            GIT_TAG "0.2.8"
+            PREFIX          ${SNAPPYSTREAM_SOURCES_DIR}
+            UPDATE_COMMAND  ""
+            CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+                            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                            -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                            -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                            -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+                            -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                            -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                            -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+                            -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
+                            -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
+                            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                            -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                            -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
+                            ${EXTERNAL_OPTIONAL_ARGS}
+                            CMAKE_CACHE_ARGS
+                            -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
+                            -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
+                            -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+            DEPENDS snappy
+    )
+endif(WIN32)
 
 add_library(snappystream STATIC IMPORTED GLOBAL)
 set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES})
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index c3d7323545..456f26385c 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -31,8 +31,9 @@ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zl
 ExternalProject_Add(
     extern_zlib
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/madler/zlib.git"
-    GIT_TAG         "v1.2.8"
+        #    GIT_REPOSITORY  "https://github.com/madler/zlib.git"
+            GIT_REPOSITORY  "http://admin@172.20.90.14:8080/r/zlib.git"
+#    GIT_TAG         "v1.2.8"
     PREFIX          ${ZLIB_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index abadda3adb..6b526f0103 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -3,13 +3,9 @@ add_subdirectory(platform)
 add_subdirectory(framework)
 add_subdirectory(operators)
 add_subdirectory(string)
-
-add_subdirectory(pybind)
-if (NOT WIN32)
 add_subdirectory(recordio)
-endif(NOT WIN32)
+add_subdirectory(pybind)
 
 # NOTE: please add subdirectory inference at last.
 add_subdirectory(inference)
-
 add_subdirectory(train)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index cb9057672c..42af482f85 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -68,11 +68,7 @@ if(WITH_GPU)
 else()
   cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
 endif()
-if (NOT WIN32)
-  cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
-else()
-  cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
-endif (NOT WIN32)
+cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
 
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index f06ef199d1..edd062e175 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -95,7 +95,8 @@ function(op_library TARGET)
     foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
      "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
       "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op"
-            "fusion_seqexpand_concat_fc_op" "attention_lstm_op" "fused_embedding_fc_lstm_op" "fc_op")
+            "fusion_seqexpand_concat_fc_op" "attention_lstm_op" "fused_embedding_fc_lstm_op" "fc_op"
+            )
         if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
           return()
         endif()
@@ -225,7 +226,6 @@ if(WITH_DISTRIBUTE)
             ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
             SET_PROPERTY(TARGET ibverbs PROPERTY IMPORTED_LOCATION ${IBVERBS_LIBRARY})
 
-
             find_library(RDMACM_LIBRARY NAMES rdmacm)
             ADD_LIBRARY(rdmacm SHARED IMPORTED GLOBAL)
             SET_PROPERTY(TARGET rdmacm PROPERTY IMPORTED_LOCATION ${RDMACM_LIBRARY})
@@ -338,11 +338,7 @@ foreach(src ${GENERAL_OPS})
 endforeach()
 
 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
-
-
-if (NOT WIN32)
 add_subdirectory(reader)
-endif(NOT WIN32)
 foreach(src ${READER_LIBRARY})
     set(OP_LIBRARY ${src} ${OP_LIBRARY})
 endforeach()
diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc
index 0f31ca1a94..901a92ab5b 100644
--- a/paddle/fluid/operators/reader/create_py_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_py_reader_op.cc
@@ -74,7 +74,7 @@ class CreatePyReaderOpMaker : public FileReaderMakerBase {
              "Name of the `LoDTensorBlockingQueueHolder` variable");
 
     AddComment(R"DOC(
-			Create PyReader to support LoDTensor data feeding in Python side.
+      Create PyReader to support LoDTensor data feeding in Python side.
       )DOC");
   }
 };
diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc
index c57a34c3a7..79f189222e 100644
--- a/paddle/fluid/operators/roi_align_op.cc
+++ b/paddle/fluid/operators/roi_align_op.cc
@@ -35,10 +35,10 @@ class ROIAlignOp : public framework::OperatorWithKernel {
                    "The format of input tensor is NCHW.");
     PADDLE_ENFORCE(rois_dims.size() == 2,
                    "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
-                   "given as [[x1, y1, x2, y2], …].");
+                   "given as [[x1, y1, x2, y2], ...].");
     PADDLE_ENFORCE(rois_dims[1] == 4,
                    "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
-                   "given as [[x1, y1, x2, y2], …].");
+                   "given as [[x1, y1, x2, y2], ...].");
     int pooled_height = ctx->Attrs().Get<int>("pooled_height");
     int pooled_width = ctx->Attrs().Get<int>("pooled_width");
     float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
@@ -103,7 +103,7 @@ class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker {
              "(LoDTensor), "
              "ROIs (Regions of Interest) to pool over. "
              "should be a 2-D LoDTensor of shape (num_rois, 4)"
-             "given as [[x1, y1, x2, y2], …]. "
+             "given as [[x1, y1, x2, y2], ...]. "
              "(x1, y1) is the top left coordinates, and "
              "(x2, y2) is the bottom right coordinates.");
     AddOutput("Out",
diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc
index 043ea680d1..3f6b2e46c7 100644
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
@@ -40,10 +40,10 @@ class ROIPoolOp : public framework::OperatorWithKernel {
                    "The format of input tensor is NCHW.");
     PADDLE_ENFORCE(rois_dims.size() == 2,
                    "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
-                   "given as [[x1, y1, x2, y2], …].");
+                   "given as [[x1, y1, x2, y2], ...].");
     PADDLE_ENFORCE(rois_dims[1] == kROISize,
                    "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
-                   "given as [[x1, y1, x2, y2], …].");
+                   "given as [[x1, y1, x2, y2], ...].");
 
     int pooled_height = ctx->Attrs().Get<int>("pooled_height");
     int pooled_width = ctx->Attrs().Get<int>("pooled_width");
@@ -110,7 +110,7 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
              "(LoDTensor), "
              "ROIs (Regions of Interest) to pool over. "
              "should be a 2-D LoDTensor of shape (num_rois, 4)"
-             "given as [[x1, y1, x2, y2], …]. "
+             "given as [[x1, y1, x2, y2], ...]. "
              "Where batch_id is the id of the data, "
              "(x1, y1) is the top left coordinates, and "
              "(x2, y2) is the bottom right coordinates.");
diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc
index f109dd685c..b579244673 100644
--- a/paddle/fluid/operators/space_to_depth_op.cc
+++ b/paddle/fluid/operators/space_to_depth_op.cc
@@ -86,7 +86,7 @@ class SpaceToDepthOpMaker : public framework::OpProtoAndCheckerMaker {
         .GreaterThan(1);
     AddComment(R"DOC(
         reorg operator used in Yolo v2.
-        The equation is: C2 = C1/blocksize * blocksize, W2 = W1 ∗ blocksize + offset % blocksize, H2 = H1 ∗ blocksize + offset / blocksize, 
+        The equation is: C2 = C1/blocksize * blocksize, W2 = W1 * blocksize + offset % blocksize, H2 = H1 * blocksize + offset / blocksize,
 
         Reshape Input(X) into the shape according to Attr(blocksize). The
         data in Input(X) are unchanged.
diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
index 8823e97b0b..a07b993c8a 100644
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -132,10 +132,12 @@ static void MkDir(const char *path) {
     }
   }
 #else
-  CreateDirectory(path, NULL);
-  auto errorno = GetLastError();
-  if (errorno != ERROR_ALREADY_EXISTS) {
-    throw std::runtime_error(path_error);
+  BOOL return_value = CreateDirectory(path, NULL);
+  if (!return_value) {
+    auto errorno = GetLastError();
+    if (errorno != ERROR_ALREADY_EXISTS) {
+      throw std::runtime_error(path_error);
+    }
   }
 #endif  // !_WIN32
 }
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 6afa53cd36..cd8256f1c7 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,9 +1,8 @@
 
-set(PYBIND_DEPS pybind python proto_desc memory executor prune  feed_fetch_method pass_builder)
-set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc)
+set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder)
+set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc)
 if(NOT WIN32)
   list(APPEND PYBIND_DEPS parallel_executor profiler)
-  list(APPEND PYBIND_SRCS recordio.cc)
 endif(NOT WIN32)
 if(WITH_PYTHON)
   if(WITH_AMD_GPU)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 0d059d8aea..89959c389f 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -357,19 +357,16 @@ All parameter, weight, gradient are variables in Paddle.
              return self.GetMutable<platform::Communicator>();
            },
            py::return_value_policy::reference)
+#endif
       .def("get_reader",
            [](Variable &self) -> framework::ReaderHolder * {
              PADDLE_ENFORCE(self.IsType<framework::ReaderHolder>());
              return self.GetMutable<framework::ReaderHolder>();
            },
-           py::return_value_policy::reference)
-#endif
-      ;
+           py::return_value_policy::reference);
 
-#if !defined(_WIN32)
   py::class_<framework::ReaderHolder>(m, "Reader", "")
       .def("reset", &framework::ReaderHolder::ResetAll);
-#endif
 
   using LoDTensorBlockingQueue =
       ::paddle::operators::reader::LoDTensorBlockingQueue;
@@ -914,9 +911,9 @@ All parameter, weight, gradient are variables in Paddle.
         pybind11::gil_scoped_release release;
         self.Run(fetch_tensors, fetched_var_name);
       });
+#endif
 
   BindRecordIOWriter(&m);
-#endif
   return m.ptr();
 }
 }  // namespace pybind
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 1b5009e761..2971319141 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -169,6 +169,12 @@ __all__ = [
     'bilinear_tensor_product',
 ]
 
+# To avoid the api checker complains
+if os.name == 'nt':
+    __all__.remove('dynamic_lstm')
+    __all__.remove('crf_decoding')
+    __all__.remove('roi_pool')
+
 
 def fc(input,
        size,

From 162f2d410912ebbe6dae12c4120d97ea69b9ffda Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Fri, 16 Nov 2018 10:58:48 +0800
Subject: [PATCH 002/113] disable the openblas multi-thread on windows since no
 support adjust the python script

---
 paddle/fluid/platform/cpu_helper.cc       |   6 +
 paddle/fluid/platform/init.cc             |   7 -
 python/paddle/fluid/__init__.py           |   3 +-
 python/paddle/fluid/contrib/inferencer.py |   4 +-
 python/paddle/fluid/contrib/trainer.py    |   3 +-
 python/paddle/fluid/parallel_executor.py  | 495 +++++++++++-----------
 6 files changed, 258 insertions(+), 260 deletions(-)

diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc
index 234a04b5c2..4e52e8ff00 100644
--- a/paddle/fluid/platform/cpu_helper.cc
+++ b/paddle/fluid/platform/cpu_helper.cc
@@ -29,6 +29,12 @@ namespace platform {
 
 void SetNumThreads(int num_threads) {
 #ifdef PADDLE_USE_OPENBLAS
+  // windows has no support for openblas multi-thread
+#ifdef _WIN32
+  if (num_threads > 1) {
+    num_threads = 1;
+  }
+#endif
   int real_num_threads = num_threads > 1 ? num_threads : 1;
   openblas_set_num_threads(real_num_threads);
 #elif defined(PADDLE_WITH_MKLML)
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 84d1b852cb..69bbe8794d 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -113,13 +113,6 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
   places.emplace_back(platform::CPUPlace());
   platform::DeviceContextPool::Init(places);
 
-// windows has no support for openblas multi-thread
-#ifdef _WIN32
-  if (FLAGS_paddle_num_threads > 1) {
-    FLAGS_paddle_num_threads = 1;
-  }
-#endif
-
 #ifndef PADDLE_WITH_MKLDNN
   platform::SetNumThreads(FLAGS_paddle_num_threads);
 #endif
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 8129918916..dbe49c98bd 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -47,7 +47,8 @@ from . import profiler
 from . import unique_name
 from . import recordio_writer
 from . import parallel_executor
-from .parallel_executor import *
+if os.name != 'nt':
+    from .parallel_executor import *
 from paddle.fluid.layers.math_op_patch import monkey_patch_variable
 
 Tensor = LoDTensor
diff --git a/python/paddle/fluid/contrib/inferencer.py b/python/paddle/fluid/contrib/inferencer.py
index b966ae01d0..b8d5f4ffea 100644
--- a/python/paddle/fluid/contrib/inferencer.py
+++ b/python/paddle/fluid/contrib/inferencer.py
@@ -15,15 +15,13 @@
 from __future__ import print_function
 
 import contextlib
-import os
 
 from .. import core
 
 from .. import executor
 from .. import framework
 from .. import io
-if os.name != 'nt':
-    from .. import parallel_executor
+from .. import parallel_executor
 from .. import unique_name
 from .trainer import check_and_get_place
 
diff --git a/python/paddle/fluid/contrib/trainer.py b/python/paddle/fluid/contrib/trainer.py
index 096821a5ba..8569e486f9 100644
--- a/python/paddle/fluid/contrib/trainer.py
+++ b/python/paddle/fluid/contrib/trainer.py
@@ -28,8 +28,7 @@ from .. import framework
 from .. import io
 # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module
 from .. import optimizer as opt_module
-if os.name != 'nt':
-    from .. import parallel_executor
+from .. import parallel_executor
 from ..transpiler import distribute_transpiler
 
 __all__ = [
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 3f4dd5eb71..33f6df67a4 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -25,263 +25,264 @@ import os
 
 __all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy']
 
-ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
-BuildStrategy = core.ParallelExecutor.BuildStrategy
-
-
-class ParallelExecutor(object):
-    """
-    ParallelExecutor is designed for data parallelism, which focuses on distributing
-    the data across different nodes and every node operates on the data in parallel.
-    If you use ParallelExecutor to run the current program on GPU, the node means GPU
-    device, and ParallelExecutor will get the available GPU device automatically on
-    the current machine. If you use ParallelExecutor to run the current program on CPU,
-    the node means the CPU device, and you can specify the CPU device number by adding
-    'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable
-    is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number
-    of CPUs in the system.
-
-    Args:
-        use_cuda (bool): Whether to use CUDA or not.
-        loss_name (str): The loss name must set in training. Default None.
-        main_program (Program): The program that need to run, if not provided,
-            then default_main_program will be used. Default None.
-        share_vars_from(ParallelExecutor): If provide, it will share variables
-            from the specified ParallelExecutor. Default None.
-        exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run
-            the program in ParallelExecutor, for example how many threads are used to
-            execute the program, how many iterations to clean up the temp variables
-            which is generated during execution. For more information, please refer
-            to fluid.ExecutionStrategy. Default None.
-        build_strategy(BuildStrategy): build_strategy is used to control how to
-            build the SSA Graph in ParallelExecutor by setting the property,
-            for example reduce_strategy, gradient_scale_strategy. For more information,
-            please refer to fluid.BuildStrategy. Default None.
-        num_trainers(int): If greater than 1, NCCL will be initialized with
-            multiple rank of nodes, each node should have same number of GPUs.
-            Distributed training will be enabled then. Default 1.
-        trainer_id(int): Must use together with num_trainers. trainer_id is the
-            "rank" of current node starts from 0. Default 0.
-        scope(Scope): scope to run with, default use fluid.global_scope().
-
-    Returns:
-        ParallelExecutor: The initialized ParallelExecutor object.
-
-    Raises:
-        TypeError: If share_vars_from is provided, but not ParallelExecutor object.
-
-    Examples:
-        .. code-block:: python
-
-          train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
-          test_exe = fluid.ParallelExecutor(use_cuda=True,
-                                            main_program=test_program,
-                                            share_vars_from=train_exe)
-
-          train_loss, = train_exe.run([loss.name], feed=feed_dict)
-          test_loss, = test_exe.run([loss.name], feed=feed_dict)
-    """
-
-    def __init__(self,
-                 use_cuda,
-                 loss_name=None,
-                 main_program=None,
-                 share_vars_from=None,
-                 exec_strategy=None,
-                 build_strategy=None,
-                 num_trainers=1,
-                 trainer_id=0,
-                 scope=None):
-        self._places = []
-        self._act_places = []
-        if use_cuda:
-            for i in six.moves.range(core.get_cuda_device_count()):
-                p = core.Place()
-                self._act_places.append(core.CUDAPlace(i))
-                p.set_place(self._act_places[-1])
-                self._places.append(p)
-        else:
-            cpu_num = int(
-                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-            for i in six.moves.range(cpu_num):
-                p = core.Place()
-                self._act_places.append(core.CPUPlace())
-                p.set_place(self._act_places[-1])
-                self._places.append(p)
-        assert self._places, "no place for execution"
-
-        if exec_strategy is None:
-            exec_strategy = ExecutionStrategy()
-        exec_strategy.use_cuda = use_cuda
-
-        if exec_strategy.num_threads == 0:
-            if use_cuda:
-                # Experiments on se-resnext shows that too many threads hurt
-                # performance. Worth tunning for other models in the future.
-                exec_strategy.num_threads = len(self._places) * 4
-            else:
-                cpu_num = int(
-                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-                exec_strategy.num_threads = cpu_num * 2
-
-        # Set 1 thread num under nccl2 distribute 
-        #   env to make sure all gpus run ops in same order.
-        if num_trainers > 1:
-            assert (use_cuda)
-            # FIXME(gongwb): avoid this set.
-            exec_strategy.num_threads = 1
-
-        if build_strategy is None:
-            build_strategy = BuildStrategy()
-
-        main = main_program
-        main = main if main else framework.default_main_program()
-        if scope == None:
-            scope = executor.global_scope()
-
-        if share_vars_from and not isinstance(share_vars_from,
-                                              ParallelExecutor):
-            raise TypeError("share_vars_from must be ParallelExecutor.")
-
-        local_scopes = share_vars_from.executor.local_scopes(
-        ) if share_vars_from else []
-
-        self.persistable_vars = [
-            v.name for v in [
-                var for var in main.list_vars()
-                if var.persistable and var.type != core.VarDesc.VarType.RAW
-            ]
-        ]
-
-        self.executor = core.ParallelExecutor(
-            self._places,
-            set([
-                cpt.to_text(p.name)
-                for p in main.global_block().iter_parameters()
-                if not p.stop_gradient
-            ]),
-            set(cpt.to_text(var) for var in self.persistable_vars), main.desc,
-            cpt.to_text(loss_name)
-            if loss_name else six.u(''), scope, local_scopes, exec_strategy,
-            build_strategy, num_trainers, trainer_id)
-        self.scope = scope
-
-    def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
-        """
-        Run a parallel executor with fetch_list.
-
-        The feed parameter can be a dict or a list. If feed is a dict, the
-        feed data will be split into multiple devices. If feed is a list, we
-        assume the data has been splitted into multiple devices, the each
-        element in the list will be copied to each device directly.
-
-        For example, if the feed is a dict:
-
-        >>> exe = ParallelExecutor()
-        >>> # the image will be splitted into devices. If there is two devices
-        >>> # each device will process an image with shape (24, 1, 28, 28)
-        >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
+if os.name != 'nt':
+    ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
+    BuildStrategy = core.ParallelExecutor.BuildStrategy
 
-        For example, if the feed is a list:
 
-        >>> exe = ParallelExecutor()
-        >>> # each device will process each element in the list.
-        >>> # the 1st device will process an image with shape (48, 1, 28, 28)
-        >>> # the 2nd device will process an image with shape (32, 1, 28, 28)
-        >>> #
-        >>> # you can use exe.device_count to get the device number.
-        >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))},
-        >>>               {"image": numpy.random.random(size=(32, 1, 28, 28))},
-        >>>              ])
+    class ParallelExecutor(object):
+        """
+        ParallelExecutor is designed for data parallelism, which focuses on distributing
+        the data across different nodes and every node operates on the data in parallel.
+        If you use ParallelExecutor to run the current program on GPU, the node means GPU
+        device, and ParallelExecutor will get the available GPU device automatically on
+        the current machine. If you use ParallelExecutor to run the current program on CPU,
+        the node means the CPU device, and you can specify the CPU device number by adding
+        'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable
+        is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number
+        of CPUs in the system.
 
         Args:
-            fetch_list(list): The fetched variable names
-            feed(list|dict|None): The feed variables. If the feed is a dict,
-                tensors in that dict will be splitted into each devices. If
-                the feed is a list, each element of the list will be copied
-                to each device. Default None.
-            feed_dict: Alias for feed parameter, for backward compatibility.
-                This parameter has been deprecated. Default None.
-            return_numpy(bool): Whether converts the fetched tensor to numpy.
-                Default: True.
+            use_cuda (bool): Whether to use CUDA or not.
+            loss_name (str): The loss name must set in training. Default None.
+            main_program (Program): The program that need to run, if not provided,
+                then default_main_program will be used. Default None.
+            share_vars_from(ParallelExecutor): If provide, it will share variables
+                from the specified ParallelExecutor. Default None.
+            exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run
+                the program in ParallelExecutor, for example how many threads are used to
+                execute the program, how many iterations to clean up the temp variables
+                which is generated during execution. For more information, please refer
+                to fluid.ExecutionStrategy. Default None.
+            build_strategy(BuildStrategy): build_strategy is used to control how to
+                build the SSA Graph in ParallelExecutor by setting the property,
+                for example reduce_strategy, gradient_scale_strategy. For more information,
+                please refer to fluid.BuildStrategy. Default None.
+            num_trainers(int): If greater than 1, NCCL will be initialized with
+                multiple rank of nodes, each node should have same number of GPUs.
+                Distributed training will be enabled then. Default 1.
+            trainer_id(int): Must use together with num_trainers. trainer_id is the
+                "rank" of current node starts from 0. Default 0.
+            scope(Scope): scope to run with, default use fluid.global_scope().
 
         Returns:
-            List: The fetched result list.
+            ParallelExecutor: The initialized ParallelExecutor object.
 
         Raises:
-            ValueError: If the feed is a list, but its length is not equal the
-                length of active places, or its element's is not dict.
-
-        NOTES:
-            1. If the feed's type is dict, the number of data that feeds to
-               ParallelExecutor must be bigger than active places. Otherwise,
-               it will throw exception from C++ side. Special attention should be
-               paid to check whether the last batch of the dataset is bigger
-               than active places.
-            2. If active places are more than one, the fetch results for each
-               variable is a list, and each element of this list is the variable of
-               respective active place.
+            TypeError: If share_vars_from is provided, but not ParallelExecutor object.
 
         Examples:
             .. code-block:: python
 
-                pe = fluid.ParallelExecutor(use_cuda=use_cuda,
-                                            loss_name=avg_cost.name,
-                                            main_program=fluid.default_main_program())
-                loss = pe.run(feed=feeder.feed(cur_batch),
-                              fetch_list=[avg_cost.name]))
+              train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
+              test_exe = fluid.ParallelExecutor(use_cuda=True,
+                                                main_program=test_program,
+                                                share_vars_from=train_exe)
+
+              train_loss, = train_exe.run([loss.name], feed=feed_dict)
+              test_loss, = test_exe.run([loss.name], feed=feed_dict)
         """
-        if feed is None and feed_dict is not None:
-            feed = feed_dict
-            print(
-                "`feed_dict` is deprecated. Please use `feed=`",
-                file=sys.stderr)
-
-        if isinstance(feed, dict):
-            feed_tensor_dict = dict()
-            for feed_name in feed:
-                feed_tensor = feed[feed_name]
-                if not isinstance(feed_tensor, core.LoDTensor):
-                    feed_tensor = core.LoDTensor()
-                    # always set to CPU place, since the tensor need to be splitted
-                    # it is fast in CPU
-                    feed_tensor.set(feed[feed_name], core.CPUPlace())
-                feed_tensor_dict[feed_name] = feed_tensor
-
-            self.executor.feed_and_split_tensor_into_local_scopes(
-                feed_tensor_dict)
-        elif isinstance(feed, list) or isinstance(feed, tuple):
-            if len(feed) != len(self._act_places):
-                raise ValueError(
-                    "Feed a list of tensor, the list should be the same size as places"
-                )
-
-            res = list()
-
-            for i, each in enumerate(feed):
-                if not isinstance(each, dict):
-                    raise TypeError(
-                        "Each element of feed list should be a dict")
-                res_dict = dict()
-                for feed_name in each:
-                    tensor = each[feed_name]
-                    if not isinstance(tensor, core.LoDTensor):
-                        tmp = core.LoDTensor()
-                        tmp.set(tensor, self._act_places[i])
-                        tensor = tmp
-                    res_dict[feed_name] = tensor
-                res.append(res_dict)
-            self.executor.feed_tensors_into_local_scopes(res)
-
-        fetch_var_name = '@FETCHED_VAR_NAME@'
-        self.executor.run(fetch_list, fetch_var_name)
-        arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
-
-        if return_numpy:
-            return executor.as_numpy(arr)
-
-        return [arr[i] for i in range(len(arr))]
-
-    @property
-    def device_count(self):
-        return len(self._act_places)
+
+        def __init__(self,
+                     use_cuda,
+                     loss_name=None,
+                     main_program=None,
+                     share_vars_from=None,
+                     exec_strategy=None,
+                     build_strategy=None,
+                     num_trainers=1,
+                     trainer_id=0,
+                     scope=None):
+            self._places = []
+            self._act_places = []
+            if use_cuda:
+                for i in six.moves.range(core.get_cuda_device_count()):
+                    p = core.Place()
+                    self._act_places.append(core.CUDAPlace(i))
+                    p.set_place(self._act_places[-1])
+                    self._places.append(p)
+            else:
+                cpu_num = int(
+                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+                for i in six.moves.range(cpu_num):
+                    p = core.Place()
+                    self._act_places.append(core.CPUPlace())
+                    p.set_place(self._act_places[-1])
+                    self._places.append(p)
+            assert self._places, "no place for execution"
+
+            if exec_strategy is None:
+                exec_strategy = ExecutionStrategy()
+            exec_strategy.use_cuda = use_cuda
+
+            if exec_strategy.num_threads == 0:
+                if use_cuda:
+                    # Experiments on se-resnext shows that too many threads hurt
+                    # performance. Worth tunning for other models in the future.
+                    exec_strategy.num_threads = len(self._places) * 4
+                else:
+                    cpu_num = int(
+                        os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+                    exec_strategy.num_threads = cpu_num * 2
+
+            # Set 1 thread num under nccl2 distribute
+            #   env to make sure all gpus run ops in same order.
+            if num_trainers > 1:
+                assert (use_cuda)
+                # FIXME(gongwb): avoid this set.
+                exec_strategy.num_threads = 1
+
+            if build_strategy is None:
+                build_strategy = BuildStrategy()
+
+            main = main_program
+            main = main if main else framework.default_main_program()
+            if scope == None:
+                scope = executor.global_scope()
+
+            if share_vars_from and not isinstance(share_vars_from,
+                                                  ParallelExecutor):
+                raise TypeError("share_vars_from must be ParallelExecutor.")
+
+            local_scopes = share_vars_from.executor.local_scopes(
+            ) if share_vars_from else []
+
+            self.persistable_vars = [
+                v.name for v in [
+                    var for var in main.list_vars()
+                    if var.persistable and var.type != core.VarDesc.VarType.RAW
+                ]
+            ]
+
+            self.executor = core.ParallelExecutor(
+                self._places,
+                set([
+                    cpt.to_text(p.name)
+                    for p in main.global_block().iter_parameters()
+                    if not p.stop_gradient
+                ]),
+                set(cpt.to_text(var) for var in self.persistable_vars), main.desc,
+                cpt.to_text(loss_name)
+                if loss_name else six.u(''), scope, local_scopes, exec_strategy,
+                build_strategy, num_trainers, trainer_id)
+            self.scope = scope
+
+        def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
+            """
+            Run a parallel executor with fetch_list.
+
+            The feed parameter can be a dict or a list. If feed is a dict, the
+            feed data will be split into multiple devices. If feed is a list, we
+            assume the data has been splitted into multiple devices, the each
+            element in the list will be copied to each device directly.
+
+            For example, if the feed is a dict:
+
+            >>> exe = ParallelExecutor()
+            >>> # the image will be splitted into devices. If there is two devices
+            >>> # each device will process an image with shape (24, 1, 28, 28)
+            >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
+
+            For example, if the feed is a list:
+
+            >>> exe = ParallelExecutor()
+            >>> # each device will process each element in the list.
+            >>> # the 1st device will process an image with shape (48, 1, 28, 28)
+            >>> # the 2nd device will process an image with shape (32, 1, 28, 28)
+            >>> #
+            >>> # you can use exe.device_count to get the device number.
+            >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))},
+            >>>               {"image": numpy.random.random(size=(32, 1, 28, 28))},
+            >>>              ])
+
+            Args:
+                fetch_list(list): The fetched variable names
+                feed(list|dict|None): The feed variables. If the feed is a dict,
+                    tensors in that dict will be splitted into each devices. If
+                    the feed is a list, each element of the list will be copied
+                    to each device. Default None.
+                feed_dict: Alias for feed parameter, for backward compatibility.
+                    This parameter has been deprecated. Default None.
+                return_numpy(bool): Whether converts the fetched tensor to numpy.
+                    Default: True.
+
+            Returns:
+                List: The fetched result list.
+
+            Raises:
+                ValueError: If the feed is a list, but its length is not equal the
+                    length of active places, or its element's is not dict.
+
+            NOTES:
+                1. If the feed's type is dict, the number of data that feeds to
+                   ParallelExecutor must be bigger than active places. Otherwise,
+                   it will throw exception from C++ side. Special attention should be
+                   paid to check whether the last batch of the dataset is bigger
+                   than active places.
+                2. If active places are more than one, the fetch results for each
+                   variable is a list, and each element of this list is the variable of
+                   respective active place.
+
+            Examples:
+                .. code-block:: python
+
+                    pe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                                loss_name=avg_cost.name,
+                                                main_program=fluid.default_main_program())
+                    loss = pe.run(feed=feeder.feed(cur_batch),
+                                  fetch_list=[avg_cost.name]))
+            """
+            if feed is None and feed_dict is not None:
+                feed = feed_dict
+                print(
+                    "`feed_dict` is deprecated. Please use `feed=`",
+                    file=sys.stderr)
+
+            if isinstance(feed, dict):
+                feed_tensor_dict = dict()
+                for feed_name in feed:
+                    feed_tensor = feed[feed_name]
+                    if not isinstance(feed_tensor, core.LoDTensor):
+                        feed_tensor = core.LoDTensor()
+                        # always set to CPU place, since the tensor need to be splitted
+                        # it is fast in CPU
+                        feed_tensor.set(feed[feed_name], core.CPUPlace())
+                    feed_tensor_dict[feed_name] = feed_tensor
+
+                self.executor.feed_and_split_tensor_into_local_scopes(
+                    feed_tensor_dict)
+            elif isinstance(feed, list) or isinstance(feed, tuple):
+                if len(feed) != len(self._act_places):
+                    raise ValueError(
+                        "Feed a list of tensor, the list should be the same size as places"
+                    )
+
+                res = list()
+
+                for i, each in enumerate(feed):
+                    if not isinstance(each, dict):
+                        raise TypeError(
+                            "Each element of feed list should be a dict")
+                    res_dict = dict()
+                    for feed_name in each:
+                        tensor = each[feed_name]
+                        if not isinstance(tensor, core.LoDTensor):
+                            tmp = core.LoDTensor()
+                            tmp.set(tensor, self._act_places[i])
+                            tensor = tmp
+                        res_dict[feed_name] = tensor
+                    res.append(res_dict)
+                self.executor.feed_tensors_into_local_scopes(res)
+
+            fetch_var_name = '@FETCHED_VAR_NAME@'
+            self.executor.run(fetch_list, fetch_var_name)
+            arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
+
+            if return_numpy:
+                return executor.as_numpy(arr)
+
+            return [arr[i] for i in range(len(arr))]
+
+        @property
+        def device_count(self):
+            return len(self._act_places)

From d1a1fafc4c933e51341004b83f225d786c3fed49 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Fri, 16 Nov 2018 11:21:29 +0800
Subject: [PATCH 003/113] code style

---
 paddle/fluid/platform/cpu_helper.cc      | 2 +-
 python/paddle/fluid/parallel_executor.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc
index 4e52e8ff00..bd6aedb3ac 100644
--- a/paddle/fluid/platform/cpu_helper.cc
+++ b/paddle/fluid/platform/cpu_helper.cc
@@ -29,7 +29,7 @@ namespace platform {
 
 void SetNumThreads(int num_threads) {
 #ifdef PADDLE_USE_OPENBLAS
-  // windows has no support for openblas multi-thread
+// windows has no support for openblas multi-thread
 #ifdef _WIN32
   if (num_threads > 1) {
     num_threads = 1;
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 33f6df67a4..0d53f53a9e 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -29,7 +29,6 @@ if os.name != 'nt':
     ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
     BuildStrategy = core.ParallelExecutor.BuildStrategy
 
-
     class ParallelExecutor(object):
         """
         ParallelExecutor is designed for data parallelism, which focuses on distributing
@@ -161,7 +160,8 @@ if os.name != 'nt':
                     for p in main.global_block().iter_parameters()
                     if not p.stop_gradient
                 ]),
-                set(cpt.to_text(var) for var in self.persistable_vars), main.desc,
+                set(cpt.to_text(var)
+                    for var in self.persistable_vars), main.desc,
                 cpt.to_text(loss_name)
                 if loss_name else six.u(''), scope, local_scopes, exec_strategy,
                 build_strategy, num_trainers, trainer_id)

From dc80be275db8c1a75d222b0da11aba4c92c29aa8 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Fri, 16 Nov 2018 11:21:29 +0800
Subject: [PATCH 004/113] code style test=develop

---
 cmake/external/eigen.cmake               | 10 ++++------
 cmake/external/gflags.cmake              |  5 ++---
 cmake/external/glog.cmake                |  3 +--
 cmake/external/gtest.cmake               |  5 ++---
 cmake/external/protobuf.cmake            |  5 ++---
 cmake/external/zlib.cmake                |  5 ++---
 paddle/fluid/platform/cpu_helper.cc      |  2 +-
 python/paddle/fluid/parallel_executor.py |  4 ++--
 8 files changed, 16 insertions(+), 23 deletions(-)

diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 98079678ae..573ad5e5f0 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -16,9 +16,8 @@ if(WITH_AMD_GPU)
     ExternalProject_Add(
         extern_eigen3
         ${EXTERNAL_PROJECT_LOG_ARGS}
-#        GIT_REPOSITORY  "https://github.com/sabreshao/hipeigen.git"
-#        GIT_TAG         0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
-            GIT_REPOSITORY  "http://admin@172.20.90.14:8080/r/eigen3.git"
+        GIT_REPOSITORY  "https://github.com/sabreshao/hipeigen.git"
+        GIT_TAG         0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
         PREFIX          ${EIGEN_SOURCE_DIR}
         UPDATE_COMMAND  ""
         CONFIGURE_COMMAND ""
@@ -30,11 +29,10 @@ else()
     ExternalProject_Add(
         extern_eigen3
         ${EXTERNAL_PROJECT_LOG_ARGS}
-#            GIT_REPOSITORY  "https://github.com/eigenteam/eigen-git-mirror"
-            GIT_REPOSITORY  "http://admin@172.20.90.14:8080/r/eigen3.git"
+        GIT_REPOSITORY  "https://github.com/eigenteam/eigen-git-mirror"
         # eigen on cuda9.1 missing header of math_funtions.hpp
         # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
-#        GIT_TAG         917060c364181f33a735dc023818d5a54f60e54c
+        GIT_TAG         917060c364181f33a735dc023818d5a54f60e54c
         PREFIX          ${EIGEN_SOURCE_DIR}
         DOWNLOAD_NAME   "eigen"
         UPDATE_COMMAND  ""
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index 7c062d682c..4e98e4bf88 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -28,9 +28,8 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
 ExternalProject_Add(
     extern_gflags
     ${EXTERNAL_PROJECT_LOG_ARGS}
-#    GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
-    GIT_REPOSITORY  "http://admin@172.20.90.14:8080/r/gflags.git"
-#    GIT_TAG         77592648e3f3be87d6c7123eb81cbad75f9aef5a
+    GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
+    GIT_TAG         77592648e3f3be87d6c7123eb81cbad75f9aef5a
     PREFIX          ${GFLAGS_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index a3f3c6adf3..8cd0455c16 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -34,14 +34,13 @@ ELSE()
   SET(GLOG_REPOSITORY "https://github.com/google/glog.git")
   SET(GLOG_TAG "v0.3.5")
 ENDIF()
-  SET(GLOG_REPOSITORY "http://admin@172.20.90.14:8080/r/glog.git")
 
 ExternalProject_Add(
     extern_glog
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS gflags
     GIT_REPOSITORY  ${GLOG_REPOSITORY}
-   # GIT_TAG         ${GLOG_TAG}
+    GIT_TAG         ${GLOG_TAG}
     PREFIX          ${GLOG_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index da539d52bd..d335298742 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -43,9 +43,8 @@ IF(WITH_TESTING)
         extern_gtest
         ${EXTERNAL_PROJECT_LOG_ARGS}
         DEPENDS         ${GTEST_DEPENDS}
-            #        GIT_REPOSITORY  "https://github.com/google/googletest.git"
-                    GIT_REPOSITORY  "http://admin@172.20.90.14:8080/r/gtest.git"
-#        GIT_TAG         "release-1.8.0"
+        GIT_REPOSITORY  "https://github.com/google/googletest.git"
+        GIT_TAG         "release-1.8.0"
         PREFIX          ${GTEST_SOURCES_DIR}
         UPDATE_COMMAND  ""
         CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 94d8ac30cc..e1e619e572 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -202,9 +202,8 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
         SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64")
     ENDIF()
 
-   # SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
-  #  SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
-    SET(PROTOBUF_REPO http://admin@172.20.90.14:8080/r/protobuf.git)
+    SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
+    SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
     IF(MOBILE_INFERENCE)
         # The reason why the official version is not used is described in
         # https://github.com/PaddlePaddle/Paddle/issues/6114
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index 456f26385c..c3d7323545 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -31,9 +31,8 @@ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zl
 ExternalProject_Add(
     extern_zlib
     ${EXTERNAL_PROJECT_LOG_ARGS}
-        #    GIT_REPOSITORY  "https://github.com/madler/zlib.git"
-            GIT_REPOSITORY  "http://admin@172.20.90.14:8080/r/zlib.git"
-#    GIT_TAG         "v1.2.8"
+    GIT_REPOSITORY  "https://github.com/madler/zlib.git"
+    GIT_TAG         "v1.2.8"
     PREFIX          ${ZLIB_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc
index 4e52e8ff00..bd6aedb3ac 100644
--- a/paddle/fluid/platform/cpu_helper.cc
+++ b/paddle/fluid/platform/cpu_helper.cc
@@ -29,7 +29,7 @@ namespace platform {
 
 void SetNumThreads(int num_threads) {
 #ifdef PADDLE_USE_OPENBLAS
-  // windows has no support for openblas multi-thread
+// windows has no support for openblas multi-thread
 #ifdef _WIN32
   if (num_threads > 1) {
     num_threads = 1;
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 33f6df67a4..0d53f53a9e 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -29,7 +29,6 @@ if os.name != 'nt':
     ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
     BuildStrategy = core.ParallelExecutor.BuildStrategy
 
-
     class ParallelExecutor(object):
         """
         ParallelExecutor is designed for data parallelism, which focuses on distributing
@@ -161,7 +160,8 @@ if os.name != 'nt':
                     for p in main.global_block().iter_parameters()
                     if not p.stop_gradient
                 ]),
-                set(cpt.to_text(var) for var in self.persistable_vars), main.desc,
+                set(cpt.to_text(var)
+                    for var in self.persistable_vars), main.desc,
                 cpt.to_text(loss_name)
                 if loss_name else six.u(''), scope, local_scopes, exec_strategy,
                 build_strategy, num_trainers, trainer_id)

From fcbd5a12b802560f279e30086d03ef152f760ab5 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Fri, 16 Nov 2018 15:23:05 +0800
Subject: [PATCH 005/113] add create_recordio_file_reader back

---
 python/paddle/fluid/layers/io.py | 118 +++++++++++++++----------------
 1 file changed, 58 insertions(+), 60 deletions(-)

diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index a9075045a2..8e18a6e784 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -347,72 +347,70 @@ def _copy_reader_create_op_(block, op):
     return new_op
 
 
-if os.name != 'nt':
-
-    @templatedoc(op_type='create_recordio_file_reader')
-    def open_recordio_file(filename,
-                           shapes,
-                           lod_levels,
-                           dtypes,
-                           pass_num=1,
-                           for_parallel=True):
-        """
-        ${comment}
-
-        Args:
-           filename(${filename_type}): ${filename_comment}.
-           shapes(list): List of tuples which declaring data shapes.
-           lod_levels(${lod_levels_type}): ${lod_levels_comment}.
-           dtypes(list): List of strs which declaring data type.
-           pass_num(int): Number of passes to run.
-           for_parallel(Bool): Set it as True if you are going to run
-                subsequent operators in parallel.
-
-        Returns:
-           ${out_comment}.
-
-        Examples:
-
-            >>> import paddle.fluid as fluid
-            >>> reader = fluid.layers.io.open_recordio_file(
-            >>>                               filename='./data.recordio',
-            >>>                               shapes=[(3,224,224), (1)],
-            >>>                               lod_levels=[0, 0],
-            >>>                               dtypes=['float32', 'int64'])
-            >>> # Via the reader, we can use 'read_file' layer to get data:
-            >>> image, label = fluid.layers.io.read_file(reader)
-        """
-        dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
-        shape_concat = []
-        ranks = []
+@templatedoc(op_type='create_recordio_file_reader')
+def open_recordio_file(filename,
+                       shapes,
+                       lod_levels,
+                       dtypes,
+                       pass_num=1,
+                       for_parallel=True):
+    """
+    ${comment}
 
-        for shape in shapes:
-            shape_concat.extend(shape)
-            ranks.append(len(shape))
+    Args:
+       filename(${filename_type}): ${filename_comment}.
+       shapes(list): List of tuples which declaring data shapes.
+       lod_levels(${lod_levels_type}): ${lod_levels_comment}.
+       dtypes(list): List of strs which declaring data type.
+       pass_num(int): Number of passes to run.
+       for_parallel(Bool): Set it as True if you are going to run
+            subsequent operators in parallel.
 
-        var_name = unique_name('open_recordio_file')
+    Returns:
+       ${out_comment}.
 
-        startup_blk = default_startup_program().current_block()
-        startup_var = startup_blk.create_var(name=var_name)
-        startup_blk.append_op(
-            type='create_recordio_file_reader',
-            outputs={'Out': [startup_var]},
-            attrs={
-                'shape_concat': shape_concat,
-                'lod_levels': lod_levels,
-                'filename': filename,
-                'ranks': ranks
-            })
+    Examples:
 
-        startup_var.desc.set_dtypes(dtypes)
-        startup_var.persistable = True
-        main_prog_var = _copy_reader_var_(
-            default_main_program().current_block(), startup_var)
+        >>> import paddle.fluid as fluid
+        >>> reader = fluid.layers.io.open_recordio_file(
+        >>>                               filename='./data.recordio',
+        >>>                               shapes=[(3,224,224), (1)],
+        >>>                               lod_levels=[0, 0],
+        >>>                               dtypes=['float32', 'int64'])
+        >>> # Via the reader, we can use 'read_file' layer to get data:
+        >>> image, label = fluid.layers.io.read_file(reader)
+    """
+    dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
+    shape_concat = []
+    ranks = []
+
+    for shape in shapes:
+        shape_concat.extend(shape)
+        ranks.append(len(shape))
 
-        if pass_num > 1:
-            main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num)
+    var_name = unique_name('open_recordio_file')
 
-        return monkey_patch_reader_methods(main_prog_var)
+    startup_blk = default_startup_program().current_block()
+    startup_var = startup_blk.create_var(name=var_name)
+    startup_blk.append_op(
+        type='create_recordio_file_reader',
+        outputs={'Out': [startup_var]},
+        attrs={
+            'shape_concat': shape_concat,
+            'lod_levels': lod_levels,
+            'filename': filename,
+            'ranks': ranks
+        })
+
+    startup_var.desc.set_dtypes(dtypes)
+    startup_var.persistable = True
+    main_prog_var = _copy_reader_var_(
+        default_main_program().current_block(), startup_var)
+
+    if pass_num > 1:
+        main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num)
+
+    return monkey_patch_reader_methods(main_prog_var)
 
 
 def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):

From 1dc1dd94d1c349c6bbed6fe826ae1d25a3983603 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Fri, 16 Nov 2018 16:07:25 +0800
Subject: [PATCH 006/113] fix code style test=develop

---
 python/paddle/fluid/layers/io.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 8e18a6e784..3f47053961 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -404,8 +404,8 @@ def open_recordio_file(filename,
 
     startup_var.desc.set_dtypes(dtypes)
     startup_var.persistable = True
-    main_prog_var = _copy_reader_var_(
-        default_main_program().current_block(), startup_var)
+    main_prog_var = _copy_reader_var_(default_main_program().current_block(),
+                                      startup_var)
 
     if pass_num > 1:
         main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num)

From 695e2aba5e8396ff0719da8516e38b1ef4782c05 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Fri, 16 Nov 2018 19:33:50 +0800
Subject: [PATCH 007/113] fix the gtest.cmake on windows

---
 cmake/external/gtest.cmake | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index da539d52bd..943767fb17 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -51,7 +51,11 @@ IF(WITH_TESTING)
         CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                         -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                         -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                        -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                        -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                         -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                        -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                        -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
                         -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                         -DBUILD_GMOCK=ON

From b942f4760ab30f6a107c6cf944032c9dde143528 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Fri, 16 Nov 2018 22:04:11 +0800
Subject: [PATCH 008/113] fix cc_test on windows

---
 cmake/generic.cmake | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index e21f89c7c5..111627a932 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -351,6 +351,9 @@ function(cc_test TARGET_NAME)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${cc_test_SRCS})
     target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    if(WIN32)
+      target_link_libraries(${TARGET_NAME} shlwapi)
+    endif(WIN32)
     add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
     add_test(NAME ${TARGET_NAME}
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}

From ef943bd6cd463b4e00bb18cc137334a6b78fca55 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Sat, 17 Nov 2018 00:32:33 +0800
Subject: [PATCH 009/113] fix the win build test=develop

---
 paddle/fluid/operators/CMakeLists.txt | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index df2a3e7aa6..a2334c1499 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -22,9 +22,7 @@ if(WITH_DISTRIBUTE)
     add_subdirectory(distributed_ops)
 endif()
 
-if (NOT WIN32)
-    add_subdirectory(reader)
-endif()
+add_subdirectory(reader)
 
 if (NOT WIN32)
     add_subdirectory(nccl)
@@ -49,9 +47,9 @@ endif()
 
 set(COMMON_OP_DEPS "")
 
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor dynload_warpctc sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory concat_and_split cross_entropy softmax vol2col im2col sampler)
 if (NOT WIN32)
-  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
+  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions dynload_warpctc jit_kernel)
 endif()
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub)

From a395942c6a5246f1702e592e1f3d369caa3accc1 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Sat, 17 Nov 2018 08:38:02 +0800
Subject: [PATCH 010/113] remove fused compile support on windows test=develop

---
 paddle/fluid/operators/CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index a2334c1499..40246d05e9 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -11,7 +11,9 @@ add_subdirectory(controlflow)
 add_subdirectory(csp)
 add_subdirectory(detection)
 add_subdirectory(elementwise)
-add_subdirectory(fused)
+if(NOT WIN32)
+    add_subdirectory(fused)
+endif(NOT WIN32)
 add_subdirectory(metrics)
 add_subdirectory(optimizers)
 add_subdirectory(reduce_ops)

From c75dc885b58000b018414ab442097ee515244b9c Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Sat, 17 Nov 2018 14:36:56 +0800
Subject: [PATCH 011/113] add the jit support test=develop

---
 paddle/fluid/operators/CMakeLists.txt         |  7 ++--
 paddle/fluid/operators/math/CMakeLists.txt    | 36 +++++++++----------
 .../math/detail/activation_functions.h        |  7 ++++
 paddle/fluid/operators/math/jit_code.cc       |  5 +++
 4 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 40246d05e9..10748b0cda 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -11,9 +11,7 @@ add_subdirectory(controlflow)
 add_subdirectory(csp)
 add_subdirectory(detection)
 add_subdirectory(elementwise)
-if(NOT WIN32)
-    add_subdirectory(fused)
-endif(NOT WIN32)
+add_subdirectory(fused)
 add_subdirectory(metrics)
 add_subdirectory(optimizers)
 add_subdirectory(reduce_ops)
@@ -50,8 +48,9 @@ endif()
 set(COMMON_OP_DEPS "")
 
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory concat_and_split cross_entropy softmax vol2col im2col sampler)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} lstm_compute matrix_bit_code gru_compute activation_functions jit_kernel)
 if (NOT WIN32)
-  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions dynload_warpctc jit_kernel)
+  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch dynload_warpctc)
 endif()
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub)
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 4cd014cbad..08c8dbbfe8 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -1,6 +1,4 @@
-if (NOT WIN32)
-    add_subdirectory(detail)
-endif(NOT WIN32)
+add_subdirectory(detail)
 
 function(math_library TARGET)
     # math_library is a function to create math library.
@@ -43,10 +41,8 @@ math_library(depthwise_conv)
 math_library(im2col)
 math_library(sampler)
 
-if (NOT WIN32) # windows do not support avx functions yet.
-    math_library(gru_compute DEPS activation_functions math_function)
-    math_library(lstm_compute DEPS activation_functions)
-endif (NOT WIN32)
+math_library(gru_compute DEPS activation_functions math_function)
+math_library(lstm_compute DEPS activation_functions)
 
 cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
 math_library(math_function DEPS blas)
@@ -58,9 +54,9 @@ math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function)
 math_library(sequence_scale)
 math_library(softmax DEPS math_function)
-if (NOT WIN32)
-    math_library(matrix_bit_code)
-endif (NOT WIN32)
+
+math_library(matrix_bit_code)
+
 math_library(unpooling)
 math_library(vol2col)
 
@@ -76,13 +72,13 @@ if(WITH_GPU)
 endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
-if (NOT WIN32)
-    set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc)
-    set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
-    if(WITH_XBYAK)
-        list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc)
-        list(APPEND JIT_KERNEL_DEPS xbyak)
-    endif()
-    cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS})
-    cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
-endif (NOT WIN32)
+
+set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc)
+set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
+if(WITH_XBYAK)
+    list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc)
+    list(APPEND JIT_KERNEL_DEPS xbyak)
+endif()
+cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS})
+cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
+
diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h
index b127fbe8c8..42fb45a8a5 100644
--- a/paddle/fluid/operators/math/detail/activation_functions.h
+++ b/paddle/fluid/operators/math/detail/activation_functions.h
@@ -15,6 +15,13 @@ limitations under the License. */
 #pragma once
 #include <math.h>
 #include <string>
+
+#ifdef _WIN32
+#undef __AVX__
+#undef __AVX__2
+#endif
+
+
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc
index e3b600d442..0f4b8f65ac 100644
--- a/paddle/fluid/operators/math/jit_code.cc
+++ b/paddle/fluid/operators/math/jit_code.cc
@@ -118,7 +118,12 @@ void VXXJitCode::generate() {
   ret();
 }
 
+#ifdef _WIN32
+#define ALIGN32
+#else
 #define ALIGN32 __attribute__((aligned(32)))
+#endif
+
 #define EXP_HIG 88.3762626647949f
 #define EXP_LOW -88.3762626647949f
 #define CEPHES_LOG2EF 1.44269504088896341

From 5e46c98362897fbf043eb8387618740fbbd6fd07 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Sat, 17 Nov 2018 14:41:29 +0800
Subject: [PATCH 012/113] add the jit support, test=develop

---
 paddle/fluid/operators/math/detail/activation_functions.h | 6 ------
 paddle/fluid/platform/cpu_info.h                          | 5 +++++
 paddle/fluid/platform/enforce.h                           | 5 +++++
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h
index 42fb45a8a5..2b3d38d95a 100644
--- a/paddle/fluid/operators/math/detail/activation_functions.h
+++ b/paddle/fluid/operators/math/detail/activation_functions.h
@@ -16,12 +16,6 @@ limitations under the License. */
 #include <math.h>
 #include <string>
 
-#ifdef _WIN32
-#undef __AVX__
-#undef __AVX__2
-#endif
-
-
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index 6810a1651a..1b4840d9a1 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -14,6 +14,11 @@ limitations under the License. */
 
 #pragma once
 
+#ifdef _WIN32
+#undef __AVX__
+#undef __AVX2__
+#endif
+
 #include <stddef.h>
 
 namespace paddle {
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index a251bfcd99..c03bbd59ac 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -14,6 +14,11 @@ limitations under the License. */
 
 #pragma once
 
+#ifdef _WIN32
+#undef __AVX__
+#undef __AVX2__
+#endif
+
 #ifdef __GNUC__
 #include <cxxabi.h>  // for __cxa_demangle
 #endif               // __GNUC__

From 928efeed46132df27d1c389be046bdc31c9451f4 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Sat, 17 Nov 2018 14:41:29 +0800
Subject: [PATCH 013/113] add the jit support, test=develop

---
 cmake/operators.cmake                                     | 5 +++--
 paddle/fluid/operators/math/detail/activation_functions.h | 6 ------
 paddle/fluid/platform/cpu_info.h                          | 5 +++++
 paddle/fluid/platform/enforce.h                           | 5 +++++
 4 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index c9d0f80da2..5636342ef7 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -84,8 +84,9 @@ function(op_library TARGET)
     endif()
     if (WIN32)
     # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
-    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
-     "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
+    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op"
+#     "hierarchical_sigmoid_op" "cumsum_op"
+#     "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op"
       "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
         if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
           return()
diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h
index 42fb45a8a5..2b3d38d95a 100644
--- a/paddle/fluid/operators/math/detail/activation_functions.h
+++ b/paddle/fluid/operators/math/detail/activation_functions.h
@@ -16,12 +16,6 @@ limitations under the License. */
 #include <math.h>
 #include <string>
 
-#ifdef _WIN32
-#undef __AVX__
-#undef __AVX__2
-#endif
-
-
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index 6810a1651a..1b4840d9a1 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -14,6 +14,11 @@ limitations under the License. */
 
 #pragma once
 
+#ifdef _WIN32
+#undef __AVX__
+#undef __AVX2__
+#endif
+
 #include <stddef.h>
 
 namespace paddle {
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index a251bfcd99..c03bbd59ac 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -14,6 +14,11 @@ limitations under the License. */
 
 #pragma once
 
+#ifdef _WIN32
+#undef __AVX__
+#undef __AVX2__
+#endif
+
 #ifdef __GNUC__
 #include <cxxabi.h>  // for __cxa_demangle
 #endif               // __GNUC__

From a3e952f41d9081b8d0f69128f7d758fd95f97f96 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Sun, 18 Nov 2018 12:19:05 +0800
Subject: [PATCH 014/113] add the jit back fix compile error on windows

---
 CMakeLists.txt                                |   5 +
 cmake/operators.cmake                         |   5 +-
 cmake/simd.cmake                              |  25 +-
 paddle/fluid/operators/CMakeLists.txt         |   9 +-
 .../fluid/operators/hierarchical_sigmoid_op.h |   2 +-
 paddle/fluid/operators/math/CMakeLists.txt    |  35 +-
 paddle/fluid/operators/math/matrix_bit_code.h |   3 +-
 python/paddle/fluid/layers/nn.py              | 374 +++++++++---------
 python/paddle/fluid/layers/ops.py             |  41 +-
 9 files changed, 241 insertions(+), 258 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c3bc60d57b..c2804e234d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -130,6 +130,11 @@ if (APPLE OR WIN32)
         "Disable MKL for building on mac and windows" FORCE)
 endif()
 
+if (WIN32)
+    set(WITH_AVX OFF CACHE STRING
+            "Disable AVX when compiling for Windows" FORCE)
+endif()
+
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
   "A path setting third party libraries download & build directories.")
 
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index c9d0f80da2..5e8b95b3e2 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -84,9 +84,8 @@ function(op_library TARGET)
     endif()
     if (WIN32)
     # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
-    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
-     "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
-      "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
+    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op"
+            "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
         if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
           return()
         endif()
diff --git a/cmake/simd.cmake b/cmake/simd.cmake
index 566dc75fda..4926fb9913 100644
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@@ -70,17 +70,20 @@ int main()
     return 0;
 }" AVX_FOUND)
 
-# Check AVX 2
-set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
-set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
-    __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
-    __m256i result = _mm256_abs_epi32 (a);
-    return 0;
-}" AVX2_FOUND)
+# disable AVX2 by default on windows
+if(NOT WIN32)
+    # Check AVX 2
+    set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
+    set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+    CHECK_CXX_SOURCE_RUNS("
+    #include <immintrin.h>
+    int main()
+    {
+        __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
+        __m256i result = _mm256_abs_epi32 (a);
+        return 0;
+    }" AVX2_FOUND)
+endif(NOT WIN32)
 
 # Check AVX512F
 set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index df2a3e7aa6..284bf5dc9e 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -22,9 +22,7 @@ if(WITH_DISTRIBUTE)
     add_subdirectory(distributed_ops)
 endif()
 
-if (NOT WIN32)
-    add_subdirectory(reader)
-endif()
+add_subdirectory(reader)
 
 if (NOT WIN32)
     add_subdirectory(nccl)
@@ -49,9 +47,10 @@ endif()
 
 set(COMMON_OP_DEPS "")
 
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor dynload_warpctc sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory concat_and_split cross_entropy softmax vol2col im2col sampler)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} lstm_compute matrix_bit_code sequence2batch gru_compute activation_functions jit_kernel)
 if (NOT WIN32)
-  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
+  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 endif()
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub)
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h
index 64096a717b..79980cda53 100644
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
@@ -111,7 +111,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
     auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
     auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad);
     auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
-    Eigen::array<int, 2> bcast({{1, static_cast<int>(pre_out_grad.dims()[1])}});
+    Eigen::array<int, 2> bcast{1, static_cast<int>(pre_out_grad.dims()[1])};
 
     // softrelu derivative
     pre_out_grad_mat.device(place) =
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 4cd014cbad..e9397d552d 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -1,6 +1,4 @@
-if (NOT WIN32)
-    add_subdirectory(detail)
-endif(NOT WIN32)
+add_subdirectory(detail)
 
 function(math_library TARGET)
     # math_library is a function to create math library.
@@ -43,10 +41,8 @@ math_library(depthwise_conv)
 math_library(im2col)
 math_library(sampler)
 
-if (NOT WIN32) # windows do not support avx functions yet.
-    math_library(gru_compute DEPS activation_functions math_function)
-    math_library(lstm_compute DEPS activation_functions)
-endif (NOT WIN32)
+math_library(gru_compute DEPS activation_functions math_function)
+math_library(lstm_compute DEPS activation_functions)
 
 cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
 math_library(math_function DEPS blas)
@@ -58,9 +54,9 @@ math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function)
 math_library(sequence_scale)
 math_library(softmax DEPS math_function)
-if (NOT WIN32)
-    math_library(matrix_bit_code)
-endif (NOT WIN32)
+
+math_library(matrix_bit_code)
+
 math_library(unpooling)
 math_library(vol2col)
 
@@ -76,13 +72,12 @@ if(WITH_GPU)
 endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
-if (NOT WIN32)
-    set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc)
-    set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
-    if(WITH_XBYAK)
-        list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc)
-        list(APPEND JIT_KERNEL_DEPS xbyak)
-    endif()
-    cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS})
-    cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
-endif (NOT WIN32)
+
+set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc)
+set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
+if(WITH_XBYAK)
+    list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc)
+    list(APPEND JIT_KERNEL_DEPS xbyak)
+endif()
+cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS})
+cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h
index 07854c8358..c329b8b611 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
@@ -67,7 +67,7 @@ inline constexpr size_t FindLastSet(size_t x) {
              : (std::is_same<size_t, unsigned long>::value  // NOLINT
                     ? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0)
                     : (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0));
-
+}
 #else
 // windows don't have built-in clz, ctz function
 template <typename T>
@@ -92,7 +92,6 @@ inline int clz(const T& value) {
 
 inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); }
 #endif  // !_WIN32
-}
 
 struct SimpleCode {
   SimpleCode(size_t code, size_t num_classes) : c_(code + num_classes) {}
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 9465d97564..a2bab64384 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -170,12 +170,6 @@ __all__ = [
     'bilinear_tensor_product',
 ]
 
-# To avoid the api checker complains
-if os.name == 'nt':
-    __all__.remove('dynamic_lstm')
-    __all__.remove('crf_decoding')
-    __all__.remove('roi_pool')
-
 
 def fc(input,
        size,
@@ -349,128 +343,126 @@ def embedding(input,
     return tmp
 
 
-if os.name != 'nt':
+@templatedoc(op_type="lstm")
+def dynamic_lstm(input,
+                 size,
+                 h_0=None,
+                 c_0=None,
+                 param_attr=None,
+                 bias_attr=None,
+                 use_peepholes=True,
+                 is_reverse=False,
+                 gate_activation='sigmoid',
+                 cell_activation='tanh',
+                 candidate_activation='tanh',
+                 dtype='float32',
+                 name=None):
+    """
+    ${comment}
 
-    @templatedoc(op_type="lstm")
-    def dynamic_lstm(input,
-                     size,
-                     h_0=None,
-                     c_0=None,
-                     param_attr=None,
-                     bias_attr=None,
-                     use_peepholes=True,
-                     is_reverse=False,
-                     gate_activation='sigmoid',
-                     cell_activation='tanh',
-                     candidate_activation='tanh',
-                     dtype='float32',
-                     name=None):
-        """
-        ${comment}
-
-        Args:
-            input (Variable): ${input_comment}
-            size (int): 4 * hidden size.
-            h_0(Variable): The initial hidden state is an optional input, default is zero.
-                           This is a tensor with shape (N x D), where N is the
-                           batch size and D is the hidden size.
-            c_0(Variable): The initial cell state is an optional input, default is zero.
-                           This is a tensor with shape (N x D), where N is the
-                           batch size. `h_0` and `c_0` can be NULL but only at the same time.
-            param_attr(ParamAttr|None): The parameter attribute for the learnable
-                                   hidden-hidden weights.
-
-                                   - Weights = {:math:`W_{ch}, W_{ih}, \
-                                                    W_{fh}, W_{oh}`}
-                                   - The shape is (D x 4D), where D is the hidden
-                                     size.
-
-                                   If it is set to None or one attribute of ParamAttr,
-                                   dynamic_lstm will create ParamAttr as param_attr.
-                                   If the Initializer of the param_attr is not set, the
-                                   parameter is initialized with Xavier. Default: None.
-            bias_attr (ParamAttr|None): The bias attribute for the learnable bias
-                                  weights, which contains two parts, input-hidden
-                                  bias weights and peephole connections weights if
-                                  setting `use_peepholes` to `True`.
-
-                                  1. `use_peepholes = False`
-                                     - Biases = {:math:`b_c, b_i, b_f, b_o`}.
-                                     - The shape is (1 x 4D).
-                                  2. `use_peepholes = True`
-                                     - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
-                                                     W_{fc}, W_{oc}`}.
-                                     - The shape is (1 x 7D).
-
-                                  If it is set to None or one attribute of ParamAttr,
-                                  dynamic_lstm will create ParamAttr as bias_attr.
-                                  If the Initializer of the bias_attr is not set,
-                                  the bias is initialized zero. Default: None.
-            use_peepholes (bool): ${use_peepholes_comment}
-            is_reverse (bool): ${is_reverse_comment}
-            gate_activation (str): ${gate_activation_comment}
-            cell_activation (str): ${cell_activation_comment}
-            candidate_activation (str): ${candidate_activation_comment}
-            dtype (str): Data type. Choices = ["float32", "float64"], default "float32".
-            name (str|None): A name for this layer(optional). If set None, the layer
-                             will be named automatically.
-
-        Returns:
-            tuple: The hidden state, and cell state of LSTM. The shape of both \
-            is (T x D), and lod is the same with the `input`.
-
-        Examples:
-            .. code-block:: python
-
-                hidden_dim = 512
-                forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
-                                               bias_attr=False)
-                forward, _ = fluid.layers.dynamic_lstm(
-                    input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
-        """
-        assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp."
-        helper = LayerHelper('lstm', **locals())
-        size = size // 4
-        weight = helper.create_parameter(
-            attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
-        bias_size = [1, 7 * size]
-        if not use_peepholes:
-            bias_size[1] = 4 * size
-        bias = helper.create_parameter(
-            attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+    Args:
+        input (Variable): ${input_comment}
+        size (int): 4 * hidden size.
+        h_0(Variable): The initial hidden state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size and D is the hidden size.
+        c_0(Variable): The initial cell state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size. `h_0` and `c_0` can be NULL but only at the same time.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+                               hidden-hidden weights.
 
-        hidden = helper.create_variable_for_type_inference(dtype)
-        cell = helper.create_variable_for_type_inference(dtype)
-        batch_gate = helper.create_variable_for_type_inference(dtype)
-        batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
-        inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
-        batch_size = input.shape[0]
-        if h_0:
-            assert h_0.shape == (batch_size, size), \
-                'The shape of h0 should be (batch_size, %d)' % size
-            inputs['H0'] = h_0
-        if c_0:
-            assert c_0.shape == (batch_size, size), \
-                'The shape of c0 should be (batch_size, %d)' % size
-            inputs['C0'] = c_0
+                               - Weights = {:math:`W_{ch}, W_{ih}, \
+                                                W_{fh}, W_{oh}`}
+                               - The shape is (D x 4D), where D is the hidden
+                                 size.
 
-        helper.append_op(
-            type='lstm',
-            inputs=inputs,
-            outputs={
-                'Hidden': hidden,
-                'Cell': cell,
-                'BatchGate': batch_gate,
-                'BatchCellPreAct': batch_cell_pre_act
-            },
-            attrs={
-                'use_peepholes': use_peepholes,
-                'is_reverse': is_reverse,
-                'gate_activation': gate_activation,
-                'cell_activation': cell_activation,
-                'candidate_activation': candidate_activation
-            })
-        return hidden, cell
+                               If it is set to None or one attribute of ParamAttr,
+                               dynamic_lstm will create ParamAttr as param_attr.
+                               If the Initializer of the param_attr is not set, the
+                               parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|None): The bias attribute for the learnable bias
+                              weights, which contains two parts, input-hidden
+                              bias weights and peephole connections weights if
+                              setting `use_peepholes` to `True`.
+
+                              1. `use_peepholes = False`
+                                 - Biases = {:math:`b_c, b_i, b_f, b_o`}.
+                                 - The shape is (1 x 4D).
+                              2. `use_peepholes = True`
+                                 - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
+                                                 W_{fc}, W_{oc}`}.
+                                 - The shape is (1 x 7D).
+
+                              If it is set to None or one attribute of ParamAttr,
+                              dynamic_lstm will create ParamAttr as bias_attr.
+                              If the Initializer of the bias_attr is not set,
+                              the bias is initialized zero. Default: None.
+        use_peepholes (bool): ${use_peepholes_comment}
+        is_reverse (bool): ${is_reverse_comment}
+        gate_activation (str): ${gate_activation_comment}
+        cell_activation (str): ${cell_activation_comment}
+        candidate_activation (str): ${candidate_activation_comment}
+        dtype (str): Data type. Choices = ["float32", "float64"], default "float32".
+        name (str|None): A name for this layer(optional). If set None, the layer
+                         will be named automatically.
+
+    Returns:
+        tuple: The hidden state, and cell state of LSTM. The shape of both \
+        is (T x D), and lod is the same with the `input`.
+
+    Examples:
+        .. code-block:: python
+
+            hidden_dim = 512
+            forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
+                                           bias_attr=False)
+            forward, _ = fluid.layers.dynamic_lstm(
+                input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
+    """
+    assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp."
+    helper = LayerHelper('lstm', **locals())
+    size = size // 4
+    weight = helper.create_parameter(
+        attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
+    bias_size = [1, 7 * size]
+    if not use_peepholes:
+        bias_size[1] = 4 * size
+    bias = helper.create_parameter(
+        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+
+    hidden = helper.create_variable_for_type_inference(dtype)
+    cell = helper.create_variable_for_type_inference(dtype)
+    batch_gate = helper.create_variable_for_type_inference(dtype)
+    batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
+    inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
+    batch_size = input.shape[0]
+    if h_0:
+        assert h_0.shape == (batch_size, size), \
+            'The shape of h0 should be (batch_size, %d)' % size
+        inputs['H0'] = h_0
+    if c_0:
+        assert c_0.shape == (batch_size, size), \
+            'The shape of c0 should be (batch_size, %d)' % size
+        inputs['C0'] = c_0
+
+    helper.append_op(
+        type='lstm',
+        inputs=inputs,
+        outputs={
+            'Hidden': hidden,
+            'Cell': cell,
+            'BatchGate': batch_gate,
+            'BatchCellPreAct': batch_cell_pre_act
+        },
+        attrs={
+            'use_peepholes': use_peepholes,
+            'is_reverse': is_reverse,
+            'gate_activation': gate_activation,
+            'cell_activation': cell_activation,
+            'candidate_activation': candidate_activation
+        })
+    return hidden, cell
 
 
 def dynamic_lstmp(input,
@@ -969,43 +961,39 @@ def linear_chain_crf(input, label, param_attr=None):
     return log_likelihood
 
 
-if os.name != 'nt':
-
-    @templatedoc()
-    def crf_decoding(input, param_attr, label=None):
-        """
-        ${comment}
+@templatedoc()
+def crf_decoding(input, param_attr, label=None):
+    """
+    ${comment}
 
-        Args:
-            input(${emission_type}): ${emission_comment}
+    Args:
+        input(${emission_type}): ${emission_comment}
 
-            param_attr(ParamAttr): The parameter attribute for training.
+        param_attr(ParamAttr): The parameter attribute for training.
 
-            label(${label_type}): ${label_comment}
+        label(${label_type}): ${label_comment}
 
-        Returns:
-            Variable: ${viterbi_path_comment}
+    Returns:
+        Variable: ${viterbi_path_comment}
 
-        Examples:
-            .. code-block:: python
+    Examples:
+        .. code-block:: python
 
-               crf_decode = layers.crf_decoding(
-                    input=hidden, param_attr=ParamAttr(name="crfw"))
-        """
-        helper = LayerHelper('crf_decoding', **locals())
-        transition = helper.get_parameter(param_attr.name)
-        viterbi_path = helper.create_variable_for_type_inference(
-            dtype=helper.input_dtype())
-        helper.append_op(
-            type='crf_decoding',
-            inputs={
-                "Emission": [input],
+           crf_decode = layers.crf_decoding(
+                input=hidden, param_attr=ParamAttr(name="crfw"))
+    """
+    helper = LayerHelper('crf_decoding', **locals())
+    transition = helper.get_parameter(param_attr.name)
+    viterbi_path = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype())
+    helper.append_op(
+        type='crf_decoding',
+        inputs={"Emission": [input],
                 "Transition": transition,
-                "Label": label
-            },
-            outputs={"ViterbiPath": [viterbi_path]})
+                "Label": label},
+        outputs={"ViterbiPath": [viterbi_path]})
 
-        return viterbi_path
+    return viterbi_path
 
 
 @templatedoc()
@@ -5599,48 +5587,42 @@ def label_smooth(label,
     return smooth_label
 
 
-if os.name != 'nt':
-
-    @templatedoc()
-    def roi_pool(input,
-                 rois,
-                 pooled_height=1,
-                 pooled_width=1,
-                 spatial_scale=1.0):
-        """
-        ${comment}
-
-        Args:
-            input (Variable): ${x_comment}
-            rois (Variable): ROIs (Regions of Interest) to pool over.
-            pooled_height (integer): ${pooled_height_comment} Default: 1
-            pooled_width (integer): ${pooled_width_comment} Default: 1
-            spatial_scale (float): ${spatial_scale_comment} Default: 1.0
-
-        Returns:
-            Variable: ${out_comment}.
-
-        Examples:
-            .. code-block:: python
-
-                pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0)
-        """
-        helper = LayerHelper('roi_pool', **locals())
-        dtype = helper.input_dtype()
-        pool_out = helper.create_variable_for_type_inference(dtype)
-        argmaxes = helper.create_variable_for_type_inference(dtype='int32')
-        helper.append_op(
-            type="roi_pool",
-            inputs={"X": input,
-                    "ROIs": rois},
-            outputs={"Out": pool_out,
-                     "Argmax": argmaxes},
-            attrs={
-                "pooled_height": pooled_height,
-                "pooled_width": pooled_width,
-                "spatial_scale": spatial_scale
-            })
-        return pool_out
+@templatedoc()
+def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): ${x_comment}
+        rois (Variable): ROIs (Regions of Interest) to pool over.
+        pooled_height (integer): ${pooled_height_comment} Default: 1
+        pooled_width (integer): ${pooled_width_comment} Default: 1
+        spatial_scale (float): ${spatial_scale_comment} Default: 1.0
+
+    Returns:
+        Variable: ${out_comment}.
+
+    Examples:
+        .. code-block:: python
+
+            pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0)
+    """
+    helper = LayerHelper('roi_pool', **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+    argmaxes = helper.create_variable_for_type_inference(dtype='int32')
+    helper.append_op(
+        type="roi_pool",
+        inputs={"X": input,
+                "ROIs": rois},
+        outputs={"Out": pool_out,
+                 "Argmax": argmaxes},
+        attrs={
+            "pooled_height": pooled_height,
+            "pooled_width": pooled_width,
+            "spatial_scale": spatial_scale
+        })
+    return pool_out
 
 
 @templatedoc()
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 66eb1229aa..6c18af7283 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -100,26 +100,27 @@ Examples:
     >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3)
 """
 
-if os.name != 'nt':
-    __all__ += ['cumsum']
-
-    _cum_sum_ = generate_layer_fn('cumsum')
-
-    def cumsum(x, axis=None, exclusive=None, reverse=None):
-        locals_var = locals().keys()
-        kwargs = dict()
-        for name in locals_var:
-            val = locals()[name]
-            if val is not None:
-                kwargs[name] = val
-        return _cum_sum_(**kwargs)
-
-    cumsum.__doc__ = _cum_sum_.__doc__ + """
-    Examples:
-    
-        >>> data = fluid.layers.data(name="input", shape=[32, 784])
-        >>> result = fluid.layers.cumsum(data, axis=0)
-    """
+__all__ += ['cumsum']
+
+_cum_sum_ = generate_layer_fn('cumsum')
+
+
+def cumsum(x, axis=None, exclusive=None, reverse=None):
+    locals_var = locals().keys()
+    kwargs = dict()
+    for name in locals_var:
+        val = locals()[name]
+        if val is not None:
+            kwargs[name] = val
+    return _cum_sum_(**kwargs)
+
+
+cumsum.__doc__ = _cum_sum_.__doc__ + """
+Examples:
+
+    >>> data = fluid.layers.data(name="input", shape=[32, 784])
+    >>> result = fluid.layers.cumsum(data, axis=0)
+"""
 
 __all__ += ['thresholded_relu']
 

From a1fa18542f308cc6c8a495f36ef72428b03ee704 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Sun, 18 Nov 2018 12:56:03 +0800
Subject: [PATCH 015/113] rollback test=develop

---
 paddle/fluid/operators/math/jit_code.cc | 5 -----
 paddle/fluid/platform/cpu_info.h        | 5 -----
 paddle/fluid/platform/enforce.h         | 5 -----
 3 files changed, 15 deletions(-)

diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc
index 0f4b8f65ac..e3b600d442 100644
--- a/paddle/fluid/operators/math/jit_code.cc
+++ b/paddle/fluid/operators/math/jit_code.cc
@@ -118,12 +118,7 @@ void VXXJitCode::generate() {
   ret();
 }
 
-#ifdef _WIN32
-#define ALIGN32
-#else
 #define ALIGN32 __attribute__((aligned(32)))
-#endif
-
 #define EXP_HIG 88.3762626647949f
 #define EXP_LOW -88.3762626647949f
 #define CEPHES_LOG2EF 1.44269504088896341
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index 1b4840d9a1..6810a1651a 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -14,11 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#ifdef _WIN32
-#undef __AVX__
-#undef __AVX2__
-#endif
-
 #include <stddef.h>
 
 namespace paddle {
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index c03bbd59ac..a251bfcd99 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -14,11 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#ifdef _WIN32
-#undef __AVX__
-#undef __AVX2__
-#endif
-
 #ifdef __GNUC__
 #include <cxxabi.h>  // for __cxa_demangle
 #endif               // __GNUC__

From c59d3e83bc98d2dd3a8d9370b368c7d12c97d314 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Sun, 18 Nov 2018 18:06:09 +0800
Subject: [PATCH 016/113] test case fix

---
 paddle/fluid/platform/enforce.h | 64 ++++++++-------------------------
 1 file changed, 15 insertions(+), 49 deletions(-)

diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index a251bfcd99..3643d2ad15 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -127,14 +127,14 @@ struct EOFException : public std::exception {
 #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
 #else
 // there is no equivalent intrinsics in msvc.
-#define UNLIKELY(condition) (condition == 0)
+#define UNLIKELY(condition) (condition)
 #endif
 
 #if !defined(_WIN32)
 #define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1)
 #else
 // there is no equivalent intrinsics in msvc.
-#define LIKELY(condition) (condition != 0)
+#define LIKELY(condition) !(condition)
 #endif
 
 template <typename... Args>
@@ -248,7 +248,6 @@ inline void throw_on_error(T e) {
   throw_on_error(e, "");
 }
 
-#if !defined(_WIN32)
 #define PADDLE_THROW(...)                                              \
   do {                                                                 \
     throw ::paddle::platform::EnforceNotMet(                           \
@@ -272,17 +271,6 @@ inline void throw_on_error(T e) {
 #define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__);
 #endif  // REPLACE_ENFORCE_GLOG
 
-#else  // !_WIN32
-// disable enforce, caused by the varardic macro exception error
-#define PADDLE_THROW(x)                                      \
-  do {                                                       \
-    throw std::make_exception_ptr(                           \
-        std::runtime_error("Windows disable the enforce.")); \
-  } while (false)
-
-#define PADDLE_ENFORCE(x, ...) x
-#endif  // !_WIN32
-
 #define PADDLE_THROW_EOF()                                                     \
   do {                                                                         \
     throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
@@ -302,20 +290,6 @@ inline void throw_on_error(T e) {
  *    extra messages is also supported, for example:
  *    PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
  */
-#if !defined(_WIN32)
-#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \
-  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__)
-#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \
-  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__)
-#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \
-  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__)
-#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \
-  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__)
-#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \
-  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__)
-#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
-  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
-
 #define PADDLE_ENFORCE_NOT_NULL(__VAL, ...)                  \
   do {                                                       \
     if (UNLIKELY(nullptr == (__VAL))) {                      \
@@ -335,27 +309,19 @@ inline void throw_on_error(T e) {
                    paddle::string::Sprintf("" __VA_ARGS__));            \
     }                                                                   \
   } while (0)
-#else
-#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) ((__VAL0) == (__VAL1))
-#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) ((__VAL0) != (__VAL1))
-#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) ((__VAL0) > (__VAL1))
-#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) ((__VAL0) >= (__VAL1))
-#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) ((__VAL0) < (__VAL1))
-#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) ((__VAL0) <= (__VAL1))
-
-#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \
-  do {                                                                 \
-    if (!((__VAL0)__CMP(__VAL1))) {                                    \
-      PADDLE_THROW("Windows disable the enforce. Enforce failed.");    \
-    }                                                                  \
-  } while (0)
-#define PADDLE_ENFORCE_NOT_NULL(__VAL1, ...)                       \
-  do {                                                             \
-    if (nullptr == (__VAL1)) {                                     \
-      PADDLE_THROW("Windows disable the enforce. Enforce failed"); \
-    }                                                              \
-  } while (0)
-#endif  // !_WIN32
+
+#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__)
+#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__)
+#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__)
+#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__)
+#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__)
+#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
 
 }  // namespace platform
 }  // namespace paddle

From 7d51a0e887c121b292a217e2ef3898b5c619b48a Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Mon, 19 Nov 2018 10:07:45 +0800
Subject: [PATCH 017/113] disable DSO by default on windows

---
 CMakeLists.txt                        | 2 ++
 paddle/fluid/operators/CMakeLists.txt | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c2804e234d..0b42e60e17 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -133,6 +133,8 @@ endif()
 if (WIN32)
     set(WITH_AVX OFF CACHE STRING
             "Disable AVX when compiling for Windows" FORCE)
+    set(WITH_DSO OFF CACHE STRING
+            "Disable DSO when compiling for Windows" FORCE)
 endif()
 
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 284bf5dc9e..73f44f3b67 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -35,7 +35,7 @@ endif()
 register_operators(EXCLUDES warpctc_op)
 
 # warpctc_cudnn need cudnn 7 above
-if (WITH_GPU)
+if (WITH_GPU AND NOT WIN32)
     if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
         op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
     else()

From 1aff40a4c600d88fffc9117fc37d8feb0e7050e4 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Mon, 19 Nov 2018 10:32:50 +0800
Subject: [PATCH 018/113] exclude warpctc_op on windows

---
 paddle/fluid/operators/CMakeLists.txt | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 73f44f3b67..9b1b272292 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -32,7 +32,9 @@ if (WITH_GPU AND TENSORRT_FOUND)
     add_subdirectory(tensorrt)
 endif()
 
-register_operators(EXCLUDES warpctc_op)
+if (NOT WIN32)
+    register_operators(EXCLUDES warpctc_op)
+endif()
 
 # warpctc_cudnn need cudnn 7 above
 if (WITH_GPU AND NOT WIN32)
@@ -47,10 +49,10 @@ endif()
 
 set(COMMON_OP_DEPS "")
 
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory concat_and_split cross_entropy softmax vol2col im2col sampler)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} lstm_compute matrix_bit_code sequence2batch gru_compute activation_functions jit_kernel)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
 if (NOT WIN32)
-  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
+    set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 endif()
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub)

From 8cf63475b096e0ce1f8421d644897fd294ec9c18 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Mon, 19 Nov 2018 10:53:16 +0800
Subject: [PATCH 019/113] exclude the dynload_warpctc out on windows
 test=develop

---
 paddle/fluid/operators/CMakeLists.txt | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 9b1b272292..041de46ff5 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -49,11 +49,12 @@ endif()
 
 set(COMMON_OP_DEPS "")
 
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
 if (NOT WIN32)
     set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 endif()
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub)
 endif()

From 449406434ec680dff564847cad0d453590282e99 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Mon, 19 Nov 2018 10:58:23 +0800
Subject: [PATCH 020/113] fix the scripts error test=develop

---
 paddle/fluid/operators/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 041de46ff5..60a42cf568 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -53,7 +53,7 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows
 if (NOT WIN32)
     set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 endif()
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub)

From cc319f64cbd2b2cccb0fe4e9117c1517927ba515 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Mon, 19 Nov 2018 11:15:12 +0800
Subject: [PATCH 021/113] disable avx on windows by default test=develop

---
 cmake/simd.cmake | 54 ++++++++++++++++++++++++------------------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/cmake/simd.cmake b/cmake/simd.cmake
index 4926fb9913..86096d4fea 100644
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@@ -57,21 +57,21 @@ int main()
     return 0;
 }" SSE3_FOUND)
 
-# Check AVX
-set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
-set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
-    __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
-    __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
-    __m256 result = _mm256_add_ps (a, b);
-    return 0;
-}" AVX_FOUND)
-
-# disable AVX2 by default on windows
+# disable AVX by default on windows
 if(NOT WIN32)
+    # Check AVX
+    set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
+    set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+    CHECK_CXX_SOURCE_RUNS("
+    #include <immintrin.h>
+    int main()
+    {
+        __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
+        __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
+        __m256 result = _mm256_add_ps (a, b);
+        return 0;
+    }" AVX_FOUND)
+
     # Check AVX 2
     set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
     set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
@@ -83,20 +83,20 @@ if(NOT WIN32)
         __m256i result = _mm256_abs_epi32 (a);
         return 0;
     }" AVX2_FOUND)
-endif(NOT WIN32)
 
-# Check AVX512F
-set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
-set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
-    __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
-                                  13, -5, 6, -7, 9, 2, -6, 3);
-    __m512i result = _mm512_abs_epi32 (a);
-    return 0;
-}" AVX512F_FOUND)
+    # Check AVX512F
+    set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
+    set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+    CHECK_CXX_SOURCE_RUNS("
+    #include <immintrin.h>
+    int main()
+    {
+        __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
+                                      13, -5, 6, -7, 9, 2, -6, 3);
+        __m512i result = _mm512_abs_epi32 (a);
+        return 0;
+    }" AVX512F_FOUND)
+endif(NOT WIN32)
 
 set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
 mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)

From 4a6769da84ac9f8a5dafbc0b9a00ef70944a2395 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Mon, 19 Nov 2018 11:39:56 +0800
Subject: [PATCH 022/113] re-organize the cmake file

---
 cmake/simd.cmake                      | 54 +++++++++++++--------------
 paddle/fluid/operators/CMakeLists.txt |  5 ++-
 2 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/cmake/simd.cmake b/cmake/simd.cmake
index 4926fb9913..86096d4fea 100644
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@@ -57,21 +57,21 @@ int main()
     return 0;
 }" SSE3_FOUND)
 
-# Check AVX
-set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
-set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
-    __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
-    __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
-    __m256 result = _mm256_add_ps (a, b);
-    return 0;
-}" AVX_FOUND)
-
-# disable AVX2 by default on windows
+# disable AVX by default on windows
 if(NOT WIN32)
+    # Check AVX
+    set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
+    set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+    CHECK_CXX_SOURCE_RUNS("
+    #include <immintrin.h>
+    int main()
+    {
+        __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
+        __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
+        __m256 result = _mm256_add_ps (a, b);
+        return 0;
+    }" AVX_FOUND)
+
     # Check AVX 2
     set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
     set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
@@ -83,20 +83,20 @@ if(NOT WIN32)
         __m256i result = _mm256_abs_epi32 (a);
         return 0;
     }" AVX2_FOUND)
-endif(NOT WIN32)
 
-# Check AVX512F
-set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
-set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
-    __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
-                                  13, -5, 6, -7, 9, 2, -6, 3);
-    __m512i result = _mm512_abs_epi32 (a);
-    return 0;
-}" AVX512F_FOUND)
+    # Check AVX512F
+    set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
+    set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+    CHECK_CXX_SOURCE_RUNS("
+    #include <immintrin.h>
+    int main()
+    {
+        __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
+                                      13, -5, 6, -7, 9, 2, -6, 3);
+        __m512i result = _mm512_abs_epi32 (a);
+        return 0;
+    }" AVX512F_FOUND)
+endif(NOT WIN32)
 
 set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
 mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 9b1b272292..60a42cf568 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -49,11 +49,12 @@ endif()
 
 set(COMMON_OP_DEPS "")
 
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
 if (NOT WIN32)
     set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 endif()
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub)
 endif()

From 6e23d6a2d7e918d18e20380f9ac2192a7aaa91c8 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Mon, 19 Nov 2018 13:46:21 +0800
Subject: [PATCH 023/113] disable mkl on windows by default

---
 CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0b42e60e17..d9b797f3d1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -135,6 +135,8 @@ if (WIN32)
             "Disable AVX when compiling for Windows" FORCE)
     set(WITH_DSO OFF CACHE STRING
             "Disable DSO when compiling for Windows" FORCE)
+    set(WITH_MKL OFF CACHE STRING
+            "Disable MKL when compiling for Windows" FORCE)
 endif()
 
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING

From 8443961a4f8b09ca1cfe632633ef21df87f5788a Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Mon, 19 Nov 2018 16:55:49 +0800
Subject: [PATCH 024/113] add warp_ctc back

---
 paddle/fluid/operators/CMakeLists.txt | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 60a42cf568..412ab66709 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -32,9 +32,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
     add_subdirectory(tensorrt)
 endif()
 
-if (NOT WIN32)
-    register_operators(EXCLUDES warpctc_op)
-endif()
+register_operators(EXCLUDES warpctc_op)
 
 # warpctc_cudnn need cudnn 7 above
 if (WITH_GPU AND NOT WIN32)

From a43bc612ad2590eeab42196b0adf87288f1c41f4 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Mon, 19 Nov 2018 17:08:06 +0800
Subject: [PATCH 025/113] fix the dependency

---
 paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
index 6611e2e4b3..b6811f9183 100644
--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
@@ -1 +1 @@
-nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce)
+nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce device_context)

From 81f750a88c354a7f34ee6732a152a87b0ee25d2f Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Mon, 19 Nov 2018 17:10:38 +0800
Subject: [PATCH 026/113] fix the dependency

---
 paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
index 6611e2e4b3..b6811f9183 100644
--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
@@ -1 +1 @@
-nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce)
+nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce device_context)

From 16bc8f2a755438cac5a279f27b082dbaf0e3e3a0 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Mon, 19 Nov 2018 21:52:59 +0800
Subject: [PATCH 027/113] Add debug info

---
 .dockerignore |  1 +
 Dockerfile    | 86 +++++++++++++++++++++++++--------------------------
 2 files changed, 44 insertions(+), 43 deletions(-)

diff --git a/.dockerignore b/.dockerignore
index 2b2e74053d..49adfe4f0a 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,5 +1,6 @@
 *.DS_Store
 build/
+build*
 *.user
 .vscode
 .idea
diff --git a/Dockerfile b/Dockerfile
index c8b9eed6d6..b36102175c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -71,46 +71,46 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
 # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
 # version(1.7.1 for now), which causes building documentation failed.
-RUN pip3 install -U wheel && \
-    pip3 install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
-    easy_install -U pip && \
-    pip install -U pip setuptools wheel && \
-    pip install -U docopt PyYAML sphinx==1.5.6 && \
-    pip install sphinx-rtd-theme==0.1.9 recommonmark
-
-RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3 install opencv-python && \
-    pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip install opencv-python
-
-#For docstring checker
-RUN pip3 install pylint pytest astroid isort
-RUN pip install pylint pytest astroid isort LinkChecker
-
-COPY ./python/requirements.txt /root/
-RUN pip3 install -r /root/requirements.txt
-RUN pip install -r /root/requirements.txt
-
-# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
-# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
-RUN apt-get install -y libssl-dev libffi-dev
-RUN pip3 install certifi urllib3[secure]
-RUN pip install certifi urllib3[secure]
-
-
-# Install woboq_codebrowser to /woboq
-RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
-    (cd /woboq \
-     cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-           -DCMAKE_BUILD_TYPE=Release . \
-     make)
-
-# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
-RUN mkdir /var/run/sshd
-RUN echo 'root:root' | chpasswd
-RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
-RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
-EXPOSE 22
+# RUN pip3 install -U wheel && \
+    # pip3 install -U docopt PyYAML sphinx==1.5.6 && \
+    # pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
+    # easy_install -U pip && \
+    # pip install -U pip setuptools wheel && \
+    # pip install -U docopt PyYAML sphinx==1.5.6 && \
+    # pip install sphinx-rtd-theme==0.1.9 recommonmark
+
+# RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    # pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    # pip3 install opencv-python && \
+    # pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    # pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    # pip install opencv-python
+
+# #For docstring checker
+# RUN pip3 install pylint pytest astroid isort
+# RUN pip install pylint pytest astroid isort LinkChecker
+
+# COPY ./python/requirements.txt /root/
+# RUN pip3 install -r /root/requirements.txt
+# RUN pip install -r /root/requirements.txt
+
+# # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
+# # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
+# RUN apt-get install -y libssl-dev libffi-dev
+# RUN pip3 install certifi urllib3[secure]
+# RUN pip install certifi urllib3[secure]
+
+
+# # Install woboq_codebrowser to /woboq
+# RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
+    # (cd /woboq \
+     # cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
+           # -DCMAKE_BUILD_TYPE=Release . \
+     # make)
+
+# # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
+# RUN mkdir /var/run/sshd
+# RUN echo 'root:root' | chpasswd
+# RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
+# RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
+# EXPOSE 22

From 3f73c0a70d641b2b84b4764ee55f3f942fb2c6da Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Tue, 20 Nov 2018 10:26:59 +0800
Subject: [PATCH 028/113] fix the build issue on windows

---
 paddle/fluid/memory/allocation/cpu_allocator.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h
index 9e0044c47a..165f11cd3b 100644
--- a/paddle/fluid/memory/allocation/cpu_allocator.h
+++ b/paddle/fluid/memory/allocation/cpu_allocator.h
@@ -15,6 +15,11 @@
 #pragma once
 #include "paddle/fluid/memory/allocation/allocator.h"
 
+#ifdef _WIN32
+#define posix_memalign_free _aligned_free
+#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
+#endif
+
 namespace paddle {
 namespace memory {
 namespace allocation {

From 301ed153231f0e0f6066663c1adc572af4907c97 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Tue, 20 Nov 2018 13:36:10 +0800
Subject: [PATCH 029/113] remove unsupported flag on windows

---
 python/paddle/fluid/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index a7dfc6e9e3..091697aaa5 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -116,7 +116,7 @@ def __bootstrap__():
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope',
         'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb',
         'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb',
+        "dist_threadpool_size", 'eager_delete_tensor_gb',
         'allocator_strategy', 'reader_queue_speed_test_mode'
     ]
     if os.name != 'nt':

From 935387f3fc4c36a13443443cb820868b65a3c667 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Tue, 20 Nov 2018 14:40:19 +0800
Subject: [PATCH 030/113] code style

---
 python/paddle/fluid/__init__.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 2a28f7b2d1..6a4a5e098f 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -116,9 +116,8 @@ def __bootstrap__():
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope',
         'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb',
         'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        "dist_threadpool_size", 'eager_delete_tensor_gb',
-        'allocator_strategy', 'reader_queue_speed_test_mode',
-        'print_sub_graph_dir'
+        "dist_threadpool_size", 'eager_delete_tensor_gb', 'allocator_strategy',
+        'reader_queue_speed_test_mode', 'print_sub_graph_dir'
     ]
     if os.name != 'nt':
         read_env_flags.append('warpctc_dir')

From afeadf58f9ce90d4dd05f1eb1e4936cb83bc0cde Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Tue, 20 Nov 2018 14:59:20 +0800
Subject: [PATCH 031/113] code style test=develop

---
 paddle/fluid/memory/allocation/cpu_allocator.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h
index 165f11cd3b..26d3643f4e 100644
--- a/paddle/fluid/memory/allocation/cpu_allocator.h
+++ b/paddle/fluid/memory/allocation/cpu_allocator.h
@@ -17,7 +17,8 @@
 
 #ifdef _WIN32
 #define posix_memalign_free _aligned_free
-#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
+#define posix_memalign(p, a, s) \
+  (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
 #endif
 
 namespace paddle {

From 0e1b426c839d8c91952b7d18139d3681beaa726f Mon Sep 17 00:00:00 2001
From: jerrywgz <jerrwgz@126.com>
Date: Tue, 20 Nov 2018 07:02:12 +0000
Subject: [PATCH 032/113] refine prelu api doc, test=develop

---
 python/paddle/fluid/layers/nn.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 99acd7e308..5749bcc54a 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -6900,18 +6900,18 @@ def prelu(x, mode, param_attr=None, name=None):
     """
     Equation:
 
-        y = \max(0, x) + alpha \min(0, x)
+        y = \max(0, x) + alpha * \min(0, x)
 
     Args:
         x (Variable): The input tensor.
 	  param_attr(ParamAttr|None): The parameter attribute for the learnable
-                                    weight (alpha).
-        mode (string): The mode for weight sharing
-		       all: all elements share same weight
- 		       channel:elements in a channel share same weight
- 		       element:each element has a weight
+                                      weight (alpha).
+        mode (string): The mode for weight sharing. It supports all, channel
+                       and element. all: all elements share same weight
+                       channel:elements in a channel share same weight
+                       element:each element has a weight
 	name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
+                       will be named automatically.
 
     Returns:
         Variable: The output tensor with the same shape as input.
@@ -6921,8 +6921,8 @@ def prelu(x, mode, param_attr=None, name=None):
         .. code-block:: python
 
          x = fluid.layers.data(name="x", shape=[10,10], dtype="float32")
-            mode = 'channel'
-            output = fluid.layers.prelu(x,mode)
+         mode = 'channel'
+         output = fluid.layers.prelu(x,mode)
     """
     helper = LayerHelper('prelu', **locals())
     if mode not in ['all', 'channel', 'element']:

From c2cfb03a7277a92297b4617cb5c778bb495a998b Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Tue, 20 Nov 2018 08:50:24 +0000
Subject: [PATCH 033/113] add lstm jitcode

---
 paddle/fluid/operators/math/jit_code.cc       |  49 +++++++++
 paddle/fluid/operators/math/jit_code.h        | 102 ++++++++++++++++--
 paddle/fluid/operators/math/jit_kernel.h      |  15 ++-
 paddle/fluid/operators/math/jit_kernel_impl.h |  49 +++++++++
 4 files changed, 198 insertions(+), 17 deletions(-)
 create mode 100644 paddle/fluid/operators/math/jit_kernel_impl.h

diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc
index e484e9a3c7..418c843362 100644
--- a/paddle/fluid/operators/math/jit_code.cc
+++ b/paddle/fluid/operators/math/jit_code.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/jit_code.h"
+#include <stddef.h>                                  // offsetof
 #include "paddle/fluid/operators/math/jit_kernel.h"  // TODO(TJ): remove me
 
 namespace paddle {
@@ -210,6 +211,54 @@ void VActJitCode::generate() {
   ret();
 }
 
+bool LSTMJitCode::init(int d) { return MayIUse(avx) && d % 8 == 0; }
+
+void LSTMJitCode::generate() {
+  reg64_t reg_ptr_gates = rax;
+  reg64_t reg_ptr_ct_1 = r9;
+  reg64_t reg_ptr_ct = r10;
+  reg64_t reg_ptr_ht = r11;
+  mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]);
+  mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]);
+  mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]);
+  mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]);
+
+  int offset = 0;
+  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
+    /* C_t = C_t-1 * fgated + cand_gated * igated*/
+    // c
+    vmovups(ymm_src, ptr[reg_ptr_gates + offset]);
+    act<ymm_t>(ymm_c, ymm_src, act_cand_);
+    // i
+    vmovups(ymm_src, ptr[reg_ptr_gates + offset + num_]);
+    act<ymm_t>(ymm_i, ymm_src, act_gate_);
+    vmulps(ymm_c, ymm_c, ymm_i);
+    if (first_) {
+      // f
+      vmovups(ymm_src, ptr[reg_ptr_gates + offset + 2 * num_]);
+      act<ymm_t>(ymm_f, ymm_src, act_gate_);
+      vmovups(ymm_i, ptr[reg_ptr_ct_1 + offset]);
+      vmulps(ymm_f, ymm_f, ymm_i);
+      vaddps(ymm_f, ymm_f, ymm_c);
+    }
+    /* H_t = act_cell(C_t) * ogated */
+    ymm_t ymm_ct = first_ ? ymm_c : ymm_f;
+    ymm_t ymm_o = first_ ? ymm_f : ymm_c;
+    ymm_t ymm_tmp = ymm_i;
+    act<ymm_t>(ymm_tmp, ymm_ct, act_cell_);
+    vmovups(ymm_src, ptr[reg_ptr_gates + offset + 3 * num_]);
+    act<ymm_t>(ymm_o, ymm_src, act_gate_);
+    vmulps(ymm_o, ymm_tmp, ymm_o);
+    // save ct and ht
+    vmovups(ptr[reg_ptr_ct + offset], ymm_ct);
+    vmovups(ptr[reg_ptr_ht + offset], ymm_o);
+
+    offset += sizeof(float) * YMM_FLOAT_BLOCK;
+  }
+
+  ret();
+}
+
 }  // namespace gen
 }  // namespace jitkernel
 }  // namespace math
diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h
index 65f83ff484..938b5525c1 100644
--- a/paddle/fluid/operators/math/jit_code.h
+++ b/paddle/fluid/operators/math/jit_code.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include "paddle/fluid/operators/math/jit_gen.h"
+#include "paddle/fluid/operators/math/jit_kernel_impl.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
 namespace paddle {
@@ -46,14 +47,6 @@ extern const float exp_float_consts[];
 extern const int exp_int_0x7f[];
 extern int g_tmp_mem[];
 
-// TODO(TJ): move these to some proper place
-#define SIGMOID_THRESHOLD_MIN -40.0
-#define SIGMOID_THRESHOLD_MAX 13.0
-#define EXP_MAX_INPUT 40.0
-#define XMM_FLOAT_BLOCK 4
-#define YMM_FLOAT_BLOCK 8
-#define ZMM_FLOAT_BLOCK 16
-
 #define ALIGN32 __attribute__((aligned(32)))
 #define EXP_HIG 88.3762626647949f
 #define EXP_LOW -88.3762626647949f
@@ -322,6 +315,99 @@ class VActJitCode : public JitCode {
   ymm_t ymm_dst = ymm_t(1);
 };
 
+class LSTMJitCode : public VActJitCode {
+ public:
+  const char* name() const override {
+    std::string base = "LSTMJitCode";
+    auto AddTypeStr = [&](operand_type type) {
+      switch (type) {
+        case operand_type::relu:
+          base += "_Relu";
+          break;
+        case operand_type::exp:
+          base += "_Exp";
+          break;
+        case operand_type::sigmoid:
+          base += "_Sigmoid";
+          break;
+        case operand_type::tanh:
+          base += "_Tanh";
+          break;
+        case operand_type::identity:
+          base += "_Identity";
+          break;
+        default:
+          break;
+      }
+    };
+    if (first_) {
+      base += "_C1H1";
+    }
+    AddTypeStr(act_gate_);
+    AddTypeStr(act_cand_);
+    AddTypeStr(act_cell_);
+    return base.c_str();
+  }
+
+  explicit LSTMJitCode(int d, bool first, operand_type act_gate,
+                       operand_type act_cand, operand_type act_cell,
+                       size_t code_size = 256 * 1024, void* code_ptr = nullptr)
+      : VActJitCode(d, act_gate, code_size, code_ptr),
+        num_(d),
+        first_(first),
+        act_gate_(act_gate),
+        act_cand_(act_cand),
+        act_cell_(act_cell) {}
+  static bool init(int d);
+  void generate() override;
+
+ protected:
+  int num_;
+  bool first_;
+  operand_type act_gate_;
+  operand_type act_cand_;
+  operand_type act_cell_;
+  reg64_t param1{abi_param1};
+
+  xmm_t xmm_src = xmm_t(0);
+  xmm_t xmm_c = xmm_t(1);
+  xmm_t xmm_i = xmm_t(2);
+  xmm_t xmm_f = xmm_t(3);
+
+  ymm_t ymm_src = ymm_t(0);
+  ymm_t ymm_c = ymm_t(1);
+  ymm_t ymm_i = ymm_t(2);
+  ymm_t ymm_f = ymm_t(3);
+
+  template <typename JMM>
+  void act(JMM& dst, JMM& src, operand_type type) {  // NOLINT
+    // use 15
+    JMM zero = JMM(15);
+    if (type_ == operand_type::relu) {
+      vxorps(zero, zero, zero);
+    }
+    switch (type) {
+      case operand_type::relu:
+        relu_jmm<JMM>(dst, src, zero);
+        break;
+      case operand_type::exp:
+        exp_jmm<JMM>(dst, src, 2, 3, 4, 5);
+        break;
+      case operand_type::sigmoid:
+        sigmoid_jmm<JMM>(dst, src, 2, 3, 4, 5);
+        break;
+      case operand_type::tanh:
+        tanh_jmm<JMM>(dst, src, 2, 3, 4, 5);
+        break;
+      case operand_type::identity:
+        break;
+      default:
+        // throw error
+        break;
+    }
+  }
+};
+
 }  // namespace gen
 }  // namespace jitkernel
 }  // namespace math
diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h
index 7e163c1349..b5e54fcc1b 100644
--- a/paddle/fluid/operators/math/jit_kernel.h
+++ b/paddle/fluid/operators/math/jit_kernel.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <memory>  // for shared_ptr
 #include <string>
 #include <unordered_map>
+#include "paddle/fluid/operators/math/jit_kernel_impl.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/macros.h"
 
@@ -26,14 +27,7 @@ namespace operators {
 namespace math {
 namespace jitkernel {
 
-// TODO(TJ): move these to some proper place
-#define SIGMOID_THRESHOLD_MIN -40.0
-#define SIGMOID_THRESHOLD_MAX 13.0
-#define EXP_MAX_INPUT 40.0
-#define XMM_FLOAT_BLOCK 4
-#define YMM_FLOAT_BLOCK 8
-#define ZMM_FLOAT_BLOCK 16
-
+// TODO(TJ): remove me
 typedef enum { kLT8, kEQ8, kGT8LT16, kEQ16, kGT16 } jit_block;
 
 class Kernel {
@@ -124,10 +118,13 @@ class LSTMKernel : public Kernel {
                            const T *wp_data = nullptr,
                            T *checked = nullptr) const = 0;
 
-  // compute c1 and h1 without c0 or h0
   virtual void ComputeC1H1(T *gates, T *ct, T *ht,
                            /* below only used in peephole*/
                            const T *wp_data = nullptr) const = 0;
+
+  //  void (*ComputeCtHt)(lstm_t *);
+  // // compute c1 and h1 without c0 or h0
+  //  void (*ComputeC1H1)(lstm_t *);
 };
 
 template <typename T>
diff --git a/paddle/fluid/operators/math/jit_kernel_impl.h b/paddle/fluid/operators/math/jit_kernel_impl.h
new file mode 100644
index 0000000000..fcb6a7c097
--- /dev/null
+++ b/paddle/fluid/operators/math/jit_kernel_impl.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+#include <type_traits>
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace jitkernel {
+
+#define SIGMOID_THRESHOLD_MIN -40.0
+#define SIGMOID_THRESHOLD_MAX 13.0
+#define EXP_MAX_INPUT 40.0
+#define XMM_FLOAT_BLOCK 4
+#define YMM_FLOAT_BLOCK 8
+#define ZMM_FLOAT_BLOCK 16
+
+typedef struct {
+  void* gates;  // gates: W_ch, W_ih, W_fh, W_oh
+  const void* ct_1;
+  void* ct;
+  void* ht;
+  /* below only used in peephole*/
+  const void* wp_data{nullptr};
+  void* checked{nullptr};
+} lstm_t;
+
+typedef struct {
+  int d;
+  std::string act_gate, act_cand, act_cell;
+} lstm_attr_t;
+
+}  // namespace jitkernel
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle

From 5d6b370a4968fc4bc7dea369ee588ebec0b8f660 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Tue, 20 Nov 2018 20:17:16 +0800
Subject: [PATCH 034/113] fix issue

---
 paddle/fluid/platform/enforce.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 3643d2ad15..31309738a5 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -134,7 +134,7 @@ struct EOFException : public std::exception {
 #define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1)
 #else
 // there is no equivalent intrinsics in msvc.
-#define LIKELY(condition) !(condition)
+#define LIKELY(condition) (condition)
 #endif
 
 template <typename... Args>

From ce31deb7e938270249b719bce93ef6d8baf5c0c4 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Tue, 20 Nov 2018 12:37:43 +0000
Subject: [PATCH 035/113] refine refer code and add lstm refer code

test=develop
---
 .../fluid/operators/math/jit_kernel_blas.cc   |  65 +------
 paddle/fluid/operators/math/jit_kernel_exp.cc |  40 +---
 paddle/fluid/operators/math/jit_kernel_impl.h |   6 +-
 .../fluid/operators/math/jit_kernel_refer.h   | 171 ++++++++++++++++++
 .../fluid/operators/math/jit_kernel_test.cc   | 139 +++-----------
 5 files changed, 220 insertions(+), 201 deletions(-)
 create mode 100644 paddle/fluid/operators/math/jit_kernel_refer.h

diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc
index 36a50f2043..90b7029371 100644
--- a/paddle/fluid/operators/math/jit_kernel_blas.cc
+++ b/paddle/fluid/operators/math/jit_kernel_blas.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/jit_kernel.h"
 #include <string>
 #include "paddle/fluid/operators/math/jit_kernel_macro.h"
+#include "paddle/fluid/operators/math/jit_kernel_refer.h"
 #include "paddle/fluid/platform/enforce.h"
 
 #ifdef PADDLE_WITH_XBYAK
@@ -31,49 +32,6 @@ namespace math {
 namespace jitkernel {
 namespace jit = platform::jit;
 
-template <typename T>
-void VMulRefer(const T* x, const T* y, T* z, int n) {
-  for (int i = 0; i < n; ++i) {
-    z[i] = x[i] * y[i];
-  }
-}
-
-template <typename T>
-void VAddRefer(const T* x, const T* y, T* z, int n) {
-  for (int i = 0; i < n; ++i) {
-    z[i] = x[i] + y[i];
-  }
-}
-
-template <typename T>
-void VAddReluRefer(const T* x, const T* y, T* z, int n) {
-  for (int i = 0; i < n; ++i) {
-    z[i] = x[i] + y[i];
-    z[i] = z[i] > 0 ? z[i] : 0;
-  }
-}
-
-template <typename T>
-void VScalRefer(const T* a, const T* x, T* y, int n) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = a[0] * x[i];
-  }
-}
-
-template <typename T>
-void VAddBiasRefer(const T* a, const T* x, T* y, int n) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = a[0] + x[i];
-  }
-}
-
-template <typename T>
-void VReluRefer(const T* x, T* y, int n) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] > 0 ? x[i] : 0;
-  }
-}
-
 #ifdef PADDLE_WITH_MKLML
 template <typename T>
 void VMulMKL(const T* x, const T* y, T* z, int n);
@@ -109,7 +67,7 @@ void VScalMKL<float>(const float* a, const float* x, float* y, int n) {
   if (x == y) {
     platform::dynload::cblas_sscal(n, *a, y, 1);
   } else {
-    VScalRefer<float>(a, x, y, n);
+    refer::VScal<float>(a, x, y, n);
   }
 }
 
@@ -118,7 +76,7 @@ void VScalMKL<double>(const double* a, const double* x, double* y, int n) {
   if (x == y) {
     platform::dynload::cblas_dscal(n, *a, y, 1);
   } else {
-    VScalRefer<double>(a, x, y, n);
+    refer::VScal<double>(a, x, y, n);
   }
 }
 
@@ -147,7 +105,7 @@ class VMulKernelImpl : public VMulKernel<T> {
       return;
     }
 #endif
-    this->Compute = VMulRefer<T>;
+    this->Compute = refer::VMul<T>;
   }
 
 #ifdef PADDLE_WITH_XBYAK
@@ -198,7 +156,7 @@ class VAddKernelImpl : public VAddKernel<T> {
       return;
     }
 #endif
-    this->Compute = VAddRefer<T>;
+    this->Compute = refer::VAdd<T>;
   }
 #ifdef PADDLE_WITH_XBYAK
 
@@ -242,7 +200,7 @@ class VAddReluKernelImpl : public VAddReluKernel<T> {
       return;
     }
 #endif
-    this->Compute = VAddReluRefer<T>;
+    this->Compute = refer::VAddRelu<T>;
   }
 #ifdef PADDLE_WITH_XBYAK
 
@@ -280,7 +238,7 @@ class VScalKernelImpl : public VScalKernel<T> {
       return;
     }
 #endif
-    this->Compute = VScalRefer<T>;
+    this->Compute = refer::VScal<T>;
   }
 #ifdef PADDLE_WITH_XBYAK
 
@@ -324,7 +282,7 @@ class VAddBiasKernelImpl : public VAddBiasKernel<T> {
     }
 #endif
 
-    this->Compute = VAddBiasRefer<T>;
+    this->Compute = refer::VAddBias<T>;
   }
 #ifdef PADDLE_WITH_XBYAK
 
@@ -358,7 +316,7 @@ class VReluKernelImpl : public VReluKernel<T> {
     }
 #endif
 
-    this->Compute = VReluRefer<T>;
+    this->Compute = refer::VRelu<T>;
   }
 #ifdef PADDLE_WITH_XBYAK
 
@@ -374,16 +332,13 @@ bool VReluKernelImpl<float>::useJIT(int d) {
 }
 #endif
 
-template <typename T>
-inline void VIdentityRefer(const T* x, T* y, int n) {}
-
 /* An empty JitKernel */
 template <typename T>
 class VIdentityKernelImpl : public VIdentityKernel<T> {
  public:
   JITKERNEL_DECLARE_STATIC_FUNC;
   explicit VIdentityKernelImpl(int d) : VIdentityKernel<T>() {
-    this->Compute = VIdentityRefer<T>;
+    this->Compute = refer::VIdentity<T>;
   }
 };
 
diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc
index f26815300d..1fe7d66c75 100644
--- a/paddle/fluid/operators/math/jit_kernel_exp.cc
+++ b/paddle/fluid/operators/math/jit_kernel_exp.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/jit_kernel.h"
-#include <cmath>  // for exp
 #include <string>
 #include "paddle/fluid/operators/math/jit_kernel_macro.h"
+#include "paddle/fluid/operators/math/jit_kernel_refer.h"
 
 #ifdef PADDLE_WITH_XBYAK
 #include "paddle/fluid/operators/math/jit_code.h"
@@ -35,38 +35,6 @@ namespace math {
 namespace jitkernel {
 namespace jit = platform::jit;
 
-// TODO(TJ): move refer codes to one file
-// Refer code only focus on correctness
-template <typename T>
-void VExpRefer(const T* x, T* y, int n) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = std::exp(x[i]);
-  }
-}
-
-template <typename T>
-void VSigmoidRefer(const T* x, T* y, int n) {
-  // y = 1 / (1 + e^-x)
-  const T min = SIGMOID_THRESHOLD_MIN;
-  const T max = SIGMOID_THRESHOLD_MAX;
-  for (int i = 0; i < n; ++i) {
-    T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
-    y[i] = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-tmp));
-  }
-}
-
-template <typename T>
-void VTanhRefer(const T* x, T* y, int n) {
-  // y = 2 * sigmoid(2x) - 1
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(2) * x[i];
-  }
-  VSigmoidRefer(y, y, n);
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(2) * y[i] - static_cast<T>(1);
-  }
-}
-
 #ifdef PADDLE_WITH_MKLML
 // try to use MKL to speedup
 template <typename T>
@@ -129,7 +97,7 @@ class VExpKernelImpl : public VExpKernel<T> {
       return;
     }
 #endif
-    this->Compute = VExpRefer<T>;
+    this->Compute = refer::VExp<T>;
   }
 
 #ifdef PADDLE_WITH_XBYAK
@@ -182,7 +150,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
       return;
     }
 #endif
-    this->Compute = VSigmoidRefer<T>;
+    this->Compute = refer::VSigmoid<T>;
   }
 
 #ifdef PADDLE_WITH_XBYAK
@@ -234,7 +202,7 @@ class VTanhKernelImpl : public VTanhKernel<T> {
       return;
     }
 #endif
-    this->Compute = VTanhRefer<T>;
+    this->Compute = refer::VTanh<T>;
   }
 
 #ifdef PADDLE_WITH_XBYAK
diff --git a/paddle/fluid/operators/math/jit_kernel_impl.h b/paddle/fluid/operators/math/jit_kernel_impl.h
index fcb6a7c097..337d5ae914 100644
--- a/paddle/fluid/operators/math/jit_kernel_impl.h
+++ b/paddle/fluid/operators/math/jit_kernel_impl.h
@@ -38,9 +38,13 @@ typedef struct {
   void* checked{nullptr};
 } lstm_t;
 
-typedef struct {
+typedef struct lstm_attr_s {
   int d;
   std::string act_gate, act_cand, act_cell;
+  lstm_attr_s() = default;
+  lstm_attr_s(int _d, const std::string& _act_gate,
+              const std::string& _act_cand, const std::string& _act_cell)
+      : d(_d), act_gate(_act_gate), act_cand(_act_cand), act_cell(_act_cell) {}
 } lstm_attr_t;
 
 }  // namespace jitkernel
diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h
new file mode 100644
index 0000000000..9c60ebc587
--- /dev/null
+++ b/paddle/fluid/operators/math/jit_kernel_refer.h
@@ -0,0 +1,171 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cmath>
+#include <string>
+#include "paddle/fluid/operators/math/jit_kernel_impl.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace jitkernel {
+namespace refer {
+/* Refer code only focus on correctness */
+
+template <typename T>
+void VMul(const T* x, const T* y, T* z, int n) {
+  for (int i = 0; i < n; ++i) {
+    z[i] = x[i] * y[i];
+  }
+}
+
+template <typename T>
+void VAdd(const T* x, const T* y, T* z, int n) {
+  for (int i = 0; i < n; ++i) {
+    z[i] = x[i] + y[i];
+  }
+}
+
+template <typename T>
+void VAddRelu(const T* x, const T* y, T* z, int n) {
+  for (int i = 0; i < n; ++i) {
+    z[i] = x[i] + y[i];
+    z[i] = z[i] > 0 ? z[i] : 0;
+  }
+}
+
+template <typename T>
+void VScal(const T* a, const T* x, T* y, int n) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = a[0] * x[i];
+  }
+}
+
+template <typename T>
+void VAddBias(const T* a, const T* x, T* y, int n) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = a[0] + x[i];
+  }
+}
+
+template <typename T>
+void VRelu(const T* x, T* y, int n) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = x[i] > 0 ? x[i] : 0;
+  }
+}
+
+template <typename T>
+inline void VIdentity(const T* x, T* y, int n) {}
+
+template <typename T>
+void VExp(const T* x, T* y, int n) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = std::exp(x[i]);
+  }
+}
+
+template <typename T>
+void VSigmoid(const T* x, T* y, int n) {
+  // y = 1 / (1 + e^-x)
+  const T min = SIGMOID_THRESHOLD_MIN;
+  const T max = SIGMOID_THRESHOLD_MAX;
+  for (int i = 0; i < n; ++i) {
+    T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
+    y[i] = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-tmp));
+  }
+}
+
+template <typename T>
+void VTanh(const T* x, T* y, int n) {
+  // y = 2 * sigmoid(2x) - 1
+  for (int i = 0; i < n; ++i) {
+    y[i] = static_cast<T>(2) * x[i];
+  }
+  VSigmoid(y, y, n);
+  for (int i = 0; i < n; ++i) {
+    y[i] = static_cast<T>(2) * y[i] - static_cast<T>(1);
+  }
+}
+
+template <typename T>
+void (*getActFunc(const std::string& type))(const T*, T*, int) {  // NOLINT
+  if (type == "sigmoid") {
+    return VSigmoid<T>;
+  } else if (type == "relu") {
+    return VRelu<T>;
+  } else if (type == "tanh") {
+    return VTanh<T>;
+  } else if (type == "identity" || type == "") {
+    return VIdentity<T>;
+  }
+  PADDLE_THROW("Not support type: %s", type);
+  return nullptr;
+}
+
+template <typename T>
+void LSTMCtHt(lstm_t* step, lstm_attr_t* attr) {
+  T* gates = reinterpret_cast<T*>(step->gates);
+  const T* ct_1 = reinterpret_cast<const T*>(step->ct_1);
+  T* ct = reinterpret_cast<T*>(step->ct);
+  T* ht = reinterpret_cast<T*>(step->ht);
+  auto act_gate = getActFunc<T>(attr->act_gate);
+  auto act_cand = getActFunc<T>(attr->act_cand);
+  auto act_cell = getActFunc<T>(attr->act_cell);
+  int d = attr->d;
+  int d2 = d * 2;
+  int d3 = d * 3;
+  // gates: W_ch, W_ih, W_fh, W_oh
+  act_gate(gates + d, gates + d, d3);
+
+  /* C_t = C_t-1 * fgated + cand_gated * igated */
+  act_cand(gates, gates, d);
+  VMul(gates, gates + d, gates + d, d);
+  VMul(ct_1, gates + d2, gates + d2, d);
+  VAdd(gates + d, gates + d2, ct, d);
+
+  /* H_t = act_cell(C_t) * ogated */
+  act_cell(ct, gates + d2, d);
+  VMul(gates + d2, gates + d3, ht, d);
+}
+
+template <typename T>
+void LSTMC1H1(lstm_t* step, lstm_attr_t* attr) {
+  T* gates = reinterpret_cast<T*>(step->gates);
+  const T* ct_1 = reinterpret_cast<const T*>(step->ct_1);
+  T* ct = reinterpret_cast<T*>(step->ct);
+  T* ht = reinterpret_cast<T*>(step->ht);
+  auto act_gate = getActFunc<T>(attr->act_gate);
+  auto act_cand = getActFunc<T>(attr->act_cand);
+  auto act_cell = getActFunc<T>(attr->act_cell);
+  int d = attr->d;
+  int d2 = d * 2;
+  int d3 = d * 3;
+  /* C_t = igated * cgated*/
+  act_gate(gates + d, gates + d, d);
+  act_cand(gates, gates, d);
+  VMul(gates, gates + d, ct, d);
+  /* H_t = act_cell(C_t) * ogated */
+  act_gate(gates + d3, gates + d3, d);
+  act_cell(ct, gates + d2, d);
+  Vmul(gates + d2, gates + d3, ht, d);
+}
+
+}  // namespace refer
+}  // namespace jitkernel
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc
index b6c62a2634..a1705a81c4 100644
--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 #include "gtest/gtest.h"
+#include "paddle/fluid/operators/math/jit_kernel_refer.h"
 
 #ifdef PADDLE_WITH_MKLML
 #include "paddle/fluid/platform/dynload/mklml.h"
@@ -53,12 +54,6 @@ void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
   }
 }
 
-void vrelu_ref(const int n, const float* x, float* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] > 0.f ? x[i] : 0.f;
-  }
-}
-
 #if defined __AVX__ || defined __AVX2__
 void vrelu_intri8(const int n, const float* x, float* y) {
   __m256 tmp = _mm256_loadu_ps(x);
@@ -69,6 +64,7 @@ void vrelu_intri8(const int n, const float* x, float* y) {
 
 TEST(JitKernel, vrelu) {
   namespace jit = paddle::operators::math::jitkernel;
+  namespace refer = paddle::operators::math::jitkernel::refer;
   for (int d : {3, 7, 8, 15, 16, 30, 256, 512}) {
     std::vector<float> x(d);
     std::vector<float> zref(d), ztgt(d);
@@ -80,7 +76,7 @@ TEST(JitKernel, vrelu) {
     float* zref_data = zref.data();
     auto trefs = GetCurrentUS();
     for (int i = 0; i < repeat; ++i) {
-      vrelu_ref(d, x_data, zref_data);
+      refer::VRelu<float>(x_data, zref_data, d);
     }
     auto trefe = GetCurrentUS();
 #if defined __AVX__ || defined __AVX2__
@@ -107,14 +103,9 @@ TEST(JitKernel, vrelu) {
   }
 }
 
-void vaddbias_ref(const int n, const float a, const float* x, float* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] + a;
-  }
-}
-
 TEST(JitKernel, vaddbias) {
   namespace jit = paddle::operators::math::jitkernel;
+  namespace refer = paddle::operators::math::jitkernel::refer;
   for (int d : {7, 8, 15, 16, 30, 64, 100, 128, 256}) {
     std::vector<float> x(d);
     std::vector<float> zref(d), ztgt(d);
@@ -127,7 +118,7 @@ TEST(JitKernel, vaddbias) {
     float* zref_data = zref.data();
     auto trefs = GetCurrentUS();
     for (int i = 0; i < repeat; ++i) {
-      vaddbias_ref(d, a, x_data, zref_data);
+      refer::VAddBias<float>(&a, x_data, zref_data, d);
     }
     auto trefe = GetCurrentUS();
     auto ttgts = GetCurrentUS();
@@ -145,12 +136,6 @@ TEST(JitKernel, vaddbias) {
   }
 }
 
-void vexp_ref(const int n, const float* x, float* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = std::exp(x[i]);
-  }
-}
-
 #ifdef PADDLE_WITH_MKLML
 void vexp_mkl(const int n, const float* x, float* y) {
   paddle::platform::dynload::vsExp(n, x, y);
@@ -159,6 +144,7 @@ void vexp_mkl(const int n, const float* x, float* y) {
 
 TEST(JitKernel, vexp) {
   namespace jit = paddle::operators::math::jitkernel;
+  namespace refer = paddle::operators::math::jitkernel::refer;
   for (int d : {1, 3, 4, 6, 7, 8, 12, 15, 16, 20, 30, 128, 256}) {
     std::vector<float> x(d);
     std::vector<float> zref(d), ztgt(d);
@@ -170,7 +156,7 @@ TEST(JitKernel, vexp) {
     float* zref_data = zref.data();
     auto trefs = GetCurrentUS();
     for (int i = 0; i < repeat; ++i) {
-      vexp_ref(d, x_data, zref_data);
+      refer::VExp<float>(x_data, zref_data, d);
     }
     auto trefe = GetCurrentUS();
 
@@ -203,19 +189,6 @@ TEST(JitKernel, vexp) {
   }
 }
 
-inline float _sigmoid(float x) {
-  const float min = SIGMOID_THRESHOLD_MIN;
-  const float max = SIGMOID_THRESHOLD_MAX;
-  float tmp = (x < min) ? min : ((x > max) ? max : x);
-  return 1.f / (1.f + std::exp(-tmp));
-}
-
-void vsigmoid_ref(const int n, const float* x, float* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = _sigmoid(x[i]);
-  }
-}
-
 void vsigmoid_better(
     const std::shared_ptr<
         const paddle::operators::math::jitkernel::VExpKernel<float>>& vexp,
@@ -234,6 +207,7 @@ void vsigmoid_better(
 
 TEST(JitKernel, vsigmoid) {
   namespace jit = paddle::operators::math::jitkernel;
+  namespace refer = paddle::operators::math::jitkernel::refer;
   for (int d : {1, 3, 4, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
     std::vector<float> x(d);
     std::vector<float> zref(d), ztgt(d);
@@ -252,7 +226,7 @@ TEST(JitKernel, vsigmoid) {
     auto tmkle = GetCurrentUS();
     auto trefs = GetCurrentUS();
     for (int i = 0; i < repeat; ++i) {
-      vsigmoid_ref(d, x_data, zref_data);
+      refer::VSigmoid<float>(x_data, zref_data, d);
     }
     auto trefe = GetCurrentUS();
     auto ttgts = GetCurrentUS();
@@ -271,14 +245,6 @@ TEST(JitKernel, vsigmoid) {
   }
 }
 
-inline float _tanh(float x) { return 2.f * _sigmoid(2.f * x) - 1.f; }
-
-void vtanh_ref(const int n, const float* x, float* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = _tanh(x[i]);
-  }
-}
-
 void vtanh_better(
     const std::shared_ptr<
         const paddle::operators::math::jitkernel::VScalKernel<float>>& vscal,
@@ -298,6 +264,7 @@ void vtanh_better(
 
 TEST(JitKernel, vtanh) {
   namespace jit = paddle::operators::math::jitkernel;
+  namespace refer = paddle::operators::math::jitkernel::refer;
   for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
     std::vector<float> x(d);
     std::vector<float> zref(d), ztgt(d);
@@ -320,7 +287,7 @@ TEST(JitKernel, vtanh) {
     auto tmkle = GetCurrentUS();
     auto trefs = GetCurrentUS();
     for (int i = 0; i < repeat; ++i) {
-      vtanh_ref(d, x_data, zref_data);
+      refer::VTanh<float>(x_data, zref_data, d);
     }
     auto trefe = GetCurrentUS();
     auto ttgts = GetCurrentUS();
@@ -339,32 +306,6 @@ TEST(JitKernel, vtanh) {
   }
 }
 
-void lstm_ctht_ref(
-    const std::shared_ptr<
-        const paddle::operators::math::jitkernel::VSigmoidKernel<float>>&
-        vsigmoid_3d,
-    const std::shared_ptr<
-        const paddle::operators::math::jitkernel::VTanhKernel<float>>& vtanh_d,
-    const std::shared_ptr<
-        const paddle::operators::math::jitkernel::VExpKernel<float>>& vexp_1,
-    const int d, float* gates, const float* ct_1, float* ct, float* ht) {
-  vsigmoid_3d->Compute(gates + d, gates + d, 3 * d);
-  vtanh_d->Compute(gates, gates, d);
-  const float *i = gates + d, *f = gates + d * 2, *o = gates + d * 3;
-  const float min = SIGMOID_THRESHOLD_MIN;
-  const float max = SIGMOID_THRESHOLD_MAX;
-  for (int k = 0; k < d; ++k) {
-    // C_t = C_t-1 * fgated + cand_gated * igated
-    ct[k] = ct_1[k] * f[k] + gates[k] * i[k];
-    // H_t = act_cell(C_t) * ogated
-    float tmp = ct[k] * 2;
-    tmp = 0.f - ((tmp < min) ? min : ((tmp > max) ? max : tmp));
-    vexp_1->Compute(&tmp, &tmp, 1);
-    tmp = 2.f / (1.f + tmp) - 1.f;
-    ht[k] = tmp * o[k];
-  }
-}
-
 void lstm_ctht_better(
     const std::shared_ptr<
         const paddle::operators::math::jitkernel::VSigmoidKernel<float>>&
@@ -389,6 +330,7 @@ void lstm_ctht_better(
 
 TEST(JitKernel, lstm) {
   namespace jit = paddle::operators::math::jitkernel;
+  namespace refer = paddle::operators::math::jitkernel::refer;
   for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100}) {
     int d4 = d * 4;
     int d3 = d * 3;
@@ -410,8 +352,6 @@ TEST(JitKernel, lstm) {
             d3);
     const auto& vtanh_d =
         jit::KernelPool::Instance().template Get<jit::VTanhKernel<float>>(d);
-    const auto& vexp_1 =
-        jit::KernelPool::Instance().template Get<jit::VExpKernel<float>>(1);
     const auto& vmul_d =
         jit::KernelPool::Instance().template Get<jit::VMulKernel<float>>(d);
     const auto& vadd_d =
@@ -425,8 +365,14 @@ TEST(JitKernel, lstm) {
     float* ct_ref_data = ct_ref.data();
     float* ht_ref_data = ht_ref.data();
     // compute once to check correctness
-    lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data,
-                  ct_ref_data, ht_ref_data);
+    jit::lstm_t step;
+    jit::lstm_attr_t attr(d, act_gate, act_cand, act_cell);
+    step.gates = xref_data;
+    step.ct_1 = ct_1_data;
+    step.ct = ct_ref_data;
+    step.ht = ht_ref_data;
+    refer::LSTMCtHt<float>(&step, &attr);
+
     ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data);
     for (int i = 0; i < d; ++i) {
       EXPECT_NEAR(ct_tgt_data[i], ct_ref_data[i], 1e-3);
@@ -441,8 +387,7 @@ TEST(JitKernel, lstm) {
     auto tmkle = GetCurrentUS();
     auto trefs = GetCurrentUS();
     for (int i = 0; i < repeat; ++i) {
-      lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data,
-                    ct_ref_data, ht_ref_data);
+      refer::LSTMCtHt<float>(&step, &attr);
     }
     auto trefe = GetCurrentUS();
     auto ttgts = GetCurrentUS();
@@ -457,16 +402,6 @@ TEST(JitKernel, lstm) {
   }
 }
 
-void vscal_ref(const int n, const float a, const float* x, float* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = a * x[i];
-  }
-}
-void vscal_inp_ref(const int n, const float a, float* x) {
-  for (int i = 0; i < n; ++i) {
-    x[i] = a * x[i];
-  }
-}
 #if defined __AVX__ || defined __AVX2__
 void vscal_intri8(const int n, const float a, const float* x, float* y) {
   __m256 tmp;
@@ -492,6 +427,7 @@ void vscal_inp_mkl(const int n, const float a, float* x) {
 
 TEST(JitKernel, vscal) {
   namespace jit = paddle::operators::math::jitkernel;
+  namespace refer = paddle::operators::math::jitkernel::refer;
   for (int d : {7, 8, 15, 16, 30, 256, 512}) {
     std::vector<float> x(d), y(d);
     std::vector<float> zref(d), ztgt(d);
@@ -506,12 +442,12 @@ TEST(JitKernel, vscal) {
     float* zref_data = zref.data();
     auto trefs = GetCurrentUS();
     for (int i = 0; i < repeat; ++i) {
-      vscal_ref(d, a, x_data, zref_data);
+      refer::VScal<float>(&a, x_data, zref_data, d);
     }
     auto trefe = GetCurrentUS();
     auto trefs1 = GetCurrentUS();
     for (int i = 0; i < repeat; ++i) {
-      vscal_inp_ref(d, a, y_data);
+      refer::VScal<float>(&a, y_data, y_data, d);
     }
     auto trefe1 = GetCurrentUS();
 
@@ -567,12 +503,6 @@ TEST(JitKernel, vscal) {
   }
 }
 
-void vmul_ref(const int n, const float* x, const float* y, float* z) {
-  for (int i = 0; i < n; ++i) {
-    z[i] = x[i] * y[i];
-  }
-}
-
 #if defined __AVX__ || defined __AVX2__
 void vmul_intri8(const int n, const float* x, const float* y, float* z) {
   __m256 tmpx, tmpy;
@@ -591,6 +521,7 @@ void vmul_mkl(const int n, const float* x, const float* y, float* z) {
 
 TEST(JitKernel, vmul) {
   namespace jit = paddle::operators::math::jitkernel;
+  namespace refer = paddle::operators::math::jitkernel::refer;
   for (int d : {7, 8, 15, 16, 20, 30, 256, 512, 1000, 1024}) {
     std::vector<float> x(d), y(d);
     std::vector<float> zref(d), ztgt(d);
@@ -604,7 +535,7 @@ TEST(JitKernel, vmul) {
     float* zref_data = zref.data();
     auto trefs = GetCurrentUS();
     for (int i = 0; i < repeat; ++i) {
-      vmul_ref(d, x_data, y_data, zref_data);
+      refer::VMul<float>(x_data, y_data, zref_data, d);
     }
     auto trefe = GetCurrentUS();
 
@@ -647,12 +578,6 @@ TEST(JitKernel, vmul) {
   }
 }
 
-void vadd_ref(const int n, const float* x, const float* y, float* z) {
-  for (int i = 0; i < n; ++i) {
-    z[i] = x[i] + y[i];
-  }
-}
-
 #if defined __AVX__ || defined __AVX2__
 void vadd_intri8(const int n, const float* x, const float* y, float* z) {
   __m256 tmpx, tmpy;
@@ -671,6 +596,7 @@ void vadd_mkl(const int n, const float* x, const float* y, float* z) {
 
 TEST(JitKernel, vadd) {
   namespace jit = paddle::operators::math::jitkernel;
+  namespace refer = paddle::operators::math::jitkernel::refer;
   for (int d : {7, 8, 15, 16, 30, 256, 512}) {
     std::vector<float> x(d), y(d);
     std::vector<float> zref(d), ztgt(d);
@@ -684,7 +610,7 @@ TEST(JitKernel, vadd) {
     float* zref_data = zref.data();
     auto trefs = GetCurrentUS();
     for (int i = 0; i < repeat; ++i) {
-      vadd_ref(d, x_data, y_data, zref_data);
+      refer::VAdd<float>(x_data, y_data, zref_data, d);
     }
     auto trefe = GetCurrentUS();
 
@@ -727,12 +653,6 @@ TEST(JitKernel, vadd) {
   }
 }
 
-void vaddrelu_ref(const int n, const float* x, const float* y, float* z) {
-  for (int i = 0; i < n; ++i) {
-    z[i] = x[i] + y[i];
-    z[i] = z[i] > 0 ? z[i] : 0;
-  }
-}
 void vaddrelu_better(
     const std::shared_ptr<
         const paddle::operators::math::jitkernel::VAddKernel<float>>& vadd,
@@ -745,6 +665,7 @@ void vaddrelu_better(
 
 TEST(JitKernel, vaddrelu) {
   namespace jit = paddle::operators::math::jitkernel;
+  namespace refer = paddle::operators::math::jitkernel::refer;
   for (int d : {7, 8, 15, 16, 30, 256, 512}) {
     std::vector<float> x(d), y(d);
     std::vector<float> zref(d), ztgt(d);
@@ -762,7 +683,7 @@ TEST(JitKernel, vaddrelu) {
     float* zref_data = zref.data();
     auto trefs = GetCurrentUS();
     for (int i = 0; i < repeat; ++i) {
-      vaddrelu_ref(d, x_data, y_data, zref_data);
+      refer::VAddRelu<float>(x_data, y_data, zref_data, d);
     }
     auto trefe = GetCurrentUS();
     auto tmkls = GetCurrentUS();

From 703b26e697c0a15a903d2e346d191e033e181073 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Wed, 21 Nov 2018 11:22:34 +0800
Subject: [PATCH 036/113] add profiler, parallel_executor back

---
 paddle/fluid/framework/CMakeLists.txt         |   9 -
 .../fast_threaded_ssa_graph_executor.h        |   2 +-
 .../fluid/memory/allocation/cpu_allocator.h   |   3 +-
 paddle/fluid/platform/CMakeLists.txt          |  12 +-
 paddle/fluid/platform/device_tracer.h         |  12 +-
 paddle/fluid/platform/enforce.h               |   2 +-
 paddle/fluid/platform/port.h                  |  21 +
 paddle/fluid/platform/profiler.cc             |   6 +-
 paddle/fluid/platform/profiler.h              |  10 -
 .../fluid/platform/stream_callback_manager.h  |  13 +-
 paddle/fluid/pybind/CMakeLists.txt            |   5 +-
 paddle/fluid/pybind/pybind.cc                 |   6 -
 python/paddle/fluid/__init__.py               |   3 +-
 python/paddle/fluid/parallel_executor.py      | 497 +++++++++---------
 14 files changed, 293 insertions(+), 308 deletions(-)

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 42af482f85..43e1bc6b2e 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -31,9 +31,7 @@ function(windows_symbolic TARGET)
 endfunction()
 
 add_subdirectory(ir)
-if (NOT WIN32)
 add_subdirectory(details)
-endif (NOT WIN32)
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)
 
@@ -118,13 +116,8 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
 
-if (NOT WIN32)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
     shape_inference data_transform lod_tensor profiler)
-else()
-cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
-    shape_inference data_transform lod_tensor)
-endif(NOT WIN32)
 
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
 
@@ -179,12 +172,10 @@ else()
   cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
 
-if (NOT WIN32)
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
         threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
         graph build_strategy
         fast_threaded_ssa_graph_executor)
-endif() # NOT WIN32
 
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
index 949616f02d..c3a8b85423 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #pragma once
+#include <ThreadPool.h>
 #include <string>
 #include <vector>
-#include "ThreadPool.h"
 #include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h
index 165f11cd3b..26d3643f4e 100644
--- a/paddle/fluid/memory/allocation/cpu_allocator.h
+++ b/paddle/fluid/memory/allocation/cpu_allocator.h
@@ -17,7 +17,8 @@
 
 #ifdef _WIN32
 #define posix_memalign_free _aligned_free
-#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
+#define posix_memalign(p, a, s) \
+  (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
 #endif
 
 namespace paddle {
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 0d0613e1a4..93cb5eb2dc 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -1,4 +1,3 @@
-if (NOT WIN32)
 proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto)
 py_proto_compile(profiler_py_proto SRCS profiler.proto)
 
@@ -6,11 +5,19 @@ add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch _
 
 add_dependencies(profiler_py_proto profiler_py_proto_init)
 
+if (NOT WIN32)
 add_custom_command(TARGET profiler_py_proto POST_BUILD
         COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
         COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
         COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
         WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+else(NOT WIN32)
+string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/")
+add_custom_command(TARGET profiler_py_proto POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
+        COMMAND copy /Y *.py ${proto_dstpath}
+        COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 endif(NOT WIN32)
 
 if(WITH_GPU)
@@ -60,12 +67,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
 
-
-if (NOT WIN32)
 cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
 cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
-endif(NOT WIN32)
 
 nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
 cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h
index f59fc40b71..eaf047d474 100644
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -13,17 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
-#if !defined(_WIN32)
-#include <sys/time.h>
-#else
-#include <windows.h>
-#endif  // !_WIN32
-
-#include <time.h>
 #include <chrono>  // NOLINT
 #include <string>
 
 #include "paddle/fluid/platform/dynload/cupti.h"
+#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.pb.h"
 
 namespace paddle {
@@ -32,15 +26,11 @@ namespace platform {
 ///////////////////////
 // WARN: Under Development. Don't depend on it yet.
 //////////////////////
-#if !defined(_WIN32)
 inline uint64_t PosixInNsec() {
   struct timeval tv;
   gettimeofday(&tv, nullptr);
   return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
 }
-#else
-inline uint64_t PosixInNsec() { return static_cast<uint64_t>(0); }
-#endif  // !_WIN32
 
 // DeviceTracer performs the following tasks:
 // 1. Register cuda callbacks for various events: kernel, memcpy, etc.
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 3643d2ad15..31309738a5 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -134,7 +134,7 @@ struct EOFException : public std::exception {
 #define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1)
 #else
 // there is no equivalent intrinsics in msvc.
-#define LIKELY(condition) !(condition)
+#define LIKELY(condition) (condition)
 #endif
 
 template <typename... Args>
diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
index a07b993c8a..8be77fe464 100644
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -17,6 +17,7 @@
 #include <cstdio>
 #include <stdexcept>
 
+#include <time.h>
 #include <memory>
 #include <string>
 
@@ -27,6 +28,7 @@
 #include <dlfcn.h>     //  dladdr
 #include <execinfo.h>  // backtrace
 #include <sys/stat.h>
+#include <sys/time.h>
 #include <algorithm>  // std::accumulate
 #else
 #include <io.h>  // _popen, _pclose
@@ -57,6 +59,25 @@ static void *dlopen(const char *filename, int flag) {
   return reinterpret_cast<void *>(hModule);
 }
 
+static int gettimeofday(struct timeval *tp, void *tzp) {
+  time_t clock;
+  struct tm tm;
+  SYSTEMTIME wtm;
+
+  GetLocalTime(&wtm);
+  tm.tm_year = wtm.wYear - 1900;
+  tm.tm_mon = wtm.wMonth - 1;
+  tm.tm_mday = wtm.wDay;
+  tm.tm_hour = wtm.wHour;
+  tm.tm_min = wtm.wMinute;
+  tm.tm_sec = wtm.wSecond;
+  tm.tm_isdst = -1;
+  clock = mktime(&tm);
+  tp->tv_sec = clock;
+  tp->tv_usec = wtm.wMilliseconds * 1000;
+
+  return (0);
+}
 #endif  // !_WIN32
 
 static void ExecShellCommand(const std::string &cmd, std::string *message) {
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 56bf9e31a3..03c102e24a 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/fluid/platform/port.h"
 
-#include <sys/time.h>
 #include <algorithm>
 #include <iomanip>
 #include <limits>
@@ -438,10 +438,10 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
             event_items[index].total_time += event_time;
             // min time
             event_items[index].min_time =
-                std::min(event_time, event_items[index].min_time);
+                (std::min)(event_time, event_items[index].min_time);
             // max time
             event_items[index].max_time =
-                std::max(event_time, event_items[index].max_time);
+                (std::max)(event_time, event_items[index].max_time);
           }
 
           // remove the push marker from the list
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index e8eae874af..f5d3490634 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -69,7 +69,6 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
 
 void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
 
-#if !defined(_WIN32)
 struct RecordEvent {
   // dev_ctx can be set to nullptr if device is cpu.
   RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
@@ -106,15 +105,6 @@ struct RecordBlock {
   std::string name_;
   uint64_t start_ns_;
 };
-#else
-// windows do not support profiler temporarily.
-struct RecordEvent {
-  RecordEvent(const std::string& name, const DeviceContext* dev_ctx) {}
-};
-struct RecordBlock {
-  explicit RecordBlock(int block_id) {}
-};
-#endif
 
 // Return the event list of all threads. Assumed the returned value calls
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h
index 0e88a439cf..11c68f3449 100644
--- a/paddle/fluid/platform/stream_callback_manager.h
+++ b/paddle/fluid/platform/stream_callback_manager.h
@@ -45,16 +45,15 @@ class StreamCallbackManager {
   inline void AddCallback(Callback &&callback) const {
     auto *stream_callback_context =
         new StreamCallbackContext(this, std::forward<Callback>(callback));
-    PADDLE_ENFORCE(
 #if CUDA_VERSION >= 10000
-        cudaLaunchHostFunc(stream_, StreamCallbackManager::StreamCallbackFunc,
-                           stream_callback_context)
+    PADDLE_ENFORCE(cudaLaunchHostFunc(stream_,
+                                      StreamCallbackManager::StreamCallbackFunc,
+                                      stream_callback_context));  // NOLINT
 #else
-        cudaStreamAddCallback(stream_,
-                              StreamCallbackManager::StreamCallbackFunc,
-                              stream_callback_context, 0)
+    PADDLE_ENFORCE(cudaStreamAddCallback(
+        stream_, StreamCallbackManager::StreamCallbackFunc,
+        stream_callback_context, 0));  // NOLINT
 #endif
-            );  // NOLINT
   }
 
   void Wait() const { thread_pool_.reset(new ThreadPool(1)); }
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 25e919105c..fb6ee2f4a5 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,9 +1,6 @@
 
-set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder)
+set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder parallel_executor profiler)
 set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc)
-if(NOT WIN32)
-  list(APPEND PYBIND_DEPS parallel_executor profiler)
-endif(NOT WIN32)
 if(WITH_PYTHON)
   if(WITH_AMD_GPU)
     hip_library(paddle_pybind SHARED
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 2f040e1c34..102fa02adf 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -36,9 +36,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
-#ifndef _WIN32
 #include "paddle/fluid/framework/parallel_executor.h"
-#endif
 #include "paddle/fluid/framework/prune.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/selected_rows.h"
@@ -637,7 +635,6 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
 #endif
 
-#ifndef _WIN32
   py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic())
       .value("kDisabled", platform::ProfilerState::kDisabled)
       .value("kCPU", platform::ProfilerState::kCPU)
@@ -658,7 +655,6 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("disable_profiler", platform::DisableProfiler);
   m.def("is_profiler_enabled", platform::IsProfileEnabled);
   m.def("reset_profiler", platform::ResetProfiler);
-#endif
 
   py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
   pass.def(py::init())
@@ -687,7 +683,6 @@ All parameter, weight, gradient are variables in Paddle.
       .def("remove_pass",
            [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });
 
-#ifndef _WIN32
   // -- python binds for parallel executor.
   py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
   py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
@@ -913,7 +908,6 @@ All parameter, weight, gradient are variables in Paddle.
         pybind11::gil_scoped_release release;
         self.Run(fetch_tensors, fetched_var_name);
       });
-#endif
 
   BindRecordIOWriter(&m);
   return m.ptr();
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 6a4a5e098f..543acf2d34 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -47,8 +47,7 @@ from . import profiler
 from . import unique_name
 from . import recordio_writer
 from . import parallel_executor
-if os.name != 'nt':
-    from .parallel_executor import *
+from .parallel_executor import *
 from paddle.fluid.layers.math_op_patch import monkey_patch_variable
 
 Tensor = LoDTensor
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 0d53f53a9e..3f4dd5eb71 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -25,264 +25,263 @@ import os
 
 __all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy']
 
-if os.name != 'nt':
-    ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
-    BuildStrategy = core.ParallelExecutor.BuildStrategy
-
-    class ParallelExecutor(object):
+ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
+BuildStrategy = core.ParallelExecutor.BuildStrategy
+
+
+class ParallelExecutor(object):
+    """
+    ParallelExecutor is designed for data parallelism, which focuses on distributing
+    the data across different nodes and every node operates on the data in parallel.
+    If you use ParallelExecutor to run the current program on GPU, the node means GPU
+    device, and ParallelExecutor will get the available GPU device automatically on
+    the current machine. If you use ParallelExecutor to run the current program on CPU,
+    the node means the CPU device, and you can specify the CPU device number by adding
+    'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable
+    is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number
+    of CPUs in the system.
+
+    Args:
+        use_cuda (bool): Whether to use CUDA or not.
+        loss_name (str): The loss name must set in training. Default None.
+        main_program (Program): The program that need to run, if not provided,
+            then default_main_program will be used. Default None.
+        share_vars_from(ParallelExecutor): If provide, it will share variables
+            from the specified ParallelExecutor. Default None.
+        exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run
+            the program in ParallelExecutor, for example how many threads are used to
+            execute the program, how many iterations to clean up the temp variables
+            which is generated during execution. For more information, please refer
+            to fluid.ExecutionStrategy. Default None.
+        build_strategy(BuildStrategy): build_strategy is used to control how to
+            build the SSA Graph in ParallelExecutor by setting the property,
+            for example reduce_strategy, gradient_scale_strategy. For more information,
+            please refer to fluid.BuildStrategy. Default None.
+        num_trainers(int): If greater than 1, NCCL will be initialized with
+            multiple rank of nodes, each node should have same number of GPUs.
+            Distributed training will be enabled then. Default 1.
+        trainer_id(int): Must use together with num_trainers. trainer_id is the
+            "rank" of current node starts from 0. Default 0.
+        scope(Scope): scope to run with, default use fluid.global_scope().
+
+    Returns:
+        ParallelExecutor: The initialized ParallelExecutor object.
+
+    Raises:
+        TypeError: If share_vars_from is provided, but not ParallelExecutor object.
+
+    Examples:
+        .. code-block:: python
+
+          train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
+          test_exe = fluid.ParallelExecutor(use_cuda=True,
+                                            main_program=test_program,
+                                            share_vars_from=train_exe)
+
+          train_loss, = train_exe.run([loss.name], feed=feed_dict)
+          test_loss, = test_exe.run([loss.name], feed=feed_dict)
+    """
+
+    def __init__(self,
+                 use_cuda,
+                 loss_name=None,
+                 main_program=None,
+                 share_vars_from=None,
+                 exec_strategy=None,
+                 build_strategy=None,
+                 num_trainers=1,
+                 trainer_id=0,
+                 scope=None):
+        self._places = []
+        self._act_places = []
+        if use_cuda:
+            for i in six.moves.range(core.get_cuda_device_count()):
+                p = core.Place()
+                self._act_places.append(core.CUDAPlace(i))
+                p.set_place(self._act_places[-1])
+                self._places.append(p)
+        else:
+            cpu_num = int(
+                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+            for i in six.moves.range(cpu_num):
+                p = core.Place()
+                self._act_places.append(core.CPUPlace())
+                p.set_place(self._act_places[-1])
+                self._places.append(p)
+        assert self._places, "no place for execution"
+
+        if exec_strategy is None:
+            exec_strategy = ExecutionStrategy()
+        exec_strategy.use_cuda = use_cuda
+
+        if exec_strategy.num_threads == 0:
+            if use_cuda:
+                # Experiments on se-resnext shows that too many threads hurt
+                # performance. Worth tunning for other models in the future.
+                exec_strategy.num_threads = len(self._places) * 4
+            else:
+                cpu_num = int(
+                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+                exec_strategy.num_threads = cpu_num * 2
+
+        # Set 1 thread num under nccl2 distribute 
+        #   env to make sure all gpus run ops in same order.
+        if num_trainers > 1:
+            assert (use_cuda)
+            # FIXME(gongwb): avoid this set.
+            exec_strategy.num_threads = 1
+
+        if build_strategy is None:
+            build_strategy = BuildStrategy()
+
+        main = main_program
+        main = main if main else framework.default_main_program()
+        if scope == None:
+            scope = executor.global_scope()
+
+        if share_vars_from and not isinstance(share_vars_from,
+                                              ParallelExecutor):
+            raise TypeError("share_vars_from must be ParallelExecutor.")
+
+        local_scopes = share_vars_from.executor.local_scopes(
+        ) if share_vars_from else []
+
+        self.persistable_vars = [
+            v.name for v in [
+                var for var in main.list_vars()
+                if var.persistable and var.type != core.VarDesc.VarType.RAW
+            ]
+        ]
+
+        self.executor = core.ParallelExecutor(
+            self._places,
+            set([
+                cpt.to_text(p.name)
+                for p in main.global_block().iter_parameters()
+                if not p.stop_gradient
+            ]),
+            set(cpt.to_text(var) for var in self.persistable_vars), main.desc,
+            cpt.to_text(loss_name)
+            if loss_name else six.u(''), scope, local_scopes, exec_strategy,
+            build_strategy, num_trainers, trainer_id)
+        self.scope = scope
+
+    def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
         """
-        ParallelExecutor is designed for data parallelism, which focuses on distributing
-        the data across different nodes and every node operates on the data in parallel.
-        If you use ParallelExecutor to run the current program on GPU, the node means GPU
-        device, and ParallelExecutor will get the available GPU device automatically on
-        the current machine. If you use ParallelExecutor to run the current program on CPU,
-        the node means the CPU device, and you can specify the CPU device number by adding
-        'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable
-        is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number
-        of CPUs in the system.
+        Run a parallel executor with fetch_list.
+
+        The feed parameter can be a dict or a list. If feed is a dict, the
+        feed data will be split into multiple devices. If feed is a list, we
+        assume the data has been splitted into multiple devices, the each
+        element in the list will be copied to each device directly.
+
+        For example, if the feed is a dict:
+
+        >>> exe = ParallelExecutor()
+        >>> # the image will be splitted into devices. If there is two devices
+        >>> # each device will process an image with shape (24, 1, 28, 28)
+        >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
+
+        For example, if the feed is a list:
+
+        >>> exe = ParallelExecutor()
+        >>> # each device will process each element in the list.
+        >>> # the 1st device will process an image with shape (48, 1, 28, 28)
+        >>> # the 2nd device will process an image with shape (32, 1, 28, 28)
+        >>> #
+        >>> # you can use exe.device_count to get the device number.
+        >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))},
+        >>>               {"image": numpy.random.random(size=(32, 1, 28, 28))},
+        >>>              ])
 
         Args:
-            use_cuda (bool): Whether to use CUDA or not.
-            loss_name (str): The loss name must set in training. Default None.
-            main_program (Program): The program that need to run, if not provided,
-                then default_main_program will be used. Default None.
-            share_vars_from(ParallelExecutor): If provide, it will share variables
-                from the specified ParallelExecutor. Default None.
-            exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run
-                the program in ParallelExecutor, for example how many threads are used to
-                execute the program, how many iterations to clean up the temp variables
-                which is generated during execution. For more information, please refer
-                to fluid.ExecutionStrategy. Default None.
-            build_strategy(BuildStrategy): build_strategy is used to control how to
-                build the SSA Graph in ParallelExecutor by setting the property,
-                for example reduce_strategy, gradient_scale_strategy. For more information,
-                please refer to fluid.BuildStrategy. Default None.
-            num_trainers(int): If greater than 1, NCCL will be initialized with
-                multiple rank of nodes, each node should have same number of GPUs.
-                Distributed training will be enabled then. Default 1.
-            trainer_id(int): Must use together with num_trainers. trainer_id is the
-                "rank" of current node starts from 0. Default 0.
-            scope(Scope): scope to run with, default use fluid.global_scope().
+            fetch_list(list): The fetched variable names
+            feed(list|dict|None): The feed variables. If the feed is a dict,
+                tensors in that dict will be splitted into each devices. If
+                the feed is a list, each element of the list will be copied
+                to each device. Default None.
+            feed_dict: Alias for feed parameter, for backward compatibility.
+                This parameter has been deprecated. Default None.
+            return_numpy(bool): Whether converts the fetched tensor to numpy.
+                Default: True.
 
         Returns:
-            ParallelExecutor: The initialized ParallelExecutor object.
+            List: The fetched result list.
 
         Raises:
-            TypeError: If share_vars_from is provided, but not ParallelExecutor object.
+            ValueError: If the feed is a list, but its length is not equal the
+                length of active places, or its element's is not dict.
+
+        NOTES:
+            1. If the feed's type is dict, the number of data that feeds to
+               ParallelExecutor must be bigger than active places. Otherwise,
+               it will throw exception from C++ side. Special attention should be
+               paid to check whether the last batch of the dataset is bigger
+               than active places.
+            2. If active places are more than one, the fetch results for each
+               variable is a list, and each element of this list is the variable of
+               respective active place.
 
         Examples:
             .. code-block:: python
 
-              train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
-              test_exe = fluid.ParallelExecutor(use_cuda=True,
-                                                main_program=test_program,
-                                                share_vars_from=train_exe)
-
-              train_loss, = train_exe.run([loss.name], feed=feed_dict)
-              test_loss, = test_exe.run([loss.name], feed=feed_dict)
+                pe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                            loss_name=avg_cost.name,
+                                            main_program=fluid.default_main_program())
+                loss = pe.run(feed=feeder.feed(cur_batch),
+                              fetch_list=[avg_cost.name]))
         """
-
-        def __init__(self,
-                     use_cuda,
-                     loss_name=None,
-                     main_program=None,
-                     share_vars_from=None,
-                     exec_strategy=None,
-                     build_strategy=None,
-                     num_trainers=1,
-                     trainer_id=0,
-                     scope=None):
-            self._places = []
-            self._act_places = []
-            if use_cuda:
-                for i in six.moves.range(core.get_cuda_device_count()):
-                    p = core.Place()
-                    self._act_places.append(core.CUDAPlace(i))
-                    p.set_place(self._act_places[-1])
-                    self._places.append(p)
-            else:
-                cpu_num = int(
-                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-                for i in six.moves.range(cpu_num):
-                    p = core.Place()
-                    self._act_places.append(core.CPUPlace())
-                    p.set_place(self._act_places[-1])
-                    self._places.append(p)
-            assert self._places, "no place for execution"
-
-            if exec_strategy is None:
-                exec_strategy = ExecutionStrategy()
-            exec_strategy.use_cuda = use_cuda
-
-            if exec_strategy.num_threads == 0:
-                if use_cuda:
-                    # Experiments on se-resnext shows that too many threads hurt
-                    # performance. Worth tunning for other models in the future.
-                    exec_strategy.num_threads = len(self._places) * 4
-                else:
-                    cpu_num = int(
-                        os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-                    exec_strategy.num_threads = cpu_num * 2
-
-            # Set 1 thread num under nccl2 distribute
-            #   env to make sure all gpus run ops in same order.
-            if num_trainers > 1:
-                assert (use_cuda)
-                # FIXME(gongwb): avoid this set.
-                exec_strategy.num_threads = 1
-
-            if build_strategy is None:
-                build_strategy = BuildStrategy()
-
-            main = main_program
-            main = main if main else framework.default_main_program()
-            if scope == None:
-                scope = executor.global_scope()
-
-            if share_vars_from and not isinstance(share_vars_from,
-                                                  ParallelExecutor):
-                raise TypeError("share_vars_from must be ParallelExecutor.")
-
-            local_scopes = share_vars_from.executor.local_scopes(
-            ) if share_vars_from else []
-
-            self.persistable_vars = [
-                v.name for v in [
-                    var for var in main.list_vars()
-                    if var.persistable and var.type != core.VarDesc.VarType.RAW
-                ]
-            ]
-
-            self.executor = core.ParallelExecutor(
-                self._places,
-                set([
-                    cpt.to_text(p.name)
-                    for p in main.global_block().iter_parameters()
-                    if not p.stop_gradient
-                ]),
-                set(cpt.to_text(var)
-                    for var in self.persistable_vars), main.desc,
-                cpt.to_text(loss_name)
-                if loss_name else six.u(''), scope, local_scopes, exec_strategy,
-                build_strategy, num_trainers, trainer_id)
-            self.scope = scope
-
-        def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
-            """
-            Run a parallel executor with fetch_list.
-
-            The feed parameter can be a dict or a list. If feed is a dict, the
-            feed data will be split into multiple devices. If feed is a list, we
-            assume the data has been splitted into multiple devices, the each
-            element in the list will be copied to each device directly.
-
-            For example, if the feed is a dict:
-
-            >>> exe = ParallelExecutor()
-            >>> # the image will be splitted into devices. If there is two devices
-            >>> # each device will process an image with shape (24, 1, 28, 28)
-            >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
-
-            For example, if the feed is a list:
-
-            >>> exe = ParallelExecutor()
-            >>> # each device will process each element in the list.
-            >>> # the 1st device will process an image with shape (48, 1, 28, 28)
-            >>> # the 2nd device will process an image with shape (32, 1, 28, 28)
-            >>> #
-            >>> # you can use exe.device_count to get the device number.
-            >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))},
-            >>>               {"image": numpy.random.random(size=(32, 1, 28, 28))},
-            >>>              ])
-
-            Args:
-                fetch_list(list): The fetched variable names
-                feed(list|dict|None): The feed variables. If the feed is a dict,
-                    tensors in that dict will be splitted into each devices. If
-                    the feed is a list, each element of the list will be copied
-                    to each device. Default None.
-                feed_dict: Alias for feed parameter, for backward compatibility.
-                    This parameter has been deprecated. Default None.
-                return_numpy(bool): Whether converts the fetched tensor to numpy.
-                    Default: True.
-
-            Returns:
-                List: The fetched result list.
-
-            Raises:
-                ValueError: If the feed is a list, but its length is not equal the
-                    length of active places, or its element's is not dict.
-
-            NOTES:
-                1. If the feed's type is dict, the number of data that feeds to
-                   ParallelExecutor must be bigger than active places. Otherwise,
-                   it will throw exception from C++ side. Special attention should be
-                   paid to check whether the last batch of the dataset is bigger
-                   than active places.
-                2. If active places are more than one, the fetch results for each
-                   variable is a list, and each element of this list is the variable of
-                   respective active place.
-
-            Examples:
-                .. code-block:: python
-
-                    pe = fluid.ParallelExecutor(use_cuda=use_cuda,
-                                                loss_name=avg_cost.name,
-                                                main_program=fluid.default_main_program())
-                    loss = pe.run(feed=feeder.feed(cur_batch),
-                                  fetch_list=[avg_cost.name]))
-            """
-            if feed is None and feed_dict is not None:
-                feed = feed_dict
-                print(
-                    "`feed_dict` is deprecated. Please use `feed=`",
-                    file=sys.stderr)
-
-            if isinstance(feed, dict):
-                feed_tensor_dict = dict()
-                for feed_name in feed:
-                    feed_tensor = feed[feed_name]
-                    if not isinstance(feed_tensor, core.LoDTensor):
-                        feed_tensor = core.LoDTensor()
-                        # always set to CPU place, since the tensor need to be splitted
-                        # it is fast in CPU
-                        feed_tensor.set(feed[feed_name], core.CPUPlace())
-                    feed_tensor_dict[feed_name] = feed_tensor
-
-                self.executor.feed_and_split_tensor_into_local_scopes(
-                    feed_tensor_dict)
-            elif isinstance(feed, list) or isinstance(feed, tuple):
-                if len(feed) != len(self._act_places):
-                    raise ValueError(
-                        "Feed a list of tensor, the list should be the same size as places"
-                    )
-
-                res = list()
-
-                for i, each in enumerate(feed):
-                    if not isinstance(each, dict):
-                        raise TypeError(
-                            "Each element of feed list should be a dict")
-                    res_dict = dict()
-                    for feed_name in each:
-                        tensor = each[feed_name]
-                        if not isinstance(tensor, core.LoDTensor):
-                            tmp = core.LoDTensor()
-                            tmp.set(tensor, self._act_places[i])
-                            tensor = tmp
-                        res_dict[feed_name] = tensor
-                    res.append(res_dict)
-                self.executor.feed_tensors_into_local_scopes(res)
-
-            fetch_var_name = '@FETCHED_VAR_NAME@'
-            self.executor.run(fetch_list, fetch_var_name)
-            arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
-
-            if return_numpy:
-                return executor.as_numpy(arr)
-
-            return [arr[i] for i in range(len(arr))]
-
-        @property
-        def device_count(self):
-            return len(self._act_places)
+        if feed is None and feed_dict is not None:
+            feed = feed_dict
+            print(
+                "`feed_dict` is deprecated. Please use `feed=`",
+                file=sys.stderr)
+
+        if isinstance(feed, dict):
+            feed_tensor_dict = dict()
+            for feed_name in feed:
+                feed_tensor = feed[feed_name]
+                if not isinstance(feed_tensor, core.LoDTensor):
+                    feed_tensor = core.LoDTensor()
+                    # always set to CPU place, since the tensor need to be splitted
+                    # it is fast in CPU
+                    feed_tensor.set(feed[feed_name], core.CPUPlace())
+                feed_tensor_dict[feed_name] = feed_tensor
+
+            self.executor.feed_and_split_tensor_into_local_scopes(
+                feed_tensor_dict)
+        elif isinstance(feed, list) or isinstance(feed, tuple):
+            if len(feed) != len(self._act_places):
+                raise ValueError(
+                    "Feed a list of tensor, the list should be the same size as places"
+                )
+
+            res = list()
+
+            for i, each in enumerate(feed):
+                if not isinstance(each, dict):
+                    raise TypeError(
+                        "Each element of feed list should be a dict")
+                res_dict = dict()
+                for feed_name in each:
+                    tensor = each[feed_name]
+                    if not isinstance(tensor, core.LoDTensor):
+                        tmp = core.LoDTensor()
+                        tmp.set(tensor, self._act_places[i])
+                        tensor = tmp
+                    res_dict[feed_name] = tensor
+                res.append(res_dict)
+            self.executor.feed_tensors_into_local_scopes(res)
+
+        fetch_var_name = '@FETCHED_VAR_NAME@'
+        self.executor.run(fetch_list, fetch_var_name)
+        arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
+
+        if return_numpy:
+            return executor.as_numpy(arr)
+
+        return [arr[i] for i in range(len(arr))]
+
+    @property
+    def device_count(self):
+        return len(self._act_places)

From 6e66fadb951fe02218ab2be2916bc12c4b966e00 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Wed, 21 Nov 2018 15:24:23 +0800
Subject: [PATCH 037/113] clean up the pre-definitions on windows

---
 CMakeLists.txt                        | 2 ++
 cmake/operators.cmake                 | 3 +--
 paddle/fluid/framework/eigen.h        | 5 -----
 paddle/fluid/framework/op_registry.h  | 5 -----
 paddle/fluid/framework/operator.cc    | 2 --
 paddle/fluid/framework/operator.h     | 2 --
 paddle/fluid/inference/api/api_impl.h | 6 ------
 paddle/fluid/platform/cpu_helper.cc   | 1 +
 paddle/fluid/platform/dynload/cudnn.h | 2 --
 paddle/fluid/platform/enforce.h       | 6 ------
 paddle/fluid/platform/init.h          | 3 ---
 paddle/fluid/platform/port.h          | 4 ++++
 paddle/fluid/platform/profiler.cc     | 4 ++--
 paddle/fluid/pybind/pybind.cc         | 7 -------
 14 files changed, 10 insertions(+), 42 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 27f2d81dd5..5325e3034c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -137,6 +137,8 @@ if (WIN32)
             "Disable DSO when compiling for Windows" FORCE)
     set(WITH_MKL OFF CACHE STRING
             "Disable MKL when compiling for Windows" FORCE)
+    set(WITH_DISTRIBUTE OFF CACHE STRING
+            "Disable DISTRIBUTE when compiling for Windows" FORCE)
 endif()
 
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 0bc4dbe6cf..17107e0698 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -84,8 +84,7 @@ function(op_library TARGET)
     endif()
     if (WIN32)
     # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
-    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op"
-            "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
+    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op")
         if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
           return()
         endif()
diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h
index 2b265a773f..5bafa4345f 100644
--- a/paddle/fluid/framework/eigen.h
+++ b/paddle/fluid/framework/eigen.h
@@ -13,11 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-// logging.h and windows.h conflict
-#define GLOG_NO_ABBREVIATED_SEVERITIES
-// solve static linking error in windows
-// https://github.com/google/glog/issues/301
-#define GOOGLE_GLOG_DLL_DECL
 
 #include "paddle/fluid/framework/tensor.h"
 #include "unsupported/Eigen/CXX11/Tensor"
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index ef2eb334a4..0e6e74293c 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -23,11 +23,6 @@ limitations under the License. */
 #include <unordered_map>
 #include <unordered_set>
 
-#if defined(_WIN32)
-#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
-#define GOOGLE_GLOG_DLL_DECL
-#endif
-
 #include "glog/logging.h"  // For VLOG()
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/details/op_registry.h"
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 2b35943d09..1ec170b6f6 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define GLOG_NO_ABBREVIATED_SEVERITIES
-#define GOOGLE_GLOG_DLL_DECL
 
 #include <gflags/gflags.h>
 #include <glog/logging.h>
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 6918e030bf..ef83833217 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -20,8 +20,6 @@ limitations under the License. */
 #include <tuple>
 #include <unordered_map>
 #include <vector>
-#define GLOG_NO_ABBREVIATED_SEVERITIES
-#define GOOGLE_GLOG_DLL_DECL
 
 #include "glog/logging.h"  // For VLOG
 #include "paddle/fluid/framework/attribute.h"
diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h
index 4e4ab47ca9..9dfa48d501 100644
--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -14,12 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-// logging.h and windows.h conflict
-#define GLOG_NO_ABBREVIATED_SEVERITIES
-// solve static linking error in windows
-// https://github.com/google/glog/issues/301
-#define GOOGLE_GLOG_DLL_DECL
-
 #include <glog/logging.h>
 #include <map>
 #include <memory>
diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc
index bd6aedb3ac..f2d691b293 100644
--- a/paddle/fluid/platform/cpu_helper.cc
+++ b/paddle/fluid/platform/cpu_helper.cc
@@ -30,6 +30,7 @@ namespace platform {
 void SetNumThreads(int num_threads) {
 #ifdef PADDLE_USE_OPENBLAS
 // windows has no support for openblas multi-thread
+// please refer to: https://github.com/PaddlePaddle/Paddle/issues/7234
 #ifdef _WIN32
   if (num_threads > 1) {
     num_threads = 1;
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 065b940b9c..1a83ac7780 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#define GLOG_NO_ABBREVIATED_SEVERITIES
-#define GOOGLE_GLOG_DLL_DECL
 #include <glog/logging.h>
 
 #include <cudnn.h>
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 31309738a5..a85972bdb7 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -18,12 +18,6 @@ limitations under the License. */
 #include <cxxabi.h>  // for __cxa_demangle
 #endif               // __GNUC__
 
-#if defined(_WIN32)
-#define NOMINMAX  // msvc max/min macro conflict with std::min/max
-#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
-#define GOOGLE_GLOG_DLL_DECL
-#endif
-
 #ifdef PADDLE_WITH_CUDA
 #include <cublas_v2.h>
 #include <cudnn.h>
diff --git a/paddle/fluid/platform/init.h b/paddle/fluid/platform/init.h
index 992ca5e6f6..0e30594672 100644
--- a/paddle/fluid/platform/init.h
+++ b/paddle/fluid/platform/init.h
@@ -16,9 +16,6 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
-#define GLOG_NO_ABBREVIATED_SEVERITIES
-#define GOOGLE_GLOG_DLL_DECL
-
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 
diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
index 8be77fe464..ad070171df 100644
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -31,6 +31,10 @@
 #include <sys/time.h>
 #include <algorithm>  // std::accumulate
 #else
+#define NOMINMAX  // msvc max/min macro conflict with std::min/max
+// solve static linking error in windows
+// https://github.com/google/glog/issues/301
+#define GOOGLE_GLOG_DLL_DECL
 #include <io.h>  // _popen, _pclose
 #include <stdio.h>
 #include <windows.h>
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 03c102e24a..998242fb4a 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -438,10 +438,10 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
             event_items[index].total_time += event_time;
             // min time
             event_items[index].min_time =
-                (std::min)(event_time, event_items[index].min_time);
+                std::min(event_time, event_items[index].min_time);
             // max time
             event_items[index].max_time =
-                (std::max)(event_time, event_items[index].max_time);
+                std::max(event_time, event_items[index].max_time);
           }
 
           // remove the push marker from the list
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 102fa02adf..6cc3a1739a 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -21,13 +21,6 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
-#if defined(_WIN32)
-#define NOMINMAX
-#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
-#define GOOGLE_GLOG_DLL_DECL
-#include <Windows.h>
-#endif
-
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/framework.pb.h"

From c19ff1f3d28b38867de8b98d63f19b8c759c4535 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Wed, 21 Nov 2018 15:37:36 +0800
Subject: [PATCH 038/113] Add python3.6 and python3.7 support in padde build
 scripts

test=develop
---
 paddle/scripts/paddle_build.sh | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 32f9bca645..569e56e5a9 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -116,6 +116,18 @@ function cmake_gen() {
                 export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3
             -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m
             -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so"
+            elif [ "$1" == "cp36-cp36m" ]; then
+                export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH}
+                export PATH=/opt/_internal/cpython-3.6.0/bin/:${PATH}
+                export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.6.0/bin/python3
+            -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.6.0/include/python3.6m
+            -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.6.0/lib/libpython3.so"
+            elif [ "$1" == "cp37-cp37m" ]; then
+                export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH}
+                export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH}
+                export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3
+            -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m
+            -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so"
            fi
         fi
     fi
@@ -419,7 +431,7 @@ function assert_api_not_changed() {
     source .env/bin/activate
     pip install ${PADDLE_ROOT}/build/python/dist/*whl
     python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec
-    if [ "$1" == "cp35-cp35m" ]; then
+    if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then
         # Use sed to make python2 and python3 sepc keeps the same
         sed -i 's/arg0: str/arg0: unicode/g' new.spec
         sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec

From 255cc1eb6540785c8cb786a6c9f291fa53010ca0 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Wed, 21 Nov 2018 15:43:17 +0800
Subject: [PATCH 039/113] Add support for Mac build

test=develop
---
 paddle/scripts/paddle_build.sh | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 569e56e5a9..9632eaec00 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -94,6 +94,30 @@ function cmake_gen() {
             else
                 exit 1
             fi
+        elif [ "$1" == "cp36-cp36m" ]; then
+            if [ -d "/Library/Frameworks/Python.framework/Versions/3.6" ]; then
+                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/
+                export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/
+                export PATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/:${PATH}
+                PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3
+            -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/
+            -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib"
+                WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
+            else
+                exit 1
+            fi
+        elif [ "$1" == "cp37-cp37m" ]; then
+            if [ -d "/Library/Frameworks/Python.framework/Versions/3.7" ]; then
+                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/
+                export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/
+                export PATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/:${PATH}
+                PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3
+            -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/
+            -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib"
+                WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
+            else
+                exit 1
+            fi
         fi
     else
         if [ "$1" != "" ]; then

From f913860873781ff4ccc9ee2eba73365d530fae22 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Wed, 21 Nov 2018 08:47:12 +0000
Subject: [PATCH 040/113] jitkernel lstm refer support peephole

test=develop
---
 .../fluid/operators/fused/fusion_lstm_op.cc   |  73 +++--
 paddle/fluid/operators/math/jit_code.cc       |   6 +-
 paddle/fluid/operators/math/jit_code.h        |  42 ++-
 paddle/fluid/operators/math/jit_kernel.h      |  15 +-
 paddle/fluid/operators/math/jit_kernel_impl.h |  14 +-
 .../fluid/operators/math/jit_kernel_macro.h   |   8 +-
 .../fluid/operators/math/jit_kernel_refer.h   |  35 ++-
 paddle/fluid/operators/math/jit_kernel_rnn.cc | 288 +++++++-----------
 .../fluid/operators/math/jit_kernel_test.cc   |  32 +-
 9 files changed, 250 insertions(+), 263 deletions(-)

diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc
index 0959539068..8021a896ce 100644
--- a/paddle/fluid/operators/fused/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc
@@ -236,27 +236,31 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
   const int D = wh_dims[0];                                 \
   const int D4 = wh_dims[1]
 
-#define INIT_OTHER_DEFINES                                                  \
-  const T* x_data = x->data<T>();                                           \
-  const T* wx_data = wx->data<T>();                                         \
-  const T* wh_data = wh->data<T>();                                         \
-  /* diagonal weight*/                                                      \
-  const T* wp_data = bias->data<T>() + D4;                                  \
-  /* for peephole only*/                                                    \
-  T* checked_cell_data = nullptr;                                           \
-  auto place = ctx.GetPlace();                                              \
-  if (use_peepholes) {                                                      \
-    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/                        \
-    auto* checked_cell = ctx.Output<Tensor>("CheckedCell");                 \
-    checked_cell_data = checked_cell->mutable_data<T>(place);               \
-  }                                                                         \
-  const auto& ker =                                                         \
-      math::jitkernel::KernelPool::Instance()                               \
-          .template Get<math::jitkernel::LSTMKernel<T>, const std::string&, \
-                        const std::string&, const std::string&>(            \
-              ctx.Attr<std::string>("gate_activation"),                     \
-              ctx.Attr<std::string>("candidate_activation"),                \
-              ctx.Attr<std::string>("cell_activation"), D, use_peepholes)
+#define INIT_OTHER_DEFINES                                      \
+  const T* x_data = x->data<T>();                               \
+  const T* wx_data = wx->data<T>();                             \
+  const T* wh_data = wh->data<T>();                             \
+  /* diagonal weight*/                                          \
+  const T* wp_data = bias->data<T>() + D4;                      \
+  /* for peephole only*/                                        \
+  T* checked_cell_data = nullptr;                               \
+  auto place = ctx.GetPlace();                                  \
+  if (use_peepholes) {                                          \
+    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/            \
+    auto* checked_cell = ctx.Output<Tensor>("CheckedCell");     \
+    checked_cell_data = checked_cell->mutable_data<T>(place);   \
+  }                                                             \
+  const math::jitkernel::lstm_attr_t attr(                      \
+      D, ctx.Attr<std::string>("gate_activation"),              \
+      ctx.Attr<std::string>("candidate_activation"),            \
+      ctx.Attr<std::string>("cell_activation"), use_peepholes); \
+  math::jitkernel::lstm_t one_step;                             \
+  one_step.wp = wp_data;                                        \
+  one_step.checked = checked_cell_data;                         \
+  const auto& ker =                                             \
+      math::jitkernel::KernelPool::Instance()                   \
+          .template Get<math::jitkernel::LSTMKernel<T>,         \
+                        const math::jitkernel::lstm_attr_t&>(attr)
 
 // Wh GEMM
 #define GEMM_WH_ADDON(bs, prev, out)                                           \
@@ -299,7 +303,10 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
         prev_h_data = h0_data + bid * D;
         prev_c_data = c0_data + bid * D;
       } else {
-        ker->ComputeC1H1(xx_data, c_out_data, h_out_data, wp_data);
+        one_step.gates = xx_data;
+        one_step.ct = c_out_data;
+        one_step.ht = h_out_data;
+        ker->ComputeC1H1(&one_step, &attr);
         tstart = 1;
         // move one step
         prev_h_data = h_out_data;
@@ -310,8 +317,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
       }
       for (int step = tstart; step < seq_len; ++step) {
         GEMM_WH_ADDON(1, prev_h_data, xx_data);
-        ker->ComputeCtHt(xx_data, prev_c_data, c_out_data, h_out_data, wp_data,
-                         checked_cell_data);
+
+        one_step.gates = xx_data;
+        one_step.ct_1 = prev_c_data;
+        one_step.ct = c_out_data;
+        one_step.ht = h_out_data;
+        ker->ComputeCtHt(&one_step, &attr);
         // move one step
         prev_h_data = h_out_data;
         prev_c_data = c_out_data;
@@ -388,7 +399,11 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
       T* cur_h_out_data = batched_h_out_data;
       T* cur_c_out_data = batched_c_out_data;
       for (int i = 0; i < max_bs; ++i) {
-        ker->ComputeC1H1(cur_in_data, cur_c_out_data, cur_h_out_data, wp_data);
+        one_step.gates = cur_in_data;
+        one_step.ct = cur_c_out_data;
+        one_step.ht = cur_h_out_data;
+        ker->ComputeC1H1(&one_step, &attr);
+
         cur_in_data += D4;
         cur_c_out_data += D;
         cur_h_out_data += D;
@@ -413,8 +428,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
       T* cur_c_out_data = batched_c_out_data;
       T* cur_h_out_data = batched_h_out_data;
       for (int i = 0; i < cur_bs; ++i) {
-        ker->ComputeCtHt(cur_in_data, cur_prev_c_data, cur_c_out_data,
-                         cur_h_out_data, wp_data, checked_cell_data);
+        one_step.gates = cur_in_data;
+        one_step.ct_1 = cur_prev_c_data;
+        one_step.ct = cur_c_out_data;
+        one_step.ht = cur_h_out_data;
+        ker->ComputeCtHt(&one_step, &attr);
+
         // move one batch
         cur_in_data += D4;
         cur_prev_c_data += D;
diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc
index 418c843362..ccc9206f5c 100644
--- a/paddle/fluid/operators/math/jit_code.cc
+++ b/paddle/fluid/operators/math/jit_code.cc
@@ -233,7 +233,7 @@ void LSTMJitCode::generate() {
     vmovups(ymm_src, ptr[reg_ptr_gates + offset + num_]);
     act<ymm_t>(ymm_i, ymm_src, act_gate_);
     vmulps(ymm_c, ymm_c, ymm_i);
-    if (first_) {
+    if (!compute_c1h1_) {
       // f
       vmovups(ymm_src, ptr[reg_ptr_gates + offset + 2 * num_]);
       act<ymm_t>(ymm_f, ymm_src, act_gate_);
@@ -242,8 +242,8 @@ void LSTMJitCode::generate() {
       vaddps(ymm_f, ymm_f, ymm_c);
     }
     /* H_t = act_cell(C_t) * ogated */
-    ymm_t ymm_ct = first_ ? ymm_c : ymm_f;
-    ymm_t ymm_o = first_ ? ymm_f : ymm_c;
+    ymm_t ymm_ct = compute_c1h1_ ? ymm_c : ymm_f;
+    ymm_t ymm_o = compute_c1h1_ ? ymm_f : ymm_c;
     ymm_t ymm_tmp = ymm_i;
     act<ymm_t>(ymm_tmp, ymm_ct, act_cell_);
     vmovups(ymm_src, ptr[reg_ptr_gates + offset + 3 * num_]);
diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h
index 9782f5414c..bf28d444b7 100644
--- a/paddle/fluid/operators/math/jit_code.h
+++ b/paddle/fluid/operators/math/jit_code.h
@@ -319,6 +319,12 @@ class LSTMJitCode : public VActJitCode {
  public:
   const char* name() const override {
     std::string base = "LSTMJitCode";
+    if (use_peephole_) {
+      base += "_Peephole";
+    }
+    if (compute_c1h1_) {
+      base += "_C1H1";
+    }
     auto AddTypeStr = [&](operand_type type) {
       switch (type) {
         case operand_type::relu:
@@ -340,30 +346,42 @@ class LSTMJitCode : public VActJitCode {
           break;
       }
     };
-    if (first_) {
-      base += "_C1H1";
-    }
     AddTypeStr(act_gate_);
     AddTypeStr(act_cand_);
     AddTypeStr(act_cell_);
     return base.c_str();
   }
 
-  explicit LSTMJitCode(int d, bool first, operand_type act_gate,
-                       operand_type act_cand, operand_type act_cell,
+  explicit LSTMJitCode(bool compute_c1h1, const lstm_attr_t& attr,
                        size_t code_size = 256 * 1024, void* code_ptr = nullptr)
-      : VActJitCode(d, act_gate, code_size, code_ptr),
-        num_(d),
-        first_(first),
-        act_gate_(act_gate),
-        act_cand_(act_cand),
-        act_cell_(act_cell) {}
+      : VActJitCode(attr.d, operand_type::sigmoid /* this is bugy*/, code_size,
+                    code_ptr),
+        compute_c1h1_(compute_c1h1) {
+    auto typeExchange = [](const std::string& type) -> gen::operand_type {
+      if (type == "sigmoid") {
+        return operand_type::sigmoid;
+      } else if (type == "relu") {
+        return operand_type::relu;
+      } else if (type == "tanh") {
+        return operand_type::tanh;
+      } else if (type == "identity" || type == "") {
+        return operand_type::identity;
+      }  // else throw error
+      return operand_type::identity;
+    };
+    num_ = attr.d;
+    use_peephole_ = attr.use_peephole;
+    act_gate_ = typeExchange(attr.act_gate);
+    act_cand_ = typeExchange(attr.act_cand);
+    act_cell_ = typeExchange(attr.act_cell);
+  }
   static bool init(int d);
   void generate() override;
 
  protected:
   int num_;
-  bool first_;
+  bool compute_c1h1_;
+  bool use_peephole_;
   operand_type act_gate_;
   operand_type act_cand_;
   operand_type act_cell_;
diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h
index 36199eddaf..bb5ba5813a 100644
--- a/paddle/fluid/operators/math/jit_kernel.h
+++ b/paddle/fluid/operators/math/jit_kernel.h
@@ -122,18 +122,9 @@ class VTanhKernel : public VActKernel<T> {};
 template <typename T>
 class LSTMKernel : public Kernel {
  public:
-  virtual void ComputeCtHt(T *gates, const T *ct_1, T *ct, T *ht,
-                           /* below only used in peephole*/
-                           const T *wp_data = nullptr,
-                           T *checked = nullptr) const = 0;
-
-  virtual void ComputeC1H1(T *gates, T *ct, T *ht,
-                           /* below only used in peephole*/
-                           const T *wp_data = nullptr) const = 0;
-
-  //  void (*ComputeCtHt)(lstm_t *);
-  // // compute c1 and h1 without c0 or h0
-  //  void (*ComputeC1H1)(lstm_t *);
+  void (*ComputeCtHt)(lstm_t *, const lstm_attr_t *);
+  // compute c1 and h1 without c0 or h0
+  void (*ComputeC1H1)(lstm_t *, const lstm_attr_t *);
 };
 
 template <typename T>
diff --git a/paddle/fluid/operators/math/jit_kernel_impl.h b/paddle/fluid/operators/math/jit_kernel_impl.h
index 337d5ae914..2e734ca940 100644
--- a/paddle/fluid/operators/math/jit_kernel_impl.h
+++ b/paddle/fluid/operators/math/jit_kernel_impl.h
@@ -33,18 +33,24 @@ typedef struct {
   const void* ct_1;
   void* ct;
   void* ht;
-  /* below only used in peephole*/
-  const void* wp_data{nullptr};
+  /* weight_peephole and checked data are only used in peephole*/
+  const void* wp{nullptr};
   void* checked{nullptr};
 } lstm_t;
 
 typedef struct lstm_attr_s {
+  bool use_peephole;
   int d;
   std::string act_gate, act_cand, act_cell;
   lstm_attr_s() = default;
   lstm_attr_s(int _d, const std::string& _act_gate,
-              const std::string& _act_cand, const std::string& _act_cell)
-      : d(_d), act_gate(_act_gate), act_cand(_act_cand), act_cell(_act_cell) {}
+              const std::string& _act_cand, const std::string& _act_cell,
+              bool _use_peephole = false)
+      : use_peephole(_use_peephole),
+        d(_d),
+        act_gate(_act_gate),
+        act_cand(_act_cand),
+        act_cell(_act_cell) {}
 } lstm_attr_t;
 
 }  // namespace jitkernel
diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h
index 8acf60cfbf..5a3efd979f 100644
--- a/paddle/fluid/operators/math/jit_kernel_macro.h
+++ b/paddle/fluid/operators/math/jit_kernel_macro.h
@@ -82,10 +82,10 @@ namespace jitkernel {
 #define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_define_name,     \
                                 marco_declare, macro_find_key, macro_impl) \
   marco_define_name(ker_key, ker_class);                                   \
-  REGISTER_JITKERNEL_WITH_DTYPE(ker_class, float, JITKERNEL_DECLARE,       \
-                                JITKERNEL_FIND_KEY, JITKERNEL_IMPL);       \
-  REGISTER_JITKERNEL_WITH_DTYPE(ker_class, double, JITKERNEL_DECLARE,      \
-                                JITKERNEL_FIND_KEY, JITKERNEL_IMPL)
+  REGISTER_JITKERNEL_WITH_DTYPE(ker_class, float, marco_declare,           \
+                                macro_find_key, macro_impl);               \
+  REGISTER_JITKERNEL_WITH_DTYPE(ker_class, double, marco_declare,          \
+                                macro_find_key, macro_impl)
 
 #define REGISTER_JITKERNEL(ker_key, ker_class)                       \
   REGISTER_JITKERNEL_ARGS(ker_key, ker_class, JITKERNEL_DEFINE_NAME, \
diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h
index 9c60ebc587..097bb85956 100644
--- a/paddle/fluid/operators/math/jit_kernel_refer.h
+++ b/paddle/fluid/operators/math/jit_kernel_refer.h
@@ -117,11 +117,13 @@ void (*getActFunc(const std::string& type))(const T*, T*, int) {  // NOLINT
 }
 
 template <typename T>
-void LSTMCtHt(lstm_t* step, lstm_attr_t* attr) {
+void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) {
   T* gates = reinterpret_cast<T*>(step->gates);
   const T* ct_1 = reinterpret_cast<const T*>(step->ct_1);
   T* ct = reinterpret_cast<T*>(step->ct);
   T* ht = reinterpret_cast<T*>(step->ht);
+  const T* wp = reinterpret_cast<const T*>(step->wp);
+  T* checked = reinterpret_cast<T*>(step->checked);
   auto act_gate = getActFunc<T>(attr->act_gate);
   auto act_cand = getActFunc<T>(attr->act_cand);
   auto act_cell = getActFunc<T>(attr->act_cell);
@@ -129,23 +131,36 @@ void LSTMCtHt(lstm_t* step, lstm_attr_t* attr) {
   int d2 = d * 2;
   int d3 = d * 3;
   // gates: W_ch, W_ih, W_fh, W_oh
-  act_gate(gates + d, gates + d, d3);
+  if (attr->use_peephole) {
+    VMul(wp, ct_1, checked, d);
+    VMul(wp + d, ct_1, checked + d, d);
+    VAdd(checked, gates + d, gates + d, d2);
+    act_gate(gates + d, gates + d, d2);
+  } else {
+    act_gate(gates + d, gates + d, d3);
+  }
 
-  /* C_t = C_t-1 * fgated + cand_gated * igated */
+  // C_t = C_t-1 * fgated + cand_gated * igated
   act_cand(gates, gates, d);
   VMul(gates, gates + d, gates + d, d);
   VMul(ct_1, gates + d2, gates + d2, d);
   VAdd(gates + d, gates + d2, ct, d);
 
-  /* H_t = act_cell(C_t) * ogated */
+  if (attr->use_peephole) {
+    // get ogated
+    VMul(wp + d2, ct, gates + d, d);
+    VAdd(gates + d, gates + d3, gates + d3, d);
+    act_gate(gates + d3, gates + d3, d);
+  }
+  // H_t = act_cell(C_t) * ogated
   act_cell(ct, gates + d2, d);
   VMul(gates + d2, gates + d3, ht, d);
 }
 
+// compute c1 and h1 without c0 or h0
 template <typename T>
-void LSTMC1H1(lstm_t* step, lstm_attr_t* attr) {
+void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) {
   T* gates = reinterpret_cast<T*>(step->gates);
-  const T* ct_1 = reinterpret_cast<const T*>(step->ct_1);
   T* ct = reinterpret_cast<T*>(step->ct);
   T* ht = reinterpret_cast<T*>(step->ht);
   auto act_gate = getActFunc<T>(attr->act_gate);
@@ -158,10 +173,16 @@ void LSTMC1H1(lstm_t* step, lstm_attr_t* attr) {
   act_gate(gates + d, gates + d, d);
   act_cand(gates, gates, d);
   VMul(gates, gates + d, ct, d);
+  if (attr->use_peephole) {
+    // get outgated, put W_oc * C_t on igated
+    const T* wp = reinterpret_cast<const T*>(step->wp);
+    VMul(wp + d2, ct, gates + d, d);
+    VAdd(gates + d, gates + d3, gates + d3, d);
+  }
   /* H_t = act_cell(C_t) * ogated */
   act_gate(gates + d3, gates + d3, d);
   act_cell(ct, gates + d2, d);
-  Vmul(gates + d2, gates + d3, ht, d);
+  VMul(gates + d2, gates + d3, ht, d);
 }
 
 }  // namespace refer
diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc
index e79b0400ab..6b7463aa52 100644
--- a/paddle/fluid/operators/math/jit_kernel_rnn.cc
+++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc
@@ -15,9 +15,14 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/jit_kernel.h"
 #include <string>
 #include "paddle/fluid/operators/math/jit_kernel_macro.h"
+#include "paddle/fluid/operators/math/jit_kernel_refer.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
 
+#ifdef PADDLE_WITH_XBYAK
+#include "paddle/fluid/operators/math/jit_code.h"
+#endif
+
 #ifdef __AVX__
 #include <immintrin.h>
 #endif
@@ -154,211 +159,136 @@ static std::unique_ptr<AVXAct> GetAVXAct(const std::string& type) {
 #endif
 
 /* LSTM JitKernel */
-template <typename T, jit::cpu_isa_t isa, jit_block>
+template <typename T>
 class LSTMKernelImpl : public LSTMKernel<T> {
  public:
-  explicit LSTMKernelImpl(const std::string& act_gate,
-                          const std::string& act_cand,
-                          const std::string& act_cell, int d)
-      : LSTMKernel<T>() {
-    d_ = d;
-    d2_ = d * 2;
-    d3_ = d * 3;
-    act_gate_d3_ = GetActKernel<T>(act_gate, d3_);
-    act_gate_d_ = GetActKernel<T>(act_gate, d);
-    act_cand_d_ = GetActKernel<T>(act_cand, d);
-    act_cell_d_ = GetActKernel<T>(act_cell, d);
-    vmul_d_ = KernelPool::Instance().template Get<VMulKernel<T>>(d);
-    vadd_d_ = KernelPool::Instance().template Get<VAddKernel<T>>(d);
+  static inline std::string name(const lstm_attr_t& attr) {
+    PADDLE_THROW("DType should be either float or double");
   }
+  static inline bool useJIT(int d) { return false; }
+  static inline bool useMKL(int d) { return false; }
+  explicit LSTMKernelImpl(const lstm_attr_t& attr) : LSTMKernel<T>() {
+#ifdef PADDLE_WITH_XBYAK
+    if (useJIT(attr.d)) {
+      size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 84 * 8;  // should change
+      jitcode0_.reset(new gen::LSTMJitCode(false, attr, sz > 4096 ? sz : 4096));
+      this->ComputeCtHt =
+          jitcode0_->getCode<void (*)(lstm_t*, const lstm_attr_t*)>();
+
+      jitcode1_.reset(new gen::LSTMJitCode(true, attr, sz > 4096 ? sz : 4096));
+      this->ComputeC1H1 =
+          jitcode1_->getCode<void (*)(lstm_t*, const lstm_attr_t*)>();
+      return;
+    }
+#endif
 
-  void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data,
-                   T* checked) const override {
-    // gates: W_ch, W_ih, W_fh, W_oh
-    act_gate_d3_->Compute(gates + d_, gates + d_, d3_);
-
-    /* C_t = C_t-1 * fgated + cand_gated * igated */
-    act_cand_d_->Compute(gates, gates, d_);
-    vmul_d_->Compute(gates, gates + d_, gates + d_, d_);
-    vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_);
-    vadd_d_->Compute(gates + d_, gates + d2_, ct, d_);
-
-    /* H_t = act_cell(C_t) * ogated */
-    act_cell_d_->Compute(ct, gates + d2_, d_);
-    vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_);
-  }
-  void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override {
-    /* C_t = igated * cgated*/
-    act_gate_d_->Compute(gates + d_, gates + d_, d_);
-    act_cand_d_->Compute(gates, gates, d_);
-    vmul_d_->Compute(gates, gates + d_, ct, d_);
-    /* H_t = act_cell(C_t) * ogated */
-    act_gate_d_->Compute(gates + d3_, gates + d3_, d_);
-    act_cell_d_->Compute(ct, gates + d2_, d_);
-    vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_);
+    this->ComputeCtHt = refer::LSTMCtHt<T>;
+    this->ComputeC1H1 = refer::LSTMC1H1<T>;
   }
 
+#ifdef PADDLE_WITH_XBYAK
+
  private:
-  int d_, d2_, d3_;
-  std::shared_ptr<const VActKernel<T>> act_gate_d3_, act_gate_d_, act_cand_d_,
-      act_cell_d_;
-  std::shared_ptr<const VMulKernel<T>> vmul_d_;
-  std::shared_ptr<const VAddKernel<T>> vadd_d_;
-#ifdef __AVX__
-  std::unique_ptr<const AVXAct> avx_act_gate_, avx_act_cand_, avx_act_cell_;
+  std::unique_ptr<gen::LSTMJitCode> jitcode0_{nullptr}, jitcode1_{nullptr};
 #endif
 };
 
-#define INTRI8_FLOAT(isa)                                                    \
-  template <>                                                                \
-  LSTMKernelImpl<float, isa, kEQ8>::LSTMKernelImpl(                          \
-      const std::string& act_gate, const std::string& act_cand,              \
-      const std::string& act_cell, int d)                                    \
-      : LSTMKernel<float>() {                                                \
-    avx_act_gate_ = GetAVXAct<isa>(act_gate);                                \
-    avx_act_cand_ = GetAVXAct<isa>(act_cand);                                \
-    avx_act_cell_ = GetAVXAct<isa>(act_cell);                                \
-  }                                                                          \
-  template <>                                                                \
-  void LSTMKernelImpl<float, isa, kEQ8>::ComputeCtHt(                        \
-      float* gates, const float* ct_1, float* ct, float* ht,                 \
-      const float* wp_data, float* checked) const {                          \
-    /* gates: W_ch, W_ih, W_fh, W_oh */                                      \
-    __m256 c, i, f, o;                                                       \
-    c = _mm256_loadu_ps(gates);                                              \
-    i = _mm256_loadu_ps(gates + 8);                                          \
-    f = _mm256_loadu_ps(gates + 16);                                         \
-    o = _mm256_loadu_ps(gates + 24);                                         \
-    /* C_t = C_t-1 * fgated + cand_gated * igated*/                          \
-    c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \
-    i = _mm256_loadu_ps(ct_1);                                               \
-    f = _mm256_mul_ps(i, avx_act_gate_->Compute(f));                         \
-    f = _mm256_add_ps(c, f);                                                 \
-    _mm256_storeu_ps(ct, f);                                                 \
-    /* H_t = act_cell(C_t) * ogated */                                       \
-    o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \
-    _mm256_storeu_ps(ht, o);                                                 \
-  }                                                                          \
-  template <>                                                                \
-  void LSTMKernelImpl<float, isa, kEQ8>::ComputeC1H1(                        \
-      float* gates, float* ct, float* ht, const float* wp_data) const {      \
-    __m256 c, i, o;                                                          \
-    c = _mm256_loadu_ps(gates);                                              \
-    i = _mm256_loadu_ps(gates + 8);                                          \
-    o = _mm256_loadu_ps(gates + 24);                                         \
-    /* C_t = igated * cgated*/                                               \
-    c = _mm256_mul_ps(avx_act_gate_->Compute(i), avx_act_cand_->Compute(c)); \
-    _mm256_storeu_ps(ct, c);                                                 \
-    /* H_t = act_cell(C_t) * ogated */                                       \
-    o = _mm256_mul_ps(avx_act_cell_->Compute(c), avx_act_gate_->Compute(o)); \
-    _mm256_storeu_ps(ht, o);                                                 \
-  }
-
-// TODO(TJ): optimize keq16
-
-#ifdef __AVX__
-INTRI8_FLOAT(jit::avx);
-#endif
-#ifdef __AVX2__
-INTRI8_FLOAT(jit::avx2);
-#endif
-#ifdef __AVX512F__
-INTRI8_FLOAT(jit::avx512f);
+#ifdef PADDLE_WITH_XBYAK
+template <>
+bool LSTMKernelImpl<float>::useJIT(int d) {
+  return false;  // not ready yet gen::LSTMJitCode::init(d);
+}
 #endif
 
 /* Peephole JitKernel */
-template <typename T, jit::cpu_isa_t isa, jit_block>
+template <typename T>
 class PeepholeKernelImpl : public LSTMKernel<T> {
  public:
-  explicit PeepholeKernelImpl(const std::string& act_gate,
-                              const std::string& act_cand,
-                              const std::string& act_cell, int d)
-      : LSTMKernel<T>() {
-    d_ = d;
-    d2_ = d * 2;
-    d3_ = d * 3;
-    act_gate_d_ = GetActKernel<T>(act_gate, d);
-    act_cand_d_ = GetActKernel<T>(act_cand, d);
-    act_cell_d_ = GetActKernel<T>(act_cell, d);
-    vmul_d_ = KernelPool::Instance().template Get<VMulKernel<T>>(d);
-    vadd_d_ = KernelPool::Instance().template Get<VAddKernel<T>>(d);
-    vadd_d2_ = KernelPool::Instance().template Get<VAddKernel<T>>(d2_);
-    act_gate_d2_ = GetActKernel<T>(act_gate, d2_);
+  static inline std::string name(const lstm_attr_t& attr) {
+    PADDLE_THROW("DType should be either float or double");
   }
+  static inline bool useJIT(int d) { return false; }
+  static inline bool useMKL(int d) { return false; }
+  explicit PeepholeKernelImpl(const lstm_attr_t& attr) : LSTMKernel<T>() {
+#ifdef PADDLE_WITH_XBYAK
+    if (useJIT(attr.d)) {
+      size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 84 * 8;  // should change
+      jitcode0_.reset(new gen::LSTMJitCode(false, attr, sz > 4096 ? sz : 4096));
+      this->ComputeCtHt =
+          jitcode0_->getCode<void (*)(lstm_t*, const lstm_attr_t*)>();
+
+      jitcode1_.reset(new gen::LSTMJitCode(true, attr, sz > 4096 ? sz : 4096));
+      this->ComputeC1H1 =
+          jitcode1_->getCode<void (*)(lstm_t*, const lstm_attr_t*)>();
+      return;
+    }
+#endif
 
-  void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data,
-                   T* checked) const override {
-    /* get fgated and igated*/
-    vmul_d_->Compute(wp_data, ct_1, checked, d_);
-    vmul_d_->Compute(wp_data + d_, ct_1, checked + d_, d_);
-    vadd_d2_->Compute(checked, gates + d_, gates + d_, d2_);
-    act_gate_d2_->Compute(gates + d_, gates + d_, d2_);
-    /* C_t = C_t-1 * fgated + cand_gated * igated*/
-    act_cand_d_->Compute(gates, gates, d_);
-    vmul_d_->Compute(gates, gates + d_, gates + d_, d_);
-    vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_);
-    vadd_d_->Compute(gates + d_, gates + d2_, ct, d_);
-    /* get ogated*/
-    vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_);
-    vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_);
-    act_gate_d_->Compute(gates + d3_, gates + d3_, d_);
-    /* H_t = act_cell(C_t) * ogated */
-    act_cell_d_->Compute(ct, gates + d2_, d_);
-    vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_);
+    this->ComputeCtHt = refer::LSTMCtHt<T>;
+    this->ComputeC1H1 = refer::LSTMC1H1<T>;
   }
 
-  void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override {
-    /* C_t = igated * cgated*/
-    act_gate_d_->Compute(gates + d_, gates + d_, d_);
-    act_cand_d_->Compute(gates, gates, d_);
-    vmul_d_->Compute(gates, gates + d_, ct, d_);
-    /* get outgated, put W_oc * C_t on igated */
-    vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_);
-    vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_);
-    /* H_t = act_cell(C_t) * ogated */
-    act_gate_d_->Compute(gates + d3_, gates + d3_, d_);
-    act_cell_d_->Compute(ct, gates + d2_, d_);
-    vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_);
-  }
+#ifdef PADDLE_WITH_XBYAK
 
  private:
-  int d_, d2_, d3_;
-  std::shared_ptr<const VActKernel<T>> act_gate_d2_, act_gate_d_, act_cand_d_,
-      act_cell_d_;
-  std::shared_ptr<const VMulKernel<T>> vmul_d_;
-  std::shared_ptr<const VAddKernel<T>> vadd_d_, vadd_d2_;
+  std::unique_ptr<gen::LSTMJitCode> jitcode0_{nullptr}, jitcode1_{nullptr};
+#endif
 };
 
-#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype)                  \
-  template <>                                                         \
-  std::shared_ptr<const LSTMKernel<ker_dtype>>                        \
-  KernelPool::Get<LSTMKernel<ker_dtype>, const std::string&,          \
-                  const std::string&, const std::string&, int, bool>( \
-      const std::string& act_gate, const std::string& act_cand,       \
-      const std::string& act_cell, int d, bool use_peephole)
-
-#define JITKERNEL_KEY_LSTM(ker_key, dtype_key)                               \
-  #ker_key #dtype_key + std::to_string(d) + act_gate + act_cand + act_cell + \
-                                       (use_peephole ? "p" : "n")
-
-#define JITKERNEL_NEW_LSTM_IMPL(ker, dtype, isa, k)                    \
-  if (use_peephole) {                                                  \
-    p = std::dynamic_pointer_cast<ker<dtype>>(                         \
-        std::make_shared<PeepholeKernelImpl<dtype, isa, k>>(           \
-            act_gate, act_cand, act_cell, d));                         \
-  } else {                                                             \
-    p = std::dynamic_pointer_cast<ker<dtype>>(                         \
-        std::make_shared<ker##Impl<dtype, isa, k>>(act_gate, act_cand, \
-                                                   act_cell, d));      \
+#ifdef PADDLE_WITH_XBYAK
+template <>
+bool PeepholeKernelImpl<float>::useJIT(int d) {
+  return false;  // peephole jitcode not ready yet
+}
+#endif
+
+#define JITKERNEL_DEFINE_NAME_LSTM(ker_key, ker_class)                 \
+  template <>                                                          \
+  std::string ker_class##Impl<float>::name(const lstm_attr_t& attr) {  \
+    std::string key(#ker_key "f");                                     \
+    key += (attr.act_gate + attr.act_cand + attr.act_cell +            \
+            (attr.use_peephole ? "p" : "n"));                          \
+    if (useJIT(attr.d)) {                                              \
+      /* only jit code need record d*/                                 \
+      return key + "jit" + std::to_string(attr.d);                     \
+    } else if (useMKL(attr.d)) {                                       \
+      return key + "mkl";                                              \
+    } else {                                                           \
+      return key + "any";                                              \
+    }                                                                  \
+  }                                                                    \
+  template <>                                                          \
+  std::string ker_class##Impl<double>::name(const lstm_attr_t& attr) { \
+    std::string key(#ker_key "d");                                     \
+    /* jit code do not support double yet*/                            \
+    if (useMKL(attr.d)) {                                              \
+      return key + "mkl";                                              \
+    } else {                                                           \
+      return key + "any";                                              \
+    }                                                                  \
   }
 
-REGISTER_JITKERNEL_ARGS_DEPRECATED(lstm, LSTMKernel, JITKERNEL_DECLARE_LSTM,
-                                   JITKERNEL_KEY_LSTM, JITKERNEL_NEW_LSTM_IMPL);
+#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype)          \
+  template <>                                                 \
+  std::shared_ptr<const LSTMKernel<ker_dtype>>                \
+  KernelPool::Get<LSTMKernel<ker_dtype>, const lstm_attr_t&>( \
+      const lstm_attr_t& attr)
+
+#define JITKERNEL_FIND_KEY_LSTM(ker_class, ker_dtype) \
+  std::string key = ker_class##Impl<ker_dtype>::name(attr)
+
+#define JITKERNEL_LSTM_IMPL(ker, dtype)                     \
+  if (attr.use_peephole) {                                  \
+    p = std::dynamic_pointer_cast<ker<dtype>>(              \
+        std::make_shared<PeepholeKernelImpl<dtype>>(attr)); \
+  } else {                                                  \
+    p = std::dynamic_pointer_cast<ker<dtype>>(              \
+        std::make_shared<ker##Impl<dtype>>(attr));          \
+  }
 
-#undef INTRI8_FLOAT
-#undef JITKERNEL_DECLARE_LSTM
-#undef JITKERNEL_KEY_LSTM
-#undef JITKERNEL_NEW_LSTM_IMPL
+REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DEFINE_NAME_LSTM,
+                        JITKERNEL_DECLARE_LSTM, JITKERNEL_FIND_KEY_LSTM,
+                        JITKERNEL_LSTM_IMPL);
 
 /* GRU JitKernel */
 template <typename T, jit::cpu_isa_t isa, jit_block>
diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc
index a1705a81c4..1cbe1b5d95 100644
--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
@@ -341,11 +341,11 @@ TEST(JitKernel, lstm) {
     RandomVec<float>(d, ct_1.data(), -2.f, 2.f);
     memcpy(xref.data(), x.data(), sizeof(float) * d4);
     std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh";
+    const jit::lstm_attr_t attr(d, act_gate, act_cand, act_cell, false);
     const auto& ker =
         jit::KernelPool::Instance()
-            .template Get<jit::LSTMKernel<float>, const std::string&,
-                          const std::string&, const std::string&>(
-                act_gate, act_cand, act_cell, d, false);
+            .template Get<jit::LSTMKernel<float>, const jit::lstm_attr_t&>(
+                attr);
     // below kernels are used to compute refer
     const auto& vsigmoid_3d =
         jit::KernelPool::Instance().template Get<jit::VSigmoidKernel<float>>(
@@ -366,14 +366,16 @@ TEST(JitKernel, lstm) {
     float* ht_ref_data = ht_ref.data();
     // compute once to check correctness
     jit::lstm_t step;
-    jit::lstm_attr_t attr(d, act_gate, act_cand, act_cell);
     step.gates = xref_data;
     step.ct_1 = ct_1_data;
     step.ct = ct_ref_data;
     step.ht = ht_ref_data;
     refer::LSTMCtHt<float>(&step, &attr);
 
-    ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data);
+    step.gates = x_data;
+    step.ct = ct_tgt_data;
+    step.ht = ht_tgt_data;
+    ker->ComputeCtHt(&step, &attr);
     for (int i = 0; i < d; ++i) {
       EXPECT_NEAR(ct_tgt_data[i], ct_ref_data[i], 1e-3);
       EXPECT_NEAR(ht_tgt_data[i], ht_ref_data[i], 1e-3);
@@ -392,7 +394,7 @@ TEST(JitKernel, lstm) {
     auto trefe = GetCurrentUS();
     auto ttgts = GetCurrentUS();
     for (int i = 0; i < repeat; ++i) {
-      ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data);
+      ker->ComputeCtHt(&step, &attr);
     }
     auto ttgte = GetCurrentUS();
     VLOG(30) << "Vec size " << d
@@ -710,21 +712,21 @@ TEST(JitKernel, pool) {
   namespace jit = paddle::operators::math::jitkernel;
   const int frame_size = 4;
   std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh";
+  jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false);
+
   const auto& plstm1 =
       jit::KernelPool::Instance()
-          .template Get<jit::LSTMKernel<float>, const std::string&,
-                        const std::string&, const std::string&>(
-              act_gate, act_cand, act_cell, frame_size, false);
+          .template Get<jit::LSTMKernel<float>, const jit::lstm_attr_t&>(attr);
+
   const auto& plstm2 =
       jit::KernelPool::Instance()
-          .template Get<jit::LSTMKernel<float>, const std::string&,
-                        const std::string&, const std::string&>(
-              act_gate, act_cand, act_cell, frame_size, false);
+          .template Get<jit::LSTMKernel<float>, const jit::lstm_attr_t&>(attr);
+  EXPECT_EQ(plstm1, plstm2);
+
   const auto& peephole =
       jit::KernelPool::Instance()
-          .template Get<jit::LSTMKernel<float>, const std::string&,
-                        const std::string&, const std::string&>(
-              act_gate, act_cand, act_cell, frame_size, true);
+          .template Get<jit::LSTMKernel<float>, const jit::lstm_attr_t&>(
+              jit::lstm_attr_t(frame_size, act_gate, act_cand, act_cell, true));
   EXPECT_TRUE(plstm1 != peephole);
 
   const auto& pvmul_f =

From 94de2290f4cbc6df0050d06a2d381ba941eb78d1 Mon Sep 17 00:00:00 2001
From: jerrywgz <jerrwgz@126.com>
Date: Wed, 21 Nov 2018 08:58:04 +0000
Subject: [PATCH 041/113] fix format in api doc, test=develop

---
 python/paddle/fluid/layers/nn.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 5749bcc54a..be120e45d2 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -6904,13 +6904,13 @@ def prelu(x, mode, param_attr=None, name=None):
 
     Args:
         x (Variable): The input tensor.
-	  param_attr(ParamAttr|None): The parameter attribute for the learnable
-                                      weight (alpha).
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+                       weight (alpha).
         mode (string): The mode for weight sharing. It supports all, channel
                        and element. all: all elements share same weight
                        channel:elements in a channel share same weight
                        element:each element has a weight
-	name(str|None): A name for this layer(optional). If set None, the layer
+        name(str|None): A name for this layer(optional). If set None, the layer
                        will be named automatically.
 
     Returns:
@@ -6920,9 +6920,9 @@ def prelu(x, mode, param_attr=None, name=None):
 
         .. code-block:: python
 
-         x = fluid.layers.data(name="x", shape=[10,10], dtype="float32")
-         mode = 'channel'
-         output = fluid.layers.prelu(x,mode)
+            x = fluid.layers.data(name="x", shape=[10,10], dtype="float32")
+            mode = 'channel'
+            output = fluid.layers.prelu(x,mode)
     """
     helper = LayerHelper('prelu', **locals())
     if mode not in ['all', 'channel', 'element']:

From f10e196fc8de5d76333940a263dabf33f0450fa5 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Wed, 21 Nov 2018 18:09:44 +0800
Subject: [PATCH 042/113] fix build issue

---
 paddle/fluid/inference/tensorrt/convert/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 27fb41d16e..840abd26a7 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -18,7 +18,7 @@ nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
 nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
         DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine conv_op conv_transpose_op SERIAL)
 nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
-        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op SERIAL)
+        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op tensorrt_plugin SERIAL)
 nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
         DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
              elementwise_add_op elementwise_mul_op SERIAL)

From 3e3599f3d937e0444606056f3c9f2261b74dfd93 Mon Sep 17 00:00:00 2001
From: hjchen2 <chenhoujiangcug@gmail.com>
Date: Wed, 21 Nov 2018 11:31:04 +0000
Subject: [PATCH 043/113] Refine split tensorrt plugin

---
 .../inference/tensorrt/convert/split_op.cc    |   3 +-
 .../tensorrt/plugin/split_op_plugin.cu        | 157 ++++++++++++++----
 .../tensorrt/plugin/split_op_plugin.h         |   9 +-
 3 files changed, 134 insertions(+), 35 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc
index 6620c76318..871354267e 100644
--- a/paddle/fluid/inference/tensorrt/convert/split_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc
@@ -40,7 +40,7 @@ class SplitOpConverter : public OpConverter {
     int axis = boost::get<int>(op_desc.GetAttr("axis"));
     std::vector<int> output_lengths =
         boost::get<std::vector<int>>(op_desc.GetAttr("sections"));
-    PADDLE_ENFORCE(axis != 0);
+    // PADDLE_ENFORCE(axis != 0);
     if (axis < 0) {
       axis += input_dims.nbDims;
     } else {
@@ -48,7 +48,6 @@ class SplitOpConverter : public OpConverter {
     }
 
     PADDLE_ENFORCE(output_lengths.size() == output_num);
-
     //
     plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths);
     nvinfer1::IPluginLayer* layer =
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
index 4adea2db1e..1ec0753e9f 100644
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <cuda_fp16.h>
+#include <algorithm>
 #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
 
 namespace paddle {
@@ -19,6 +21,52 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {
 
+// copied from operators::math::SplitFunctor
+template <typename T>
+__global__ void SplitKernel(const T* input_data, const int in_row,
+                            const int in_col, const int* out_cols,
+                            int out_cols_size, T** outputs_data) {
+  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  int curr_segment = 0;
+  int curr_offset = out_cols[0];
+  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
+    int curr_col_offset = out_cols[curr_segment + 1];
+    while (curr_col_offset <= tid_x) {
+      curr_offset = curr_col_offset;
+      ++curr_segment;
+      curr_col_offset = out_cols[curr_segment + 1];
+    }
+
+    int local_col = tid_x - curr_offset;
+    int segment_width = curr_col_offset - curr_offset;
+    T* output_ptr = outputs_data[curr_segment];
+    if (output_ptr != nullptr) {
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * segment_width + local_col] =
+            input_data[tid_y * in_col + tid_x];
+    }
+  }
+}
+
+template <typename T>
+__global__ void SplitKernel(const T* input_data, const int in_row,
+                            const int in_col, const int fixed_out_col,
+                            T** outputs_data) {
+  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
+    int split = tid_x / fixed_out_col;
+    int in_offset = tid_x - split * fixed_out_col;
+    T* output_ptr = outputs_data[split];
+    if (output_ptr != nullptr) {
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * fixed_out_col + in_offset] =
+            input_data[tid_y * in_col + tid_x];
+    }
+  }
+}
+
 nvinfer1::Dims SplitPlugin::getOutputDimensions(
     int index, const nvinfer1::Dims* input_dims, int num_inputs) {
   PADDLE_ENFORCE_EQ(num_inputs, 1);
@@ -31,48 +79,95 @@ nvinfer1::Dims SplitPlugin::getOutputDimensions(
 
 int SplitPlugin::initialize() {
   PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS);
-
+  // notice input dims is [C, H, W]
+  nvinfer1::Dims dims = this->getInputDims(0);
+  outer_rows_ = 1;
+  inner_cols_ = 1;
+  for (int i = 0; i < axis_; ++i) {
+    outer_rows_ *= dims.d[i];
+  }
+  for (int i = axis_ + 1; i < dims.nbDims; ++i) {
+    inner_cols_ *= dims.d[i];
+  }
+  same_shape_ = true;
   std::vector<int> segment_offsets(1, 0);
   for (int i = 0; i < this->getNbOutputs(); ++i) {
-    segment_offsets.push_back(segment_offsets.back() + output_length_[i]);
+    if (output_length_[i] != output_length_[0]) {
+      same_shape_ = false;
+    }
+    segment_offsets.push_back(segment_offsets.back() +
+                              output_length_[i] * inner_cols_);
   }
-  segment_offsets_ = segment_offsets;
-  nvinfer1::Dims dims = this->getInputDims(0);
-  nx_ = 1;
-  for (int i = dims.nbDims - 1; i > axis_; --i) {
-    nx_ *= dims.d[i];
+  inner_cols_ *= dims.d[axis_];
+  d_segment_offsets_ = segment_offsets;
+  segment_offsets_ = std::move(segment_offsets);
+  d_output_ptrs_.resize(this->getNbOutputs(), nullptr);
+  return 0;
+}
+
+template <typename T>
+inline void Split(cudaStream_t stream, const bool same_shape,
+                  const int outer_rows, const int inner_cols,
+                  const std::vector<int>& segment_offsets,
+                  const int* d_segment_offsets, const T* input, T** outputs) {
+  const int kThreadsPerBlock = 1024;
+  const int kMaxBlocks = 65535;
+  int block_cols = kThreadsPerBlock;
+  if (inner_cols < kThreadsPerBlock) {  // block_cols is aligned by 32.
+    block_cols = ((inner_cols + 31) >> 5) << 5;
   }
-  ny_ = dims.d[axis_];
-  nz_ = 1;
-  for (int i = axis_ - 1; i >= 0; --i) {
-    nz_ *= dims.d[i];
+  int block_rows = kThreadsPerBlock / block_cols;
+  dim3 block_size = dim3(block_cols, block_rows, 1);
+
+  int grid_cols =
+      std::min((inner_cols + block_cols - 1) / block_cols, kMaxBlocks);
+  int grid_rows =
+      std::min(kMaxBlocks / grid_cols, std::max(outer_rows / block_rows, 1));
+  dim3 grid_size = dim3(grid_cols, grid_rows, 1);
+
+  if (same_shape) {
+    SplitKernel<<<grid_size, block_size, 0, stream>>>(
+        input, outer_rows, inner_cols, segment_offsets[1], outputs);
+  } else {
+    SplitKernel<<<grid_size, block_size, 0, stream>>>(
+        input, outer_rows, inner_cols, d_segment_offsets,
+        static_cast<int>(segment_offsets.size()), outputs);
   }
-  return 0;
 }
 
 int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
                          void** outputs, void* workspace, cudaStream_t stream) {
-  auto const& input_dims = this->getInputDims(0);
-  int input_size = 0;
-  float const* idata = reinterpret_cast<float const*>(inputs[0]);
-  float** odatas = reinterpret_cast<float**>(outputs);
-
-  // kernel impl here.
-  int inputBatchOffset = nx_ * ny_ * nz_;
-  for (size_t i = 0; i < this->getNbOutputs(); i++) {
-    for (size_t j = 0; j < batchSize; j++) {
-      cudaMemcpyAsync(
-          odatas[i] +
-              j * (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ *
-                  sizeof(float),
-          inputs[0] +
-              (inputBatchOffset * j + segment_offsets_[i] * nx_) *
-                  sizeof(float),
-          (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * sizeof(float),
-          cudaMemcpyDeviceToDevice, stream);
+  float const* input_ptr = reinterpret_cast<float const*>(inputs[0]);
+  if (axis_ == -1 && this->getNbOutputs() < 10) {
+    float** output_ptrs = reinterpret_cast<float**>(outputs);
+    int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT)
+                             ? sizeof(__half)
+                             : sizeof(float);
+    for (int i = 0; i < this->getNbOutputs(); ++i) {
+      PADDLE_ENFORCE(
+          cudaMemcpyAsync(
+              output_ptrs[i], input_ptr + segment_offsets_[i],
+              (segment_offsets_[i + 1] - segment_offsets_[i]) * data_type_size,
+              cudaMemcpyDeviceToDevice, stream) == cudaSuccess);
+    }
+  } else {
+    outer_rows_ *= batchSize;
+    const int* d_segment_offsets_ptr =
+        thrust::raw_pointer_cast(&d_segment_offsets_[0]);
+    float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]);
+    PADDLE_ENFORCE(cudaMemcpyAsync(output_ptrs, outputs,
+                                   this->getNbOutputs() * sizeof(float*),
+                                   cudaMemcpyHostToDevice,
+                                   stream) == cudaSuccess);
+    if (this->getDataType() == nvinfer1::DataType::kFLOAT) {
+      Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_,
+            d_segment_offsets_ptr, input_ptr, output_ptrs);
+    } else {
+      Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_,
+            d_segment_offsets_ptr, (__half*)input_ptr,  // NOLINT
+            (__half**)output_ptrs);                     // NOLINT
     }
   }
-
   return cudaGetLastError() != cudaSuccess;
 }
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
index b5b6e69992..6f028d3d72 100644
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <thrust/device_vector.h>
 #include <vector>
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 
@@ -25,7 +26,7 @@ namespace plugin {
 class SplitPlugin : public PluginTensorRT {
  public:
   SplitPlugin(int axis, std::vector<int> const &output_lengths)
-      : axis_(axis), output_length_(output_lengths) {}
+      : axis_(axis), same_shape_(true), output_length_(output_lengths) {}
 
   SplitPlugin(void const *serial_data, size_t serial_length) {
     deserializeBase(serial_data, serial_length);
@@ -60,9 +61,13 @@ class SplitPlugin : public PluginTensorRT {
   }
 
   int axis_;
+  int outer_rows_;
+  int inner_cols_;
+  bool same_shape_;
   std::vector<int> output_length_;
-  int nx_, ny_, nz_;
   std::vector<int> segment_offsets_;
+  thrust::device_vector<int> d_segment_offsets_;
+  thrust::device_vector<float *> d_output_ptrs_;
 };
 
 }  // namespace plugin

From 35620513023000ceb47ec0b57909dae4f0634355 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Wed, 21 Nov 2018 12:37:28 +0000
Subject: [PATCH 044/113] add gru refer code and remove redundant avx code

test=develop
---
 paddle/fluid/operators/fused/fusion_gru_op.cc |  67 ++--
 paddle/fluid/operators/math/jit_kernel.h      |   8 +-
 paddle/fluid/operators/math/jit_kernel_exp.cc | 152 ---------
 paddle/fluid/operators/math/jit_kernel_impl.h |  30 +-
 .../fluid/operators/math/jit_kernel_refer.h   |  40 +++
 paddle/fluid/operators/math/jit_kernel_rnn.cc | 294 ++++--------------
 6 files changed, 163 insertions(+), 428 deletions(-)

diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc
index 7e34d1019c..25b7ae7c28 100644
--- a/paddle/fluid/operators/fused/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fused/fusion_gru_op.cc
@@ -183,24 +183,27 @@ class FusionGRUKernel : public framework::OpKernel<T> {
   const int total_T = x_dims[0];           \
   const int D3 = wh_dims[1]
 
-#define INIT_OTHER_DEFINES                                                     \
-  auto* h0 = ctx.Input<Tensor>("H0");                                          \
-  auto* wx = ctx.Input<Tensor>("WeightX");                                     \
-  auto* bias = ctx.Input<Tensor>("Bias");                                      \
-  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");                          \
-  bool is_reverse = ctx.Attr<bool>("is_reverse");                              \
-  const int M = x_dims[1];                                                     \
-  const int D = wh_dims[0];                                                    \
-  const int D2 = D * 2;                                                        \
-  const auto& ker = math::jitkernel::KernelPool::Instance()                    \
-                        .template Get<math::jitkernel::GRUKernel<T>,           \
-                                      const std::string&, const std::string&>( \
-                            ctx.Attr<std::string>("gate_activation"),          \
-                            ctx.Attr<std::string>("activation"), D);           \
-  const T* x_data = x->data<T>();                                              \
-  const T* wx_data = wx->data<T>();                                            \
-  const T* wh_data = wh->data<T>();                                            \
-  auto place = ctx.GetPlace();                                                 \
+#define INIT_OTHER_DEFINES                                         \
+  auto* h0 = ctx.Input<Tensor>("H0");                              \
+  auto* wx = ctx.Input<Tensor>("WeightX");                         \
+  auto* bias = ctx.Input<Tensor>("Bias");                          \
+  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");              \
+  bool is_reverse = ctx.Attr<bool>("is_reverse");                  \
+  const int M = x_dims[1];                                         \
+  const int D = wh_dims[0];                                        \
+  const int D2 = D * 2;                                            \
+  const math::jitkernel::gru_attr_t attr(                          \
+      D, ctx.Attr<std::string>("gate_activation"),                 \
+      ctx.Attr<std::string>("activation"));                        \
+  math::jitkernel::gru_t one_step;                                 \
+  const auto& ker =                                                \
+      math::jitkernel::KernelPool::Instance()                      \
+          .template Get<math::jitkernel::GRUKernel<T>,             \
+                        const math::jitkernel::gru_attr_t&>(attr); \
+  const T* x_data = x->data<T>();                                  \
+  const T* wx_data = wx->data<T>();                                \
+  const T* wh_data = wh->data<T>();                                \
+  auto place = ctx.GetPlace();                                     \
   T* xx_data = xx->mutable_data<T>(place)
 
   void SeqCompute(const framework::ExecutionContext& ctx) const {
@@ -237,7 +240,9 @@ class FusionGRUKernel : public framework::OpKernel<T> {
       if (h0_data) {
         prev_hidden_data = h0_data + bid * D;
       } else {
-        ker->ComputeH1(xx_data, hidden_out_data);
+        one_step.gates = xx_data;
+        one_step.ht = hidden_out_data;
+        ker->ComputeH1(&one_step, &attr);
         prev_hidden_data = hidden_out_data;
         tstart = 1;
         move_step();
@@ -247,12 +252,15 @@ class FusionGRUKernel : public framework::OpKernel<T> {
         blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D2, D, static_cast<T>(1),
                   prev_hidden_data, D, wh_data, D2, static_cast<T>(1), xx_data,
                   D3);
-        ker->ComputeHtPart1(xx_data, prev_hidden_data, hidden_out_data);
+        one_step.gates = xx_data;
+        one_step.ht_1 = prev_hidden_data;
+        one_step.ht = hidden_out_data;
+        ker->ComputeHtPart1(&one_step, &attr);
         // gemm rt * Ws
         blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast<T>(1),
                   hidden_out_data, D, wh_state_data, D, static_cast<T>(1),
                   xx_data + D2, D3);
-        ker->ComputeHtPart2(xx_data, prev_hidden_data, hidden_out_data);
+        ker->ComputeHtPart2(&one_step, &attr);
         // save prev
         prev_hidden_data = hidden_out_data;
         move_step();
@@ -314,7 +322,9 @@ class FusionGRUKernel : public framework::OpKernel<T> {
       T* cur_out_data = batched_out_data;
       // W: {W_update, W_reset; W_state}
       for (int i = 0; i < max_bs; ++i) {
-        ker->ComputeH1(cur_in_data, cur_out_data);
+        one_step.gates = cur_in_data;
+        one_step.ht = cur_out_data;
+        ker->ComputeH1(&one_step, &attr);
         // add offset
         cur_in_data += D3;
         cur_out_data += D;
@@ -339,8 +349,11 @@ class FusionGRUKernel : public framework::OpKernel<T> {
       T* cur_out_data = batched_out_data;
       T* cur_prev_hidden_data = prev_hidden_data;
       for (int i = 0; i < cur_bs; ++i) {
-        ker->ComputeHtPart1(cur_batched_data, cur_prev_hidden_data,
-                            cur_out_data);
+        one_step.gates = cur_batched_data;
+        one_step.ht_1 = cur_prev_hidden_data;
+        one_step.ht = cur_out_data;
+        ker->ComputeHtPart1(&one_step, &attr);
+
         cur_batched_data += D3;
         cur_prev_hidden_data += D;
         cur_out_data += D;
@@ -354,8 +367,10 @@ class FusionGRUKernel : public framework::OpKernel<T> {
 
       cur_prev_hidden_data = prev_hidden_data;
       for (int i = 0; i < cur_bs; ++i) {
-        ker->ComputeHtPart2(cur_batched_data, cur_prev_hidden_data,
-                            cur_out_data);
+        one_step.gates = cur_batched_data;
+        one_step.ht_1 = cur_prev_hidden_data;
+        one_step.ht = cur_out_data;
+        ker->ComputeHtPart2(&one_step, &attr);
         cur_batched_data += D3;
         cur_prev_hidden_data += D;
         cur_out_data += D;
diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h
index bb5ba5813a..b78b92b4f9 100644
--- a/paddle/fluid/operators/math/jit_kernel.h
+++ b/paddle/fluid/operators/math/jit_kernel.h
@@ -122,18 +122,18 @@ class VTanhKernel : public VActKernel<T> {};
 template <typename T>
 class LSTMKernel : public Kernel {
  public:
-  void (*ComputeCtHt)(lstm_t *, const lstm_attr_t *);
   // compute c1 and h1 without c0 or h0
   void (*ComputeC1H1)(lstm_t *, const lstm_attr_t *);
+  void (*ComputeCtHt)(lstm_t *, const lstm_attr_t *);
 };
 
 template <typename T>
 class GRUKernel : public Kernel {
  public:
   // compute h1 without h0
-  virtual void ComputeH1(T *gates, T *ht) const = 0;
-  virtual void ComputeHtPart1(T *gates, const T *ht_1, T *ht) const = 0;
-  virtual void ComputeHtPart2(T *gates, const T *ht_1, T *ht) const = 0;
+  void (*ComputeH1)(gru_t *, const gru_attr_t *);
+  void (*ComputeHtPart1)(gru_t *, const gru_attr_t *);
+  void (*ComputeHtPart2)(gru_t *, const gru_attr_t *);
 };
 
 template <typename T>
diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc
index 1fe7d66c75..686f3dd983 100644
--- a/paddle/fluid/operators/math/jit_kernel_exp.cc
+++ b/paddle/fluid/operators/math/jit_kernel_exp.cc
@@ -25,10 +25,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/mklml.h"
 #endif
 
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-
 namespace paddle {
 namespace operators {
 namespace math {
@@ -235,154 +231,6 @@ REGISTER_JITKERNEL(vexp, VExpKernel);
 REGISTER_JITKERNEL(vsigmoid, VSigmoidKernel);
 REGISTER_JITKERNEL(vtanh, VTanhKernel);
 
-namespace detail {
-
-#ifdef __AVX__
-
-#define ALIGN32 __attribute__((aligned(32)))
-
-#define _PS256_CONST(Name, Val)                                      \
-  static const float _ps256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \
-                                                 Val, Val, Val, Val}
-
-#define _PI256_CONST(Name, Val)                                    \
-  static const int _pi256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \
-                                               Val, Val, Val, Val}
-
-_PI256_CONST(0x7f, 0x7f);
-_PS256_CONST(one, 1.f);
-_PS256_CONST(0p5, 0.5f);
-_PS256_CONST(exp_hi, 88.3762626647949f);
-_PS256_CONST(exp_lo, -88.3762626647949f);
-_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
-_PS256_CONST(cephes_exp_C1, 0.693359375);
-_PS256_CONST(cephes_exp_C2, -2.12194440e-4);
-_PS256_CONST(cephes_exp_p0, 1.9875691500E-4);
-_PS256_CONST(cephes_exp_p1, 1.3981999507E-3);
-_PS256_CONST(cephes_exp_p2, 8.3334519073E-3);
-_PS256_CONST(cephes_exp_p3, 4.1665795894E-2);
-_PS256_CONST(cephes_exp_p4, 1.6666665459E-1);
-_PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
-
-typedef union imm_xmm_union {
-  __m256i imm;
-  __m128i xmm[2];
-} imm_xmm_union;
-
-#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \
-  {                                         \
-    imm_xmm_union u ALIGN32;                \
-    u.imm = imm_;                           \
-    xmm0_ = u.xmm[0];                       \
-    xmm1_ = u.xmm[1];                       \
-  }
-
-#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \
-  {                                         \
-    imm_xmm_union u ALIGN32;                \
-    u.xmm[0] = xmm0_;                       \
-    u.xmm[1] = xmm1_;                       \
-    imm_ = u.imm;                           \
-  }
-
-#define AVX2_BITOP_USING_SSE2(fn)                           \
-  static inline __m256i avx2_mm256_##fn(__m256i x, int y) { \
-    /* use SSE2 to perform the bitop AVX2 */                \
-    __m128i x1, x2;                                         \
-    __m256i ret;                                            \
-    COPY_IMM_TO_XMM(x, x1, x2);                             \
-    x1 = _mm_##fn(x1, y);                                   \
-    x2 = _mm_##fn(x2, y);                                   \
-    COPY_XMM_TO_IMM(x1, x2, ret);                           \
-    return ret;                                             \
-  }
-
-#define AVX2_INTOP_USING_SSE2(fn)                                    \
-  static inline __m256i avx2_mm256_add_epi32(__m256i x, __m256i y) { \
-    /* use SSE2 to perform the AVX2 integer operation */             \
-    __m128i x1, x2;                                                  \
-    __m128i y1, y2;                                                  \
-    __m256i ret;                                                     \
-    COPY_IMM_TO_XMM(x, x1, x2);                                      \
-    COPY_IMM_TO_XMM(y, y1, y2);                                      \
-    x1 = _mm_##fn(x1, y1);                                           \
-    x2 = _mm_##fn(x2, y2);                                           \
-    COPY_XMM_TO_IMM(x1, x2, ret);                                    \
-    return ret;                                                      \
-  }
-
-AVX2_BITOP_USING_SSE2(slli_epi32);
-AVX2_INTOP_USING_SSE2(add_epi32);
-
-#define AVXEXP_BASE                                                            \
-  __m256 tmp = _mm256_setzero_ps(), fx;                                        \
-  __m256 one = *reinterpret_cast<const __m256*>(_ps256_one);                   \
-  __m256i imm0;                                                                \
-  x = _mm256_min_ps(x, *reinterpret_cast<const __m256*>(_ps256_exp_hi));       \
-  x = _mm256_max_ps(x, *reinterpret_cast<const __m256*>(_ps256_exp_lo));       \
-  /* express exp(x) as exp(g + n*log(2)) */                                    \
-  fx = _mm256_mul_ps(x,                                                        \
-                     *reinterpret_cast<const __m256*>(_ps256_cephes_LOG2EF));  \
-  fx = _mm256_add_ps(fx, *reinterpret_cast<const __m256*>(_ps256_0p5));        \
-  tmp = _mm256_floor_ps(fx);                                                   \
-  /* if greater, substract 1 */                                                \
-  __m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);                            \
-  mask = _mm256_and_ps(mask, one);                                             \
-  fx = _mm256_sub_ps(tmp, mask);                                               \
-  tmp = _mm256_mul_ps(fx,                                                      \
-                      *reinterpret_cast<const __m256*>(_ps256_cephes_exp_C1)); \
-  __m256 z = _mm256_mul_ps(                                                    \
-      fx, *reinterpret_cast<const __m256*>(_ps256_cephes_exp_C2));             \
-  x = _mm256_sub_ps(x, tmp);                                                   \
-  x = _mm256_sub_ps(x, z);                                                     \
-  z = _mm256_mul_ps(x, x);                                                     \
-  __m256 y = *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p0);           \
-  y = _mm256_mul_ps(y, x);                                                     \
-  y = _mm256_add_ps(y,                                                         \
-                    *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p1));   \
-  y = _mm256_mul_ps(y, x);                                                     \
-  y = _mm256_add_ps(y,                                                         \
-                    *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p2));   \
-  y = _mm256_mul_ps(y, x);                                                     \
-  y = _mm256_add_ps(y,                                                         \
-                    *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p3));   \
-  y = _mm256_mul_ps(y, x);                                                     \
-  y = _mm256_add_ps(y,                                                         \
-                    *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p4));   \
-  y = _mm256_mul_ps(y, x);                                                     \
-  y = _mm256_add_ps(y,                                                         \
-                    *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p5));   \
-  y = _mm256_mul_ps(y, z);                                                     \
-  y = _mm256_add_ps(y, x);                                                     \
-  y = _mm256_add_ps(y, one);                                                   \
-  /* build 2^n */                                                              \
-  imm0 = _mm256_cvttps_epi32(fx)
-
-__m256 ExpAVX(__m256 x) {
-  AVXEXP_BASE;
-  // two AVX2 instructions using SSE2
-  imm0 = avx2_mm256_add_epi32(imm0,
-                              *reinterpret_cast<const __m256i*>(_pi256_0x7f));
-  imm0 = avx2_mm256_slli_epi32(imm0, 23);
-  __m256 pow2n = _mm256_castsi256_ps(imm0);
-  y = _mm256_mul_ps(y, pow2n);
-  return y;
-}
-#endif
-
-#ifdef __AVX2__
-__m256 ExpAVX2(__m256 x) {
-  AVXEXP_BASE;
-  // two AVX2 instructions
-  imm0 = _mm256_add_epi32(imm0, *reinterpret_cast<const __m256i*>(_pi256_0x7f));
-  imm0 = _mm256_slli_epi32(imm0, 23);
-  __m256 pow2n = _mm256_castsi256_ps(imm0);
-  y = _mm256_mul_ps(y, pow2n);
-  return y;
-}
-#endif
-
-}  // namespace detail
 }  // namespace jitkernel
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/fluid/operators/math/jit_kernel_impl.h b/paddle/fluid/operators/math/jit_kernel_impl.h
index 2e734ca940..ba5f20e533 100644
--- a/paddle/fluid/operators/math/jit_kernel_impl.h
+++ b/paddle/fluid/operators/math/jit_kernel_impl.h
@@ -38,20 +38,34 @@ typedef struct {
   void* checked{nullptr};
 } lstm_t;
 
-typedef struct lstm_attr_s {
-  bool use_peephole;
+typedef struct {
+  void* gates;  // gates: {W_update, W_reset; W_state}
+  const void* ht_1;
+  void* ht;
+} gru_t;
+
+struct rnn_attr_s {
   int d;
-  std::string act_gate, act_cand, act_cell;
+  std::string act_gate, act_cand;
+  rnn_attr_s() = default;
+  rnn_attr_s(int _d, const std::string& _act_gate, const std::string& _act_cand)
+      : d(_d), act_gate(_act_gate), act_cand(_act_cand) {}
+};
+
+struct lstm_attr_s : public rnn_attr_s {
+  bool use_peephole;
+  std::string act_cell;
   lstm_attr_s() = default;
   lstm_attr_s(int _d, const std::string& _act_gate,
               const std::string& _act_cand, const std::string& _act_cell,
               bool _use_peephole = false)
-      : use_peephole(_use_peephole),
-        d(_d),
-        act_gate(_act_gate),
-        act_cand(_act_cand),
+      : rnn_attr_s(_d, _act_gate, _act_cand),
+        use_peephole(_use_peephole),
         act_cell(_act_cell) {}
-} lstm_attr_t;
+};
+
+typedef struct rnn_attr_s gru_attr_t;
+typedef struct lstm_attr_s lstm_attr_t;
 
 }  // namespace jitkernel
 }  // namespace math
diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h
index 097bb85956..2e1a7f22db 100644
--- a/paddle/fluid/operators/math/jit_kernel_refer.h
+++ b/paddle/fluid/operators/math/jit_kernel_refer.h
@@ -185,6 +185,46 @@ void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) {
   VMul(gates + d2, gates + d3, ht, d);
 }
 
+// compute h1 without h0
+template <typename T>
+void GRUH1(gru_t* step, const gru_attr_t* attr) {
+  T* gates = reinterpret_cast<T*>(step->gates);
+  T* ht = reinterpret_cast<T*>(step->ht);
+  auto act_gate = getActFunc<T>(attr->act_gate);
+  auto act_cand = getActFunc<T>(attr->act_cand);
+  int d = attr->d;
+  int d2 = d * 2;
+  act_gate(gates, gates, d);
+  act_cand(gates + d2, gates + d2, d);
+  VMul(gates, gates + d2, ht, d);
+}
+
+template <typename T>
+void GRUHtPart1(gru_t* step, const gru_attr_t* attr) {
+  // W: {W_update, W_reset; W_state}
+  T* gates = reinterpret_cast<T*>(step->gates);
+  T* ht = reinterpret_cast<T*>(step->ht);
+  const T* ht_1 = reinterpret_cast<const T*>(step->ht_1);
+  auto act_gate = getActFunc<T>(attr->act_gate);
+  act_gate(gates, gates, attr->d * 2);
+  VMul(ht_1, gates + attr->d, ht, attr->d);
+}
+
+template <typename T>
+void GRUHtPart2(gru_t* step, const gru_attr_t* attr) {
+  T* gates = reinterpret_cast<T*>(step->gates);
+  T* ht = reinterpret_cast<T*>(step->ht);
+  const T* ht_1 = reinterpret_cast<const T*>(step->ht_1);
+  auto act_cand = getActFunc<T>(attr->act_cand);
+  int d = attr->d;
+  T* y = gates + d * 2;
+  act_cand(y, y, d);
+  // out = zt*ht~ + (1-zt)*ht_1
+  for (int i = 0; i < d; ++i) {
+    ht[i] = gates[i] * y[i] + (static_cast<T>(1) - gates[i]) * ht_1[i];
+  }
+}
+
 }  // namespace refer
 }  // namespace jitkernel
 }  // namespace math
diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc
index 6b7463aa52..dbfd212e6e 100644
--- a/paddle/fluid/operators/math/jit_kernel_rnn.cc
+++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc
@@ -23,140 +23,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/jit_code.h"
 #endif
 
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-
 namespace paddle {
 namespace operators {
 namespace math {
 namespace jitkernel {
-namespace detail {
-#ifdef __AVX__
-__m256 ExpAVX(__m256 x);
-#endif
-
-#ifdef __AVX2__
-__m256 ExpAVX2(__m256 x);
-#endif
-
-}  // namespace detail
-
-namespace jit = platform::jit;
-
-#ifdef __AVX__
-typedef enum { kSigmoid, kRelu, kTanh, kIdentity } act_type;
-
-class AVXAct {
- public:
-  virtual ~AVXAct() = default;
-  virtual __m256 Compute(__m256 x) const = 0;
-};
-
-template <act_type type, jit::cpu_isa_t isa>
-class AVXActImpl : public AVXAct {
- public:
-  __m256 Compute(__m256 x) const override { PADDLE_THROW("Unkown type!"); }
-};
-
-#define AVX_SIGMOID(isa, expisa)                                 \
-  template <>                                                    \
-  __m256 AVXActImpl<kSigmoid, isa>::Compute(__m256 x) const {    \
-    __m256 ones = _mm256_set1_ps(1.0f);                          \
-    x = _mm256_max_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MIN)); \
-    x = _mm256_min_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MAX)); \
-    x = _mm256_sub_ps(_mm256_set1_ps(0.0f), x);                  \
-    x = expisa(x);                                               \
-    x = _mm256_add_ps(ones, x);                                  \
-    return _mm256_div_ps(ones, x);                               \
-  }
-
-#define AVX_TANH(isa, expisa)                              \
-  template <>                                              \
-  __m256 AVXActImpl<kTanh, isa>::Compute(__m256 x) const { \
-    __m256 ones = _mm256_set1_ps(1.0f);                    \
-    x = _mm256_mul_ps(_mm256_set1_ps(-2.0f), x);           \
-    x = _mm256_min_ps(x, _mm256_set1_ps(EXP_MAX_INPUT));   \
-    x = expisa(x);                                         \
-    x = _mm256_add_ps(ones, x);                            \
-    x = _mm256_div_ps(_mm256_set1_ps(2.0f), x);            \
-    return _mm256_sub_ps(x, ones);                         \
-  }
-
-#define AVX_RELU(isa)                                      \
-  template <>                                              \
-  __m256 AVXActImpl<kRelu, isa>::Compute(__m256 x) const { \
-    return _mm256_max_ps(x, _mm256_setzero_ps());          \
-  }
-
-#define AVX_IDENTITY(isa)                                      \
-  template <>                                                  \
-  __m256 AVXActImpl<kIdentity, isa>::Compute(__m256 x) const { \
-    return x;                                                  \
-  }
-
-#define FOR_EACH_AVX_ISA(macro_) \
-  macro_(jit::avx);              \
-  macro_(jit::avx2);             \
-  macro_(jit::avx512f)
-
-FOR_EACH_AVX_ISA(AVX_RELU);
-FOR_EACH_AVX_ISA(AVX_IDENTITY);
-
-AVX_SIGMOID(jit::avx, detail::ExpAVX);
-AVX_TANH(jit::avx, detail::ExpAVX);
-
-#ifdef __AVX2__
-AVX_SIGMOID(jit::avx2, detail::ExpAVX2);
-AVX_SIGMOID(jit::avx512f, detail::ExpAVX2);
-AVX_TANH(jit::avx2, detail::ExpAVX2);
-AVX_TANH(jit::avx512f, detail::ExpAVX2);
-#endif
-
-#undef FOR_EACH_AVX_ISA
-#undef AVX_IDENTITY
-#undef AVX_RELU
-#undef AVX_TANH
-#undef AVX_SIGMOID
-
-#endif
-
-template <typename T>
-static std::shared_ptr<const VActKernel<T>> GetActKernel(
-    const std::string& type, int n) {
-  if (type == "sigmoid") {
-    return std::dynamic_pointer_cast<const VActKernel<T>>(
-        KernelPool::Instance().template Get<VSigmoidKernel<T>>(n));
-  } else if (type == "relu") {
-    return std::dynamic_pointer_cast<const VActKernel<T>>(
-        KernelPool::Instance().template Get<VReluKernel<T>>(n));
-  } else if (type == "tanh") {
-    return std::dynamic_pointer_cast<const VActKernel<T>>(
-        KernelPool::Instance().template Get<VTanhKernel<T>>(n));
-  } else if (type == "identity" || type == "") {
-    return std::dynamic_pointer_cast<const VActKernel<T>>(
-        KernelPool::Instance().template Get<VIdentityKernel<T>>(n));
-  }
-  PADDLE_THROW("Not support type: %s", type);
-  return nullptr;
-}
-
-#ifdef __AVX__
-template <jit::cpu_isa_t isa>
-static std::unique_ptr<AVXAct> GetAVXAct(const std::string& type) {
-  if (type == "sigmoid") {
-    return std::unique_ptr<AVXAct>(new AVXActImpl<kSigmoid, isa>());
-  } else if (type == "relu") {
-    return std::unique_ptr<AVXAct>(new AVXActImpl<kRelu, isa>());
-  } else if (type == "tanh") {
-    return std::unique_ptr<AVXAct>(new AVXActImpl<kTanh, isa>());
-  } else if (type == "identity" || type == "") {
-    return std::unique_ptr<AVXAct>(new AVXActImpl<kIdentity, isa>());
-  }
-  PADDLE_THROW("Not support type: %s", type);
-  return nullptr;
-}
-#endif
 
 /* LSTM JitKernel */
 template <typename T>
@@ -290,125 +160,73 @@ REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DEFINE_NAME_LSTM,
                         JITKERNEL_DECLARE_LSTM, JITKERNEL_FIND_KEY_LSTM,
                         JITKERNEL_LSTM_IMPL);
 
+#undef JITKERNEL_LSTM_IMPL
+#undef JITKERNEL_FIND_KEY_LSTM
+#undef JITKERNEL_DECLARE_LSTM
+#undef JITKERNEL_DEFINE_NAME_LSTM
+
 /* GRU JitKernel */
-template <typename T, jit::cpu_isa_t isa, jit_block>
+template <typename T>
 class GRUKernelImpl : public GRUKernel<T> {
  public:
-  explicit GRUKernelImpl(const std::string& act_gate,
-                         const std::string& act_state, int d)
-      : GRUKernel<T>() {
-    d_ = d;
-    d2_ = d * 2;
-    act_gate_d2_ = GetActKernel<T>(act_gate, d2_);
-    act_gate_d_ = GetActKernel<T>(act_gate, d);
-    act_state_d_ = GetActKernel<T>(act_state, d);
-    vmul_d_ = KernelPool::Instance().template Get<VMulKernel<T>>(d);
-  }
-
-  void ComputeH1(T* gates, T* ht) const override {
-    act_gate_d_->Compute(gates, gates, d_);
-    act_state_d_->Compute(gates + d2_, gates + d2_, d_);
-    vmul_d_->Compute(gates, gates + d2_, ht, d_);
-  }
-
-  void ComputeHtPart1(T* gates, const T* ht_1, T* ht) const override {
-    // W: {W_update, W_reset; W_state}
-    act_gate_d2_->Compute(gates, gates, d2_);
-    vmul_d_->Compute(ht_1, gates + d_, ht, d_);
+  static inline std::string name(const gru_attr_t& attr) {
+    PADDLE_THROW("DType should be either float or double");
   }
-
-  void ComputeHtPart2(T* gates, const T* ht_1, T* ht) const override {
-    T* y = gates + d2_;
-    act_state_d_->Compute(y, y, d_);
-    // out = zt*ht~ + (1-zt)*ht_1
-    for (int i = 0; i < d_; ++i) {
-      ht[i] = gates[i] * y[i] + (static_cast<T>(1) - gates[i]) * ht_1[i];
-    }
+  static inline bool useJIT(int d) { return false; }
+  static inline bool useMKL(int d) { return false; }
+  explicit GRUKernelImpl(const gru_attr_t& attr) : GRUKernel<T>() {
+    this->ComputeH1 = refer::GRUH1<T>;
+    this->ComputeHtPart1 = refer::GRUHtPart1<T>;
+    this->ComputeHtPart2 = refer::GRUHtPart2<T>;
   }
-
- private:
-  int d_, d2_;
-  std::shared_ptr<const VActKernel<T>> act_gate_d2_, act_gate_d_, act_state_d_;
-  std::shared_ptr<const VMulKernel<T>> vmul_d_;
-#ifdef __AVX__
-  std::unique_ptr<const AVXAct> avx_act_gate_, avx_act_state_;
-#endif
 };
 
-#define INTRI8_FLOAT(isa)                                                     \
-  template <>                                                                 \
-  GRUKernelImpl<float, isa, kEQ8>::GRUKernelImpl(                             \
-      const std::string& act_gate, const std::string& act_state, int d)       \
-      : GRUKernel<float>() {                                                  \
-    avx_act_gate_ = GetAVXAct<isa>(act_gate);                                 \
-    avx_act_state_ = GetAVXAct<isa>(act_state);                               \
-  }                                                                           \
-  template <>                                                                 \
-  void GRUKernelImpl<float, isa, kEQ8>::ComputeH1(float* gates, float* ht)    \
-      const {                                                                 \
-    __m256 u, s;                                                              \
-    /* W: {W_update, W_reset; W_state} */                                     \
-    u = _mm256_loadu_ps(gates);                                               \
-    s = _mm256_loadu_ps(gates + 16);                                          \
-    s = _mm256_mul_ps(avx_act_gate_->Compute(u), avx_act_state_->Compute(s)); \
-    _mm256_storeu_ps(ht, s);                                                  \
-  }                                                                           \
-  template <>                                                                 \
-  void GRUKernelImpl<float, isa, kEQ8>::ComputeHtPart1(                       \
-      float* gates, const float* ht_1, float* ht) const {                     \
-    /* not exactly equal the any implementation */                            \
-    __m256 r, ht0;                                                            \
-    r = _mm256_loadu_ps(gates + 8);                                           \
-    ht0 = _mm256_loadu_ps(ht_1);                                              \
-    r = _mm256_mul_ps(avx_act_gate_->Compute(r), ht0);                        \
-    _mm256_storeu_ps(ht, r);                                                  \
-  }                                                                           \
-  template <>                                                                 \
-  void GRUKernelImpl<float, isa, kEQ8>::ComputeHtPart2(                       \
-      float* gates, const float* ht_1, float* ht) const {                     \
-    /* not exactly equal the any implementation */                            \
-    __m256 u, s, ht0;                                                         \
-    u = _mm256_loadu_ps(gates);                                               \
-    s = _mm256_loadu_ps(gates + 16);                                          \
-    ht0 = _mm256_loadu_ps(ht_1);                                              \
-    u = avx_act_gate_->Compute(u);                                            \
-    s = _mm256_mul_ps(u, avx_act_state_->Compute(s));                         \
-    u = _mm256_sub_ps(_mm256_set1_ps(1.f), u);                                \
-    u = _mm256_mul_ps(u, ht0);                                                \
-    u = _mm256_add_ps(s, u);                                                  \
-    _mm256_storeu_ps(ht, u);                                                  \
-  }
-
-#ifdef __AVX__
-INTRI8_FLOAT(jit::avx);
-#endif
-#ifdef __AVX2__
-INTRI8_FLOAT(jit::avx2);
-#endif
-#ifdef __AVX512F__
-INTRI8_FLOAT(jit::avx512f);
-#endif
-
-#define JITKERNEL_DECLARE_GRU(ker_class, ker_dtype)                       \
-  template <>                                                             \
-  std::shared_ptr<const GRUKernel<ker_dtype>> KernelPool::Get<            \
-      GRUKernel<ker_dtype>, const std::string&, const std::string&, int>( \
-      const std::string& act_gate, const std::string& act_state, int d)
-
-#define JITKERNEL_KEY_GRU(ker_key, dtype_key) \
-  #ker_key #dtype_key + std::to_string(d) + act_gate + act_state
+#define JITKERNEL_DEFINE_NAME_GRU(ker_key, ker_class)                 \
+  template <>                                                         \
+  std::string ker_class##Impl<float>::name(const gru_attr_t& attr) {  \
+    std::string key(#ker_key "f");                                    \
+    key += (attr.act_gate + attr.act_cand);                           \
+    if (useJIT(attr.d)) {                                             \
+      /* only jit code need record d*/                                \
+      return key + "jit" + std::to_string(attr.d);                    \
+    } else if (useMKL(attr.d)) {                                      \
+      return key + "mkl";                                             \
+    } else {                                                          \
+      return key + "any";                                             \
+    }                                                                 \
+  }                                                                   \
+  template <>                                                         \
+  std::string ker_class##Impl<double>::name(const gru_attr_t& attr) { \
+    std::string key(#ker_key "d");                                    \
+    /* jit code do not support double yet*/                           \
+    if (useMKL(attr.d)) {                                             \
+      return key + "mkl";                                             \
+    } else {                                                          \
+      return key + "any";                                             \
+    }                                                                 \
+  }
+
+#define JITKERNEL_DECLARE_GRU(ker_class, ker_dtype)         \
+  template <>                                               \
+  std::shared_ptr<const ker_class<ker_dtype>>               \
+  KernelPool::Get<ker_class<ker_dtype>, const gru_attr_t&>( \
+      const gru_attr_t& attr)
+
+#define JITKERNEL_FIND_KEY_GRU(ker_class, ker_dtype) \
+  std::string key = ker_class##Impl<ker_dtype>::name(attr)
 
-#define JITKERNEL_NEW_GRU_IMPL(ker, dtype, isa, k) \
-  p = std::dynamic_pointer_cast<ker<dtype>>(       \
-      std::make_shared<ker##Impl<dtype, isa, k>>(act_gate, act_state, d));
+#define JITKERNEL_GRU_IMPL(ker, dtype)       \
+  p = std::dynamic_pointer_cast<ker<dtype>>( \
+      std::make_shared<ker##Impl<dtype>>(attr));
 
-REGISTER_JITKERNEL_ARGS_DEPRECATED(gru, GRUKernel, JITKERNEL_DECLARE_GRU,
-                                   JITKERNEL_KEY_GRU, JITKERNEL_NEW_GRU_IMPL);
+REGISTER_JITKERNEL_ARGS(gru, GRUKernel, JITKERNEL_DEFINE_NAME_GRU,
+                        JITKERNEL_DECLARE_GRU, JITKERNEL_FIND_KEY_GRU,
+                        JITKERNEL_GRU_IMPL);
 
-#undef INTRI8_FLOAT
-#undef JITKERNEL_NEW_GRU_IMPL
-#undef JITKERNEL_KEY_GRU
+#undef JITKERNEL_GRU_IMPL
+#undef JITKERNEL_FIND_KEY_GRU
 #undef JITKERNEL_DECLARE_GRU
+#undef JITKERNEL_DEFINE_NAME_GRU
 }  // namespace jitkernel
 }  // namespace math
 }  // namespace operators

From 6193dc76368f5f888d8270f938ec81b78e06ffdd Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Wed, 21 Nov 2018 23:17:26 +0800
Subject: [PATCH 045/113] test=develop

---
 CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5325e3034c..bc2ac2cd93 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -139,6 +139,10 @@ if (WIN32)
             "Disable MKL when compiling for Windows" FORCE)
     set(WITH_DISTRIBUTE OFF CACHE STRING
             "Disable DISTRIBUTE when compiling for Windows" FORCE)
+    set(WITH_C_API OFF CACHE STRING
+            "Disable C_API when compiling for Windows" FORCE)
+    set(WITH_FLUID_ONLY ON CACHE STRING
+            "Enable FLUID_ONLY when compiling for Windows" FORCE)
 endif()
 
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING

From 6eba5bd276a8d79d5611ec42db0c47273fb4950c Mon Sep 17 00:00:00 2001
From: hjchen2 <chenhoujiangcug@gmail.com>
Date: Wed, 21 Nov 2018 15:32:25 +0000
Subject: [PATCH 046/113] Fix direct copy and refine split ut test=develop

---
 .../tensorrt/convert/test_split_op.cc         | 55 ++++++++++++++-----
 .../tensorrt/plugin/split_op_plugin.cu        |  7 ++-
 2 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
index f81d011552..23909378dd 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
@@ -20,30 +20,59 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-TEST(split_op, test) {
+template <int BatchSize, int Axis>
+void TensorRTSplitTest(const std::vector<int> &in_shape,
+                       const std::vector<int> &sections) {
   std::unordered_set<std::string> parameters({""});
   framework::Scope scope;
-  TRTConvertValidation validator(10, parameters, scope, 1000);
-  validator.DeclInputVar("split_input", nvinfer1::DimsCHW(3, 2, 2));
-  validator.DeclOutputVar("split_out1", nvinfer1::DimsCHW(2, 2, 2));
-  validator.DeclOutputVar("split_out2", nvinfer1::DimsCHW(1, 2, 2));
+  TRTConvertValidation validator(BatchSize + 1, parameters, scope, 10000);
+
+  auto make_dim = [](const std::vector<int> &shape) {
+    nvinfer1::DimsCHW dim;
+    dim.c() = shape[0];
+    dim.h() = shape[1];
+    dim.w() = shape[2];
+    return dim;
+  };
+  validator.DeclInputVar("split_input", make_dim(in_shape));
+  std::vector<std::string> output_vars;
+  for (size_t i = 0; i < sections.size(); ++i) {
+    auto out_shape = in_shape;
+    out_shape[Axis - 1] = sections[i];
+    std::string output_name = "split_out" + std::to_string(i);
+    validator.DeclOutputVar(output_name, make_dim(out_shape));
+    output_vars.push_back(output_name);
+  }
 
   // Prepare Op description
   framework::OpDesc desc;
   desc.SetType("split");
   desc.SetInput("X", {"split_input"});
-  desc.SetOutput("Out", {"split_out1", "split_out2"});
+  desc.SetOutput("Out", output_vars);
 
-  int num = 0;
-  int axis = 1;
-  std::vector<int> output_lengths = {2, 1};
-  desc.SetAttr("axis", axis);
-  desc.SetAttr("num", num);
-  desc.SetAttr("sections", output_lengths);
+  desc.SetAttr("axis", Axis);
+  desc.SetAttr("num", 0);
+  desc.SetAttr("sections", sections);
 
   validator.SetOp(*desc.Proto());
 
-  validator.Execute(1);
+  validator.Execute(BatchSize);
+}
+
+TEST(split_op, test_same_shape_batch1) {
+  TensorRTSplitTest<1, 1>({4, 2, 2}, {2, 2});
+}
+
+TEST(split_op, test_different_shape_batch1) {
+  TensorRTSplitTest<1, 1>({3, 2, 2}, {2, 1});
+}
+
+TEST(split_op, test_same_shape_batch10) {
+  TensorRTSplitTest<10, 1>({4, 2, 2}, {2, 2});
+}
+
+TEST(split_op, test_different_shape_batch10) {
+  TensorRTSplitTest<10, 1>({3, 2, 2}, {2, 1});
 }
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
index 1ec0753e9f..de61ace59e 100644
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
@@ -138,11 +138,12 @@ inline void Split(cudaStream_t stream, const bool same_shape,
 int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
                          void** outputs, void* workspace, cudaStream_t stream) {
   float const* input_ptr = reinterpret_cast<float const*>(inputs[0]);
-  if (axis_ == -1 && this->getNbOutputs() < 10) {
+  if (((batchSize == 1 && axis_ == 0) || axis_ == -1) &&
+      this->getNbOutputs() < 10) {
     float** output_ptrs = reinterpret_cast<float**>(outputs);
     int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT)
-                             ? sizeof(__half)
-                             : sizeof(float);
+                             ? sizeof(float)
+                             : sizeof(__half);
     for (int i = 0; i < this->getNbOutputs(); ++i) {
       PADDLE_ENFORCE(
           cudaMemcpyAsync(

From 3912545ffec3ea5a850420f0a804afadc9f0352a Mon Sep 17 00:00:00 2001
From: sneaxiy <sneaxiy@126.com>
Date: Thu, 22 Nov 2018 04:30:19 +0000
Subject: [PATCH 047/113] add dlpack support test=develop

---
 CMakeLists.txt                               |   1 +
 cmake/external/dlpack.cmake                  |  31 +++++
 paddle/fluid/framework/CMakeLists.txt        |   3 +
 paddle/fluid/framework/dlpack_tensor.cc      | 127 +++++++++++++++++++
 paddle/fluid/framework/dlpack_tensor.h       |  45 +++++++
 paddle/fluid/framework/dlpack_tensor_test.cc | 113 +++++++++++++++++
 6 files changed, 320 insertions(+)
 create mode 100644 cmake/external/dlpack.cmake
 create mode 100644 paddle/fluid/framework/dlpack_tensor.cc
 create mode 100644 paddle/fluid/framework/dlpack_tensor.h
 create mode 100644 paddle/fluid/framework/dlpack_tensor_test.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c62cc9bfd7..b6ae241272 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -190,6 +190,7 @@ include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/cub)
 include(external/xxhash)    # download xxhash
+include(external/dlpack)
 
 if (NOT WIN32)
 # there is no official support of snappystream, warpctc, nccl, cupti in windows
diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake
new file mode 100644
index 0000000000..94d8fcc668
--- /dev/null
+++ b/cmake/external/dlpack.cmake
@@ -0,0 +1,31 @@
+include(ExternalProject)
+
+set(DLPACK_SOURCE_DIR ${THIRD_PARTY_PATH}/dlpack)
+set(DLPACK_INCLUDE_DIR ${DLPACK_SOURCE_DIR}/src/extern_dlpack/include)
+
+include_directories(${DLPACK_INCLUDE_DIR})
+
+ExternalProject_Add(
+  extern_dlpack
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  GIT_REPOSITORY "https://github.com/dmlc/dlpack.git"
+  GIT_TAG        "v0.2"
+  PREFIX         ${DLPACK_SOURCE_DIR}
+  UPDATE_COMMAND ""
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND     ""
+  INSTALL_COMMAND   ""
+  TEST_COMMAND      ""
+)
+
+if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
+  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/dlpack_dummy.c)
+  file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
+  add_library(dlpack STATIC ${dummyfile})
+else()
+  add_library(dlpack INTERFACE)
+endif()
+
+add_dependencies(dlpack extern_dlpack)
+
+LIST(APPEND externl_project_dependencies dlpack)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index cb9057672c..d7d7834b49 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -205,3 +205,6 @@ cc_test(tuple_test SRCS tuple_test.cc )
 if (NOT WIN32)
 cc_test(rw_lock_test SRCS rw_lock_test.cc)
 endif (NOT WIN32)
+
+cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack)
+cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog)
diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc
new file mode 100644
index 0000000000..04e3f78afe
--- /dev/null
+++ b/paddle/fluid/framework/dlpack_tensor.cc
@@ -0,0 +1,127 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/dlpack_tensor.h"
+
+namespace paddle {
+namespace framework {
+
+namespace internal {
+template <typename T>
+static ::DLDataType GetDLDataTypeCode() {
+  ::DLDataType dtype;
+  if (std::is_same<T, platform::float16>::value ||
+      std::is_floating_point<T>::value) {
+    dtype.code = kDLFloat;
+  } else if (std::is_unsigned<T>::value) {
+    dtype.code = kDLUInt;
+  } else if (std::is_integral<T>::value) {
+    dtype.code = kDLInt;
+  } else {
+    PADDLE_THROW("Unsupported data type %s", typeid(T).name());
+  }
+  dtype.bits = 8 * sizeof(T);
+  dtype.lanes = 1;
+  return dtype;
+}
+
+static DLDataType GetDLDataTypeFromTypeIndex(const std::type_index &type) {
+#define REG_DL_DATA_TYPE(type) \
+  { std::type_index(typeid(type)), GetDLDataTypeCode<type>() }
+  static const std::unordered_map<std::type_index, ::DLDataType>
+      type_to_dtype_map({
+          REG_DL_DATA_TYPE(platform::float16),  // NOLINT
+          REG_DL_DATA_TYPE(float),              // NOLINT
+          REG_DL_DATA_TYPE(double),             // NOLINT
+          REG_DL_DATA_TYPE(int),                // NOLINT
+          REG_DL_DATA_TYPE(int64_t),            // NOLINT
+          REG_DL_DATA_TYPE(bool),               // NOLINT
+          REG_DL_DATA_TYPE(size_t),             // NOLINT
+          REG_DL_DATA_TYPE(int16_t),            // NOLINT
+          REG_DL_DATA_TYPE(uint8_t),            // NOLINT
+          REG_DL_DATA_TYPE(int8_t)              // NOLINT
+      });
+  static auto type_to_dtype_map_end_it = type_to_dtype_map.end();
+  auto it = type_to_dtype_map.find(type);
+  PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %s",
+                 type.name());
+  return it->second;
+#undef REG_DL_DATA_TYPE
+}
+
+struct DLContextVisitor : public boost::static_visitor<::DLContext> {
+  inline ::DLContext operator()(const platform::CPUPlace &place) const {
+    DLContext ctx;
+    ctx.device_type = kDLCPU;
+    ctx.device_id = 0;
+    return ctx;
+  }
+
+  inline ::DLContext operator()(const platform::CUDAPlace &place) const {
+#ifdef PADDLE_WITH_CUDA
+    DLContext ctx;
+    ctx.device_type = kDLGPU;
+    ctx.device_id = place.device;
+    return ctx;
+#else
+    PADDLE_THROW("platform::CUDAPlace is not supported in CPU only version");
+#endif
+  }
+
+  inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const {
+#ifdef PADDLE_WITH_CUDA
+    DLContext ctx;
+    ctx.device_type = kDLCPUPinned;
+    ctx.device_id = 0;
+    return ctx;
+#else
+    PADDLE_THROW(
+        "platform::CUDAPinnedPlace is not supported in CPU only version");
+#endif
+  }
+};
+}  // namespace internal
+
+DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) {
+  // init data, data buffer
+  t_.data = const_cast<void *>(tensor.data<void>());
+
+  // init ctx, DLContext type with device_type and device_id
+  auto place = tensor.place();
+  t_.ctx = boost::apply_visitor(internal::DLContextVisitor(), place);
+
+  // init dtype
+  t_.dtype = internal::GetDLDataTypeFromTypeIndex(tensor.type());
+  t_.dtype.lanes = lanes;
+
+  // init ndim, tensor rank
+  auto &dims = tensor.dims();
+  using DimType = decltype(t_.ndim);  // int
+  t_.ndim = static_cast<DimType>(dims.size());
+
+  // init shape, tensor dims
+  t_.shape = shape_;
+  for (DimType i = 0; i < t_.ndim; ++i) {
+    t_.shape[i] = dims[i];
+  }
+
+  // init strides, nullptr means the tensor is compact
+  t_.strides = nullptr;
+
+  // init byte_offset
+  t_.byte_offset = 0;
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h
new file mode 100644
index 0000000000..0c52bce1ef
--- /dev/null
+++ b/paddle/fluid/framework/dlpack_tensor.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <dlpack/dlpack.h>
+#include "paddle/fluid/framework/tensor.h"
+
+namespace paddle {
+namespace framework {
+
+class DLPackTensor {
+ public:
+  using LaneType = decltype(::DLTensor::dtype.lanes);  // uint16_t
+  using ShapeType =
+      std::remove_reference<decltype(::DLTensor::shape[0])>::type;  // int64_t
+
+  // lanes is only used in CPU to enable vectorization
+  explicit DLPackTensor(const Tensor& tensor, LaneType lanes = 1);
+
+  inline operator const ::DLTensor&() const { return t_; }
+
+  inline operator ::DLTensor&() { return t_; }
+
+ private:
+  ::DLTensor t_;
+
+  // The shape in DLTensor is defined as int64_t*
+  // Add this member to make TVMTensor init without heap allocation
+  ShapeType shape_[9];
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc
new file mode 100644
index 0000000000..938b056350
--- /dev/null
+++ b/paddle/fluid/framework/dlpack_tensor_test.cc
@@ -0,0 +1,113 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/dlpack_tensor.h"
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <vector>
+
+namespace paddle {
+namespace framework {
+
+namespace {  // NOLINT
+template <typename T>
+constexpr uint8_t GetDLDataTypeCode() {
+  return std::is_same<platform::float16, T>::value ||
+                 std::is_floating_point<T>::value
+             ? static_cast<uint8_t>(kDLFloat)
+             : (std::is_unsigned<T>::value
+                    ? static_cast<uint8_t>(kDLUInt)
+                    : (std::is_integral<T>::value ? static_cast<uint8_t>(kDLInt)
+                                                  : static_cast<uint8_t>(-1)));
+}
+}  // NOLINT
+
+template <typename T>
+void TestMain(const platform::Place &place, uint16_t lanes) {
+  DDim dims{4, 5, 6, 7};
+  Tensor tensor;
+  tensor.Resize(dims);
+  void *p = tensor.mutable_data<T>(place);
+
+  DLPackTensor dlpack_tensor(tensor, lanes);
+  ::DLTensor &dl_tensor = dlpack_tensor;
+
+  CHECK_EQ(p, dl_tensor.data);
+  if (platform::is_cpu_place(place)) {
+    CHECK_EQ(kDLCPU, dl_tensor.ctx.device_type);
+    CHECK_EQ(0, dl_tensor.ctx.device_id);
+  } else if (platform::is_gpu_place(place)) {
+    CHECK_EQ(kDLGPU, dl_tensor.ctx.device_type);
+    CHECK_EQ(boost::get<platform::CUDAPlace>(place).device,
+             dl_tensor.ctx.device_id);
+  } else if (platform::is_cuda_pinned_place(place)) {
+    CHECK_EQ(kDLCPUPinned, dl_tensor.ctx.device_type);
+    CHECK_EQ(0, dl_tensor.ctx.device_id);
+  } else {
+    CHECK_EQ(false, true);
+  }
+
+  CHECK_EQ(dims.size(), dl_tensor.ndim);
+  for (auto i = 0; i < dims.size(); ++i) {
+    CHECK_EQ(dims[i], dl_tensor.shape[i]);
+  }
+
+  CHECK_EQ(dl_tensor.strides == nullptr, true);
+  CHECK_EQ(static_cast<uint64_t>(0), dl_tensor.byte_offset);
+
+  CHECK_EQ(lanes, dl_tensor.dtype.lanes);
+  CHECK_EQ(sizeof(T) * 8, dl_tensor.dtype.bits);
+
+  CHECK_EQ(GetDLDataTypeCode<T>(), dl_tensor.dtype.code);
+}
+
+template <typename T>
+void TestMainLoop() {
+#ifdef PADDLE_WITH_CUDA
+  std::vector<platform::Place> places{platform::CPUPlace(),
+                                      platform::CUDAPlace(0),
+                                      platform::CUDAPinnedPlace()};
+  if (platform::GetCUDADeviceCount() > 1) {
+    places.emplace_back(platform::CUDAPlace(1));
+  }
+#else
+  std::vector<platform::Place> places{platform::CPUPlace()};
+#endif
+  std::vector<uint16_t> lanes{1, 2};
+  for (auto &p : places) {
+    for (auto &l : lanes) {
+      TestMain<T>(p, l);
+    }
+  }
+}
+
+#define PADDLE_DLPACK_TEST(type) \
+  TEST(dlpack, test_##type) { TestMainLoop<type>(); }
+
+using float16 = platform::float16;
+PADDLE_DLPACK_TEST(float16);
+PADDLE_DLPACK_TEST(float);
+PADDLE_DLPACK_TEST(double);
+PADDLE_DLPACK_TEST(int);
+PADDLE_DLPACK_TEST(int64_t);
+PADDLE_DLPACK_TEST(bool);
+PADDLE_DLPACK_TEST(size_t);
+PADDLE_DLPACK_TEST(int16_t);
+PADDLE_DLPACK_TEST(uint8_t);
+PADDLE_DLPACK_TEST(int8_t);
+
+#undef PADDLE_DLPACK_TEST
+
+}  // namespace framework
+}  // namespace paddle

From 982e48922020e8d5f3ddcfc682068fcbdc5b7fe2 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Thu, 22 Nov 2018 06:26:04 +0000
Subject: [PATCH 048/113] test=develop

---
 python/paddle/fluid/layers/nn.py                   | 5 +++--
 python/paddle/fluid/tests/unittests/test_layers.py | 6 ++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 99acd7e308..32d411b830 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -2139,8 +2139,9 @@ def pool2d(input,
                           input tensor is NCHW, where N is batch size, C is
                           the number of channels, H is the height of the
                           feature, and W is the width of the feature.
-        pool_size (int): The side length of pooling windows. All pooling
-                         windows are squares with pool_size on a side.
+        pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple,
+            it must contain two integers, (pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be a square of an int.
         pool_type: ${pooling_type_comment}
         pool_stride (int): stride of the pooling layer.
         pool_padding (int): padding size.
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index a8fa5436c4..c4310fe006 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -202,6 +202,12 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(layers.sequence_unpad(x=x, length=length))
         print(str(program))
 
+    def test_pool2d(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[3, 224, 224], dtype='float32')
+            self.assertIsNotNone(layers.pool2d(x, pool_size=[5, 3]))
+
     def test_lstm_unit(self):
         program = Program()
         with program_guard(program):

From 1adda8e06c075d55edcc6aa50804eab62b903f72 Mon Sep 17 00:00:00 2001
From: hjchen2 <chenhoujiangcug@gmail.com>
Date: Thu, 22 Nov 2018 06:53:16 +0000
Subject: [PATCH 049/113] Add more unit tests for split plugin test=develop

---
 .../inference/tensorrt/convert/split_op.cc    | 13 ++---
 .../tensorrt/convert/test_split_op.cc         | 47 ++++++++++++++++---
 2 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc
index 871354267e..ae5b1b9806 100644
--- a/paddle/fluid/inference/tensorrt/convert/split_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc
@@ -19,9 +19,6 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-/*
- * SplitOp.
- */
 class SplitOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
@@ -40,15 +37,11 @@ class SplitOpConverter : public OpConverter {
     int axis = boost::get<int>(op_desc.GetAttr("axis"));
     std::vector<int> output_lengths =
         boost::get<std::vector<int>>(op_desc.GetAttr("sections"));
-    // PADDLE_ENFORCE(axis != 0);
-    if (axis < 0) {
-      axis += input_dims.nbDims;
-    } else {
-      axis -= 1;
-    }
+    // split on batch is not supported in TensorRT
+    PADDLE_ENFORCE(axis != 0);
+    axis += (axis < 0) ? input_dims.nbDims : -1;
 
     PADDLE_ENFORCE(output_lengths.size() == output_num);
-    //
     plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths);
     nvinfer1::IPluginLayer* layer =
         engine_->AddPlugin(&input, input_num, plugin);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
index 23909378dd..5aacc5c600 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
@@ -59,21 +59,54 @@ void TensorRTSplitTest(const std::vector<int> &in_shape,
   validator.Execute(BatchSize);
 }
 
-TEST(split_op, test_same_shape_batch1) {
+// batch = 0, axis = 1, same shape
+TEST(split_op, test_same_shape_axis1_batch1) {
   TensorRTSplitTest<1, 1>({4, 2, 2}, {2, 2});
 }
-
-TEST(split_op, test_different_shape_batch1) {
+// batch = 0, axis = 1, different shape
+TEST(split_op, test_different_shape_axis1_batch1) {
   TensorRTSplitTest<1, 1>({3, 2, 2}, {2, 1});
 }
-
-TEST(split_op, test_same_shape_batch10) {
+// batch = 10, axis = 1, same shape
+TEST(split_op, test_same_shape_axis1_batch10) {
   TensorRTSplitTest<10, 1>({4, 2, 2}, {2, 2});
 }
-
-TEST(split_op, test_different_shape_batch10) {
+// batch = 10, axis = 1, different shape
+TEST(split_op, test_different_shape_axis1_batch10) {
   TensorRTSplitTest<10, 1>({3, 2, 2}, {2, 1});
 }
+// batch = 0, axis = 2, same shape
+TEST(split_op, test_same_shape_axis2_batch1) {
+  TensorRTSplitTest<1, 2>({3, 4, 2}, {2, 2});
+}
+// batch = 0, axis = 2, different shape
+TEST(split_op, test_different_shape_axis2_batch1) {
+  TensorRTSplitTest<1, 2>({3, 3, 2}, {2, 1});
+}
+// batch = 10, axis = 2, same shape
+TEST(split_op, test_same_shape_axis2_batch10) {
+  TensorRTSplitTest<10, 2>({3, 4, 2}, {2, 2});
+}
+// batch = 10, axis = 2, different shape
+TEST(split_op, test_different_shape_axis2_batch10) {
+  TensorRTSplitTest<10, 2>({3, 3, 2}, {2, 1});
+}
+// batch = 0, axis = 3, same shape
+TEST(split_op, test_same_shape_axis3_batch1) {
+  TensorRTSplitTest<1, 3>({3, 2, 4}, {2, 2});
+}
+// batch = 0, axis = 3, different shape
+TEST(split_op, test_different_shape_axis3_batch1) {
+  TensorRTSplitTest<1, 3>({3, 2, 3}, {2, 1});
+}
+// batch = 10, axis = 3, same shape
+TEST(split_op, test_same_shape_axis3_batch10) {
+  TensorRTSplitTest<10, 3>({3, 2, 4}, {2, 2});
+}
+// batch = 10, axis = 3, different shape
+TEST(split_op, test_different_shape_axis3_batch10) {
+  TensorRTSplitTest<10, 3>({3, 2, 3}, {2, 1});
+}
 
 }  // namespace tensorrt
 }  // namespace inference

From e3b61cf52b88b1350de8776afcfd8e5ae348e164 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Thu, 22 Nov 2018 08:24:01 +0000
Subject: [PATCH 050/113] init gru jitcode and fix lstm jitcode

test=develop
---
 paddle/fluid/operators/math/jit_code.cc       |  36 ++++-
 paddle/fluid/operators/math/jit_code.h        | 140 ++++++++++++++----
 paddle/fluid/operators/math/jit_kernel_rnn.cc |  36 ++++-
 3 files changed, 170 insertions(+), 42 deletions(-)

diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc
index ccc9206f5c..03b67238fe 100644
--- a/paddle/fluid/operators/math/jit_code.cc
+++ b/paddle/fluid/operators/math/jit_code.cc
@@ -214,6 +214,9 @@ void VActJitCode::generate() {
 bool LSTMJitCode::init(int d) { return MayIUse(avx) && d % 8 == 0; }
 
 void LSTMJitCode::generate() {
+  if (use_peephole_) {
+    preCode();
+  }
   reg64_t reg_ptr_gates = rax;
   reg64_t reg_ptr_ct_1 = r9;
   reg64_t reg_ptr_ct = r10;
@@ -224,18 +227,19 @@ void LSTMJitCode::generate() {
   mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]);
 
   int offset = 0;
+  int d = num_ * sizeof(float);
   for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
     /* C_t = C_t-1 * fgated + cand_gated * igated*/
     // c
     vmovups(ymm_src, ptr[reg_ptr_gates + offset]);
     act<ymm_t>(ymm_c, ymm_src, act_cand_);
     // i
-    vmovups(ymm_src, ptr[reg_ptr_gates + offset + num_]);
+    vmovups(ymm_src, ptr[reg_ptr_gates + offset + d]);
     act<ymm_t>(ymm_i, ymm_src, act_gate_);
     vmulps(ymm_c, ymm_c, ymm_i);
     if (!compute_c1h1_) {
       // f
-      vmovups(ymm_src, ptr[reg_ptr_gates + offset + 2 * num_]);
+      vmovups(ymm_src, ptr[reg_ptr_gates + offset + 2 * d]);
       act<ymm_t>(ymm_f, ymm_src, act_gate_);
       vmovups(ymm_i, ptr[reg_ptr_ct_1 + offset]);
       vmulps(ymm_f, ymm_f, ymm_i);
@@ -245,20 +249,36 @@ void LSTMJitCode::generate() {
     ymm_t ymm_ct = compute_c1h1_ ? ymm_c : ymm_f;
     ymm_t ymm_o = compute_c1h1_ ? ymm_f : ymm_c;
     ymm_t ymm_tmp = ymm_i;
+    vmovups(ptr[reg_ptr_ct + offset], ymm_ct);  // save ct
     act<ymm_t>(ymm_tmp, ymm_ct, act_cell_);
-    vmovups(ymm_src, ptr[reg_ptr_gates + offset + 3 * num_]);
+    vmovups(ymm_src, ptr[reg_ptr_gates + offset + 3 * d]);
     act<ymm_t>(ymm_o, ymm_src, act_gate_);
     vmulps(ymm_o, ymm_tmp, ymm_o);
-    // save ct and ht
-    vmovups(ptr[reg_ptr_ct + offset], ymm_ct);
-    vmovups(ptr[reg_ptr_ht + offset], ymm_o);
-
+    vmovups(ptr[reg_ptr_ht + offset], ymm_o);  // save ht
     offset += sizeof(float) * YMM_FLOAT_BLOCK;
   }
 
-  ret();
+  if (use_peephole_) {
+    postCode();
+  } else {
+    ret();
+  }
 }
 
+bool GRUJitCode::init(int d) { return MayIUse(avx) && d % 8 == 0; }
+
+void GRUJitCode::generate() {
+  reg64_t reg_ptr_gates = rax;
+  reg64_t reg_ptr_ct_1 = r9;
+  reg64_t reg_ptr_ct = r10;
+  reg64_t reg_ptr_ht = r11;
+  mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]);
+  mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]);
+  mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]);
+  mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]);
+
+  ret();
+}
 }  // namespace gen
 }  // namespace jitkernel
 }  // namespace math
diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h
index bf28d444b7..403cea3991 100644
--- a/paddle/fluid/operators/math/jit_code.h
+++ b/paddle/fluid/operators/math/jit_code.h
@@ -302,6 +302,34 @@ class VActJitCode : public JitCode {
     pop(reg_ptr_global);
   }
 
+  template <typename JMM>
+  void act(JMM& dst, JMM& src, operand_type type) {  // NOLINT
+    // use 15
+    JMM zero = JMM(15);
+    if (type_ == operand_type::relu) {
+      vxorps(zero, zero, zero);
+    }
+    switch (type) {
+      case operand_type::relu:
+        relu_jmm<JMM>(dst, src, zero);
+        break;
+      case operand_type::exp:
+        exp_jmm<JMM>(dst, src, 2, 3, 4, 5);
+        break;
+      case operand_type::sigmoid:
+        sigmoid_jmm<JMM>(dst, src, 2, 3, 4, 5);
+        break;
+      case operand_type::tanh:
+        tanh_jmm<JMM>(dst, src, 2, 3, 4, 5);
+        break;
+      case operand_type::identity:
+        break;
+      default:
+        // throw error
+        break;
+    }
+  }
+
  protected:
   int num_;
   operand_type type_;
@@ -386,44 +414,94 @@ class LSTMJitCode : public VActJitCode {
   operand_type act_cand_;
   operand_type act_cell_;
   reg64_t param1{abi_param1};
-
   xmm_t xmm_src = xmm_t(0);
   xmm_t xmm_c = xmm_t(1);
-  xmm_t xmm_i = xmm_t(2);
-  xmm_t xmm_f = xmm_t(3);
+  xmm_t xmm_i = xmm_t(6);
+  xmm_t xmm_f = xmm_t(7);
 
   ymm_t ymm_src = ymm_t(0);
-  ymm_t ymm_c = ymm_t(1);
-  ymm_t ymm_i = ymm_t(2);
-  ymm_t ymm_f = ymm_t(3);
+  ymm_t ymm_c = ymm_t(1);  // 2~5 for act
+  ymm_t ymm_i = ymm_t(6);
+  ymm_t ymm_f = ymm_t(7);
+};
 
-  template <typename JMM>
-  void act(JMM& dst, JMM& src, operand_type type) {  // NOLINT
-    // use 15
-    JMM zero = JMM(15);
-    if (type_ == operand_type::relu) {
-      vxorps(zero, zero, zero);
-    }
-    switch (type) {
-      case operand_type::relu:
-        relu_jmm<JMM>(dst, src, zero);
-        break;
-      case operand_type::exp:
-        exp_jmm<JMM>(dst, src, 2, 3, 4, 5);
-        break;
-      case operand_type::sigmoid:
-        sigmoid_jmm<JMM>(dst, src, 2, 3, 4, 5);
-        break;
-      case operand_type::tanh:
-        tanh_jmm<JMM>(dst, src, 2, 3, 4, 5);
-        break;
-      case operand_type::identity:
-        break;
-      default:
-        // throw error
-        break;
+class GRUJitCode : public VActJitCode {
+ public:
+  const char* name() const override {
+    std::string base = "GRUJitCode";
+    if (id_ == 0) {
+      base += "_H1";
+    } else if (id_ == 1) {
+      base += "_HtPart1";
+    } else if (id_ == 2) {
+      base += "_HtPart2";
     }
+    auto AddTypeStr = [&](operand_type type) {
+      switch (type) {
+        case operand_type::relu:
+          base += "_Relu";
+          break;
+        case operand_type::exp:
+          base += "_Exp";
+          break;
+        case operand_type::sigmoid:
+          base += "_Sigmoid";
+          break;
+        case operand_type::tanh:
+          base += "_Tanh";
+          break;
+        case operand_type::identity:
+          base += "_Identity";
+          break;
+        default:
+          break;
+      }
+    };
+    AddTypeStr(act_gate_);
+    AddTypeStr(act_cand_);
+    return base.c_str();
   }
+
+  explicit GRUJitCode(int id, const gru_attr_t& attr,
+                      size_t code_size = 256 * 1024, void* code_ptr = nullptr)
+      : VActJitCode(attr.d, operand_type::sigmoid /* this is bugy*/, code_size,
+                    code_ptr),
+        id_(id) {
+    auto typeExchange = [](const std::string& type) -> gen::operand_type {
+      if (type == "sigmoid") {
+        return operand_type::sigmoid;
+      } else if (type == "relu") {
+        return operand_type::relu;
+      } else if (type == "tanh") {
+        return operand_type::tanh;
+      } else if (type == "identity" || type == "") {
+        return operand_type::identity;
+      }  // else throw error
+      return operand_type::identity;
+    };
+    num_ = attr.d;
+    act_gate_ = typeExchange(attr.act_gate);
+    act_cand_ = typeExchange(attr.act_cand);
+  }
+  static bool init(int d);
+  void generate() override;
+
+ protected:
+  int id_;
+  int num_;
+  operand_type act_gate_;
+  operand_type act_cand_;
+  reg64_t param1{abi_param1};
+
+  xmm_t xmm_src = xmm_t(0);
+  xmm_t xmm_c = xmm_t(1);
+  xmm_t xmm_i = xmm_t(6);
+  xmm_t xmm_f = xmm_t(7);
+
+  ymm_t ymm_src = ymm_t(0);
+  ymm_t ymm_c = ymm_t(1);
+  ymm_t ymm_i = ymm_t(6);
+  ymm_t ymm_f = ymm_t(7);
 };
 
 #ifdef PADDLE_WITH_MKLDNN
diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc
index dbfd212e6e..e571d8adf4 100644
--- a/paddle/fluid/operators/math/jit_kernel_rnn.cc
+++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc
@@ -40,7 +40,7 @@ class LSTMKernelImpl : public LSTMKernel<T> {
   explicit LSTMKernelImpl(const lstm_attr_t& attr) : LSTMKernel<T>() {
 #ifdef PADDLE_WITH_XBYAK
     if (useJIT(attr.d)) {
-      size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 84 * 8;  // should change
+      size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8;
       jitcode0_.reset(new gen::LSTMJitCode(false, attr, sz > 4096 ? sz : 4096));
       this->ComputeCtHt =
           jitcode0_->getCode<void (*)(lstm_t*, const lstm_attr_t*)>();
@@ -66,7 +66,7 @@ class LSTMKernelImpl : public LSTMKernel<T> {
 #ifdef PADDLE_WITH_XBYAK
 template <>
 bool LSTMKernelImpl<float>::useJIT(int d) {
-  return false;  // not ready yet gen::LSTMJitCode::init(d);
+  return gen::LSTMJitCode::init(d);
 }
 #endif
 
@@ -82,7 +82,7 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
   explicit PeepholeKernelImpl(const lstm_attr_t& attr) : LSTMKernel<T>() {
 #ifdef PADDLE_WITH_XBYAK
     if (useJIT(attr.d)) {
-      size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 84 * 8;  // should change
+      size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 4 * 8;
       jitcode0_.reset(new gen::LSTMJitCode(false, attr, sz > 4096 ? sz : 4096));
       this->ComputeCtHt =
           jitcode0_->getCode<void (*)(lstm_t*, const lstm_attr_t*)>();
@@ -175,12 +175,42 @@ class GRUKernelImpl : public GRUKernel<T> {
   static inline bool useJIT(int d) { return false; }
   static inline bool useMKL(int d) { return false; }
   explicit GRUKernelImpl(const gru_attr_t& attr) : GRUKernel<T>() {
+#ifdef PADDLE_WITH_XBYAK
+    if (useJIT(attr.d)) {
+      size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 84 * 8;  // should change
+      jitcode0_.reset(new gen::GRUJitCode(0, attr, sz > 4096 ? sz : 4096));
+      this->ComputeH1 =
+          jitcode0_->getCode<void (*)(gru_t*, const gru_attr_t*)>();
+
+      jitcode1_.reset(new gen::GRUJitCode(1, attr, sz > 4096 ? sz : 4096));
+      this->ComputeHtPart1 =
+          jitcode1_->getCode<void (*)(gru_t*, const gru_attr_t*)>();
+
+      jitcode2_.reset(new gen::GRUJitCode(2, attr, sz > 4096 ? sz : 4096));
+      this->ComputeHtPart2 =
+          jitcode1_->getCode<void (*)(gru_t*, const gru_attr_t*)>();
+      return;
+    }
+#endif
     this->ComputeH1 = refer::GRUH1<T>;
     this->ComputeHtPart1 = refer::GRUHtPart1<T>;
     this->ComputeHtPart2 = refer::GRUHtPart2<T>;
   }
+#ifdef PADDLE_WITH_XBYAK
+
+ private:
+  std::unique_ptr<gen::GRUJitCode> jitcode0_{nullptr}, jitcode1_{nullptr},
+      jitcode2_{nullptr};
+#endif
 };
 
+#ifdef PADDLE_WITH_XBYAK
+template <>
+bool GRUKernelImpl<float>::useJIT(int d) {
+  return false;  // jitcode not ready yet
+}
+#endif
+
 #define JITKERNEL_DEFINE_NAME_GRU(ker_key, ker_class)                 \
   template <>                                                         \
   std::string ker_class##Impl<float>::name(const gru_attr_t& attr) {  \

From 510601b2793047858763032b7816af07ab2b2bc7 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Thu, 22 Nov 2018 09:01:08 +0000
Subject: [PATCH 051/113] test=develop

---
 python/paddle/fluid/layers/nn.py                   | 10 +++++++---
 python/paddle/fluid/tests/unittests/test_layers.py |  7 ++++++-
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 32d411b830..27f83a60bd 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -2139,12 +2139,16 @@ def pool2d(input,
                           input tensor is NCHW, where N is batch size, C is
                           the number of channels, H is the height of the
                           feature, and W is the width of the feature.
-        pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple,
+        pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
             it must contain two integers, (pool_size_Height, pool_size_Width).
             Otherwise, the pool kernel size will be a square of an int.
         pool_type: ${pooling_type_comment}
-        pool_stride (int): stride of the pooling layer.
-        pool_padding (int): padding size.
+        pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain two integers, (pool_stride_Height, pool_stride_Width).
+            Otherwise, the pool stride size will be a square of an int.
+        pool_padding (int|list|tuple): The pool padding size. If pool padding size is a tuple,
+            it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width).
+            Otherwise, the pool padding size will be a square of an int.
         global_pooling (bool): ${global_pooling_comment}
         use_cudnn (bool): ${use_cudnn_comment}
         ceil_mode (bool): ${ceil_mode_comment}
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index c4310fe006..559c9cda48 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -206,7 +206,12 @@ class TestBook(unittest.TestCase):
         program = Program()
         with program_guard(program):
             x = layers.data(name='x', shape=[3, 224, 224], dtype='float32')
-            self.assertIsNotNone(layers.pool2d(x, pool_size=[5, 3]))
+            self.assertIsNotNone(
+                layers.pool2d(
+                    x,
+                    pool_size=[5, 3],
+                    pool_stride=[1, 2],
+                    pool_padding=(2, 1)))
 
     def test_lstm_unit(self):
         program = Program()

From 0c5ed5f6fc2f7d7a8936c70d2005cf3e85c23df6 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Thu, 22 Nov 2018 10:04:10 +0000
Subject: [PATCH 052/113] enable peephole jitcode

test=develop
---
 paddle/fluid/operators/math/jit_code.cc       | 28 +++++++++++++++++--
 paddle/fluid/operators/math/jit_kernel_rnn.cc |  2 +-
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc
index 03b67238fe..95247ce309 100644
--- a/paddle/fluid/operators/math/jit_code.cc
+++ b/paddle/fluid/operators/math/jit_code.cc
@@ -221,10 +221,14 @@ void LSTMJitCode::generate() {
   reg64_t reg_ptr_ct_1 = r9;
   reg64_t reg_ptr_ct = r10;
   reg64_t reg_ptr_ht = r11;
+  reg64_t reg_ptr_wp = r12;
   mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]);
   mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]);
   mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]);
   mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]);
+  if (use_peephole_) {
+    mov(reg_ptr_wp, ptr[param1 + offsetof(lstm_t, wp)]);
+  }
 
   int offset = 0;
   int d = num_ * sizeof(float);
@@ -235,13 +239,27 @@ void LSTMJitCode::generate() {
     act<ymm_t>(ymm_c, ymm_src, act_cand_);
     // i
     vmovups(ymm_src, ptr[reg_ptr_gates + offset + d]);
+    if (!compute_c1h1_ && use_peephole_) {
+      ymm_t ymm_wp = ymm_t(2);
+      ymm_t ymm_ct_1 = ymm_t(3);
+      vmovups(ymm_wp, ptr[reg_ptr_wp + offset]);
+      vmovups(ymm_ct_1, ptr[reg_ptr_ct_1 + offset]);
+      vmulps(ymm_wp, ymm_ct_1, ymm_wp);
+      vaddps(ymm_src, ymm_src, ymm_wp);
+    }
     act<ymm_t>(ymm_i, ymm_src, act_gate_);
     vmulps(ymm_c, ymm_c, ymm_i);
     if (!compute_c1h1_) {
       // f
       vmovups(ymm_src, ptr[reg_ptr_gates + offset + 2 * d]);
-      act<ymm_t>(ymm_f, ymm_src, act_gate_);
       vmovups(ymm_i, ptr[reg_ptr_ct_1 + offset]);
+      if (use_peephole_) {
+        ymm_t ymm_wp = ymm_t(3);
+        vmovups(ymm_wp, ptr[reg_ptr_wp + offset + d]);
+        vmulps(ymm_wp, ymm_i, ymm_wp);
+        vaddps(ymm_src, ymm_src, ymm_wp);
+      }
+      act<ymm_t>(ymm_f, ymm_src, act_gate_);
       vmulps(ymm_f, ymm_f, ymm_i);
       vaddps(ymm_f, ymm_f, ymm_c);
     }
@@ -250,8 +268,14 @@ void LSTMJitCode::generate() {
     ymm_t ymm_o = compute_c1h1_ ? ymm_f : ymm_c;
     ymm_t ymm_tmp = ymm_i;
     vmovups(ptr[reg_ptr_ct + offset], ymm_ct);  // save ct
-    act<ymm_t>(ymm_tmp, ymm_ct, act_cell_);
     vmovups(ymm_src, ptr[reg_ptr_gates + offset + 3 * d]);
+    if (use_peephole_) {
+      ymm_t ymm_wp = ymm_t(2);
+      vmovups(ymm_wp, ptr[reg_ptr_wp + offset + d * 2]);
+      vmulps(ymm_wp, ymm_ct, ymm_wp);
+      vaddps(ymm_src, ymm_src, ymm_wp);
+    }
+    act<ymm_t>(ymm_tmp, ymm_ct, act_cell_);
     act<ymm_t>(ymm_o, ymm_src, act_gate_);
     vmulps(ymm_o, ymm_tmp, ymm_o);
     vmovups(ptr[reg_ptr_ht + offset], ymm_o);  // save ht
diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc
index e571d8adf4..85ea95cfcc 100644
--- a/paddle/fluid/operators/math/jit_kernel_rnn.cc
+++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc
@@ -108,7 +108,7 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
 #ifdef PADDLE_WITH_XBYAK
 template <>
 bool PeepholeKernelImpl<float>::useJIT(int d) {
-  return false;  // peephole jitcode not ready yet
+  return gen::LSTMJitCode::init(d);
 }
 #endif
 

From 83370576cd8f35e4155d94a789c886c8c264056d Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Thu, 22 Nov 2018 18:52:54 +0800
Subject: [PATCH 053/113] Add sqlite3 support in Python3.6

test=develop
---
 tools/manylinux1/build_scripts/build_utils.sh | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/tools/manylinux1/build_scripts/build_utils.sh b/tools/manylinux1/build_scripts/build_utils.sh
index d97745ad2d..48cce15a14 100755
--- a/tools/manylinux1/build_scripts/build_utils.sh
+++ b/tools/manylinux1/build_scripts/build_utils.sh
@@ -50,6 +50,15 @@ function do_cpython_build {
     mkdir -p ${prefix}/lib
     # -Wformat added for https://bugs.python.org/issue17547 on Python 2.6
 
+    if [ $(lex_pyver $py_ver) -eq $(lex_pyver 3.6) ]; then
+        wget https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz
+        tar -zxf sqlite-autoconf-3250300.tar.gz
+        cd sqlite-autoconf-3250300
+        ./configure --prefix=/usr/local
+        make -j8 && make install
+        cd ../ && rm sqlite-autoconf-3250300.tar.gz
+    fi
+
     # NOTE --enable-shared for generating libpython shared library needed for
     # linking of some of the nupic.core test executables.
     if [ $(lex_pyver $py_ver) -ge $(lex_pyver 3.7) ]; then
@@ -59,9 +68,9 @@ function do_cpython_build {
         make -j8 > /dev/null
         make altinstall > /dev/null
     else
-        CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null
-        make -j8 > /dev/null
-        make install > /dev/null
+        LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null
+        LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} make -j8 > /dev/null
+        LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} make install > /dev/null
     fi
     popd
     echo "ZZZ looking for libpython"

From 7c8c9dc9bf441ee3360ec416fd71dbf5921ba391 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Thu, 22 Nov 2018 19:15:47 +0800
Subject: [PATCH 054/113] fix unit test cases

---
 cmake/generic.cmake                           | 11 ++++++--
 .../framework/details/all_reduce_op_handle.cc |  4 +--
 .../framework/details/all_reduce_op_handle.h  |  6 ++---
 .../framework/details/broadcast_op_handle.cc  |  2 +-
 .../framework/details/broadcast_op_handle.h   |  6 ++---
 .../details/broadcast_op_handle_test.h        | 12 ++++-----
 .../fluid/framework/details/build_strategy.cc |  4 +--
 .../fluid/framework/details/build_strategy.h  |  4 +--
 .../details/data_balance_op_handle.cc         |  2 +-
 .../details/data_balance_op_handle.h          |  4 +--
 .../details/fused_broadcast_op_handle.h       |  4 +--
 .../details/fused_broadcast_op_handle_test.cc |  4 +--
 .../details/multi_devices_graph_pass.cc       | 16 +++++------
 .../details/multi_devices_graph_pass.h        |  2 +-
 .../framework/details/reduce_op_handle.cc     |  2 +-
 .../framework/details/reduce_op_handle.h      |  4 +--
 .../details/reduce_op_handle_test.cc          | 12 ++++-----
 .../fluid/framework/ir/is_test_pass_tester.cc |  5 +++-
 .../inference/analysis/analyzer_tester.cc     |  3 ++-
 paddle/fluid/inference/api/helper.h           |  5 +---
 .../inference/tests/api/anakin_rnn1_tester.cc |  1 -
 .../tests/book/test_inference_nlp.cc          |  1 -
 paddle/fluid/inference/tests/test_helper.h    |  1 +
 paddle/fluid/operators/beam_search_op_test.cc | 16 +++++------
 .../operators/distributed/grpc_client.cc      |  2 +-
 .../fluid/operators/distributed/grpc_serde.cc |  2 +-
 .../fluid/operators/distributed/grpc_serde.h  |  3 ++-
 .../operators/distributed/sendrecvop_utils.cc |  2 +-
 .../operators/distributed/sendrecvop_utils.h  |  2 +-
 paddle/fluid/operators/math/cpu_vec_test.cc   |  2 +-
 paddle/fluid/operators/math/im2col_test.cc    |  2 +-
 .../fluid/operators/math/jit_kernel_test.cc   |  2 +-
 paddle/fluid/platform/cudnn_helper.h          |  2 +-
 paddle/fluid/platform/dynload/cudnn.h         | 14 +++++-----
 paddle/fluid/platform/gpu_info.cc             |  5 +++-
 .../fluid/platform/stream_callback_manager.h  |  2 +-
 paddle/legacy/cuda/include/hl_warpctc_wrap.h  |  3 ++-
 paddle/legacy/cuda/src/hl_cuda_device.cc      |  4 +++
 paddle/legacy/utils/ThreadLocal.h             |  4 ++-
 paddle/legacy/utils/Util.h                    | 27 +++++++++++++++++++
 paddle/testing/CMakeLists.txt                 |  6 +++--
 python/paddle/fluid/metrics.py                |  4 +--
 .../fluid/tests/unittests/CMakeLists.txt      |  8 +++---
 43 files changed, 138 insertions(+), 89 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 111627a932..cabef3f713 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -349,10 +349,17 @@ function(cc_test TARGET_NAME)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS ARGS)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    if(WIN32)
+      list(APPEND win32_deps shlwapi)
+      if("${cc_test_DEPS};" MATCHES "python;")
+        list(REMOVE_ITEM cc_test_DEPS python)
+        list(APPEND win32_deps ${PYTHON_LIBRARIES})
+      endif()
+    endif(WIN32)
     add_executable(${TARGET_NAME} ${cc_test_SRCS})
     target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
     if(WIN32)
-      target_link_libraries(${TARGET_NAME} shlwapi)
+      target_link_libraries(${TARGET_NAME} ${win32_deps})
     endif(WIN32)
     add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
     add_test(NAME ${TARGET_NAME}
@@ -679,7 +686,7 @@ function(py_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS ARGS ENVS)
     cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
-             COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
+             COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
              FLAGS_cpu_deterministic=true
              PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
              ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index b869015676..a003995ae3 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -23,7 +23,7 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                      const std::vector<Scope *> &local_scopes,
                                      const std::vector<platform::Place> &places,
@@ -74,7 +74,7 @@ void AllReduceOpHandle::RunImpl() {
     }
 
     if (platform::is_gpu_place(lod_tensors[0]->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
       int dtype = -1;
       size_t numel = 0;
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h
index f6ef3a1367..b449796fca 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.h
@@ -20,7 +20,7 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
@@ -29,7 +29,7 @@ namespace framework {
 namespace details {
 
 struct AllReduceOpHandle : public OpHandleBase {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places,
                     const platform::NCCLContextMap *ctxs);
@@ -49,7 +49,7 @@ struct AllReduceOpHandle : public OpHandleBase {
  private:
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   const platform::NCCLContextMap *nccl_ctxs_;
 #endif
 };
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index 8e5e542765..d98df3bbad 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -82,7 +82,7 @@ void BroadcastOpHandle::BroadcastOneVar(
       });
     }
   } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
     VarHandle *out_handle = nullptr;
     int root_id = boost::get<platform::CUDAPlace>(in_tensor.place()).device;
     std::vector<std::function<void()>> broadcast_calls;
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
index 72180fac86..0c75e05f86 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -24,7 +24,7 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
@@ -34,7 +34,7 @@ namespace details {
 
 struct BroadcastOpHandle : public OpHandleBase {
  public:
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places,
                     const platform::NCCLContextMap *nccl_ctxs)
@@ -68,7 +68,7 @@ struct BroadcastOpHandle : public OpHandleBase {
 
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   const platform::NCCLContextMap *nccl_ctxs_;
 #endif
 
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h
index 4305eb6573..df3b3cc9ca 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h
@@ -42,7 +42,7 @@ struct TestBroadcastOpHandle {
   std::vector<std::unique_ptr<ir::Node>> nodes_;
   std::vector<p::Place> place_list_;
   bool use_gpu_;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
 
@@ -50,7 +50,7 @@ struct TestBroadcastOpHandle {
     for (size_t j = 0; j < ctxs_.size(); ++j) {
       ctxs_[j]->Wait();
     }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
     if (nccl_ctxs_) {
       nccl_ctxs_->WaitAll();
     }
@@ -60,7 +60,7 @@ struct TestBroadcastOpHandle {
   void InitCtxOnGpu(bool use_gpu) {
     use_gpu_ = use_gpu;
     if (use_gpu_) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       int count = p::GetCUDADeviceCount();
       if (count <= 1) {
         LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
@@ -84,7 +84,7 @@ struct TestBroadcastOpHandle {
         place_list_.push_back(p);
         ctxs_.emplace_back(new p::CPUDeviceContext(p));
       }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       nccl_ctxs_.reset(nullptr);
 #endif
     }
@@ -106,14 +106,14 @@ struct TestBroadcastOpHandle {
     nodes_.emplace_back(
         ir::CreateNodeForTest("node0", ir::Node::Type::kOperation));
     if (use_gpu_) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                          place_list_, nccl_ctxs_.get());
 #else
       PADDLE_THROW("CUDA is not support.");
 #endif
     } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                          place_list_, nccl_ctxs_.get());
 #else
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 37202f8695..70baced0ad 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -96,7 +96,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &param_names,
     const std::vector<Scope *> &local_scopes,
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
     const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const {
 #else
     const bool use_cuda) const {
@@ -118,7 +118,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
       pass->Erase("local_scopes");
       pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
                                                     &local_scopes);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
       pass->Erase("nccl_ctxs");
       pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index fc2641dbd4..3236c35efd 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -23,7 +23,7 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
@@ -98,7 +98,7 @@ struct BuildStrategy {
       const std::string &loss_var_name,
       const std::unordered_set<std::string> &param_names,
       const std::vector<Scope *> &local_scopes,
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const;
 #else
       const bool use_cuda) const;
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc
index 0b772f9b63..cc562c7b10 100644
--- a/paddle/fluid/framework/details/data_balance_op_handle.cc
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
@@ -20,7 +20,7 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 DataBalanceOpHandle::DataBalanceOpHandle(
     ir::Node *node, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places,
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.h b/paddle/fluid/framework/details/data_balance_op_handle.h
index 0462fb6ec7..2db18a1a72 100644
--- a/paddle/fluid/framework/details/data_balance_op_handle.h
+++ b/paddle/fluid/framework/details/data_balance_op_handle.h
@@ -19,7 +19,7 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
@@ -29,7 +29,7 @@ namespace details {
 
 struct DataBalanceOpHandle : public OpHandleBase {
  public:
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                       const std::vector<platform::Place> &places,
                       const platform::NCCLContextMap *ctxs);
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
index e37259526a..e43d545c9c 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
@@ -25,7 +25,7 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
@@ -35,7 +35,7 @@ namespace details {
 
 struct FusedBroadcastOpHandle : public BroadcastOpHandle {
  public:
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   FusedBroadcastOpHandle(ir::Node *node,
                          const std::vector<Scope *> local_scopes,
                          const std::vector<platform::Place> &places,
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
index 541993c743..be0d941c4f 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
@@ -44,14 +44,14 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
     nodes_.emplace_back(
         ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation));
     if (use_gpu_) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       op_handle_ = new FusedBroadcastOpHandle(
           nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else
       PADDLE_THROW("CUDA is not supported.");
 #endif
     } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       op_handle_ = new FusedBroadcastOpHandle(
           nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 8c98b78130..26666212ae 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -142,7 +142,7 @@ void MultiDevSSAGraphBuilder::Init() const {
   places_ = Get<const std::vector<platform::Place>>(kPlaces);
   local_scopes_ = Get<const std::vector<Scope *>>(kLocalScopes);
   strategy_ = Get<const BuildStrategy>(kStrategy);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   nccl_ctxs_ = &Get<platform::NCCLContextMap>("nccl_ctxs");
 #endif
 
@@ -431,7 +431,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
     }
   }
   bool use_gpu = false;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   use_gpu = nccl_ctxs_ != nullptr;
 #endif
 
@@ -478,7 +478,7 @@ bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const {
 
 void MultiDevSSAGraphBuilder::SetCommunicationContext(
     OpHandleBase *op_handle, const platform::Place &p) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   if (nccl_ctxs_ == nullptr) {
     op_handle->SetDeviceContext(p,
                                 platform::DeviceContextPool::Instance().Get(p));
@@ -492,7 +492,7 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext(
 void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result,
                                                 const std::string &p_name,
                                                 size_t src_dev_id) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   auto *op_handle = new BroadcastOpHandle(
       result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_);
@@ -522,7 +522,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result,
 void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp(
     ir::Graph *result,
     const std::vector<std::unordered_set<std::string>> &bcast_varnames) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   auto *op_handle = new FusedBroadcastOpHandle(
       result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_);
@@ -568,7 +568,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result,
 
 void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result,
                                                 const std::string &og) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
       result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_));
@@ -597,7 +597,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result,
 
 void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
     ir::Graph *result, const std::vector<std::string> &datas) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   result->Get<GraphOps>(kGraphOps).emplace_back(new DataBalanceOpHandle(
       result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_));
@@ -694,7 +694,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result,
 VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
                                                    const std::string &og,
                                                    int dst_dev_id) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   result->Get<GraphOps>(kGraphOps).emplace_back(new ReduceOpHandle(
       result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_));
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index f3ec2d2941..8e462aec7d 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -40,7 +40,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
                          size_t device_id) const;
   void Init() const;
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   mutable platform::NCCLContextMap *nccl_ctxs_;
 #endif
 
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index 4503123eac..c9f1107aea 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -125,7 +125,7 @@ void ReduceOpHandle::RunImpl() {
         }
       });
     } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       auto pre_in = pre_in_var->Get<framework::LoDTensor>();
       VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
       VariableVisitor::GetMutableTensor(out_var).mutable_data(
diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h
index 999828ae45..846839029c 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -23,7 +23,7 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
@@ -35,7 +35,7 @@ struct ReduceOpHandle : public OpHandleBase {
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   const platform::NCCLContextMap *nccl_ctxs_;
   ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                  const std::vector<platform::Place> &places,
diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc
index 72299c0bfa..6cee4770e6 100644
--- a/paddle/fluid/framework/details/reduce_op_handle_test.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc
@@ -35,7 +35,7 @@ struct TestReduceOpHandle {
   std::vector<p::Place> gpu_list_;
   std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
 
@@ -43,7 +43,7 @@ struct TestReduceOpHandle {
     for (size_t j = 0; j < ctxs_.size(); ++j) {
       ctxs_[j]->Wait();
     }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
     if (nccl_ctxs_) {
       nccl_ctxs_->WaitAll();
     }
@@ -53,7 +53,7 @@ struct TestReduceOpHandle {
   void InitCtxOnGpu(bool use_gpu) {
     use_gpu_ = use_gpu;
     if (use_gpu) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       int count = p::GetCUDADeviceCount();
       if (count <= 1) {
         LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
@@ -77,7 +77,7 @@ struct TestReduceOpHandle {
         gpu_list_.push_back(p);
         ctxs_.emplace_back(new p::CPUDeviceContext(p));
       }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       nccl_ctxs_.reset(nullptr);
 #endif
     }
@@ -99,14 +99,14 @@ struct TestReduceOpHandle {
 
     nodes.emplace_back(new ir::Node("node"));
     if (use_gpu_) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
                                           gpu_list_, nccl_ctxs_.get()));
 #else
       PADDLE_THROW("CUDA is not support.");
 #endif
     } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
                                           gpu_list_, nccl_ctxs_.get()));
 #else
diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc
index cd2cb0c9f8..9696441a21 100644
--- a/paddle/fluid/framework/ir/is_test_pass_tester.cc
+++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc
@@ -15,7 +15,10 @@
 #include "paddle/fluid/framework/ir/is_test_pass.h"
 
 #include <gtest/gtest.h>
-
+#ifdef _WIN32
+#undef FALSE
+#undef TRUE
+#endif
 namespace paddle {
 namespace framework {
 namespace ir {
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index 84a0c3374c..7710ed7b61 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -19,6 +19,7 @@
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/platform/port.h"
 
 namespace paddle {
 namespace inference {
@@ -75,7 +76,7 @@ void TestWord2vecPrediction(const std::string& model_path) {
                      0.000932706};
   const size_t num_elements = outputs.front().data.length() / sizeof(float);
   // The outputs' buffers are in CPU memory.
-  for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
+  for (size_t i = 0; i < std::min((size_t)5UL, num_elements); i++) {
     LOG(INFO) << "data: "
               << static_cast<float*>(outputs.front().data.data())[i];
     PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index 6f9d663121..9a393a61c4 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -15,10 +15,6 @@
 #pragma once
 
 #include <glog/logging.h>
-#if !defined(_WIN32)
-#include <sys/time.h>
-#else
-#endif
 
 #include <algorithm>
 #include <chrono>  // NOLINT
@@ -28,6 +24,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/string/printf.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc
index c4022225fd..da42688f29 100644
--- a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gflags/gflags.h>
-#include <sys/time.h>
 #include <time.h>
 #include <algorithm>
 #include <fstream>
diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
index cbcfc964c9..5c1204b9e6 100644
--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <sys/time.h>
 #include <time.h>
 #include <fstream>
 #include <thread>  // NOLINT
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index 2118fcfd4b..75fa611c0d 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -20,6 +20,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.h"
 
 DECLARE_bool(use_mkldnn);
diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc
index 501807e7f3..80fdd22fbb 100644
--- a/paddle/fluid/operators/beam_search_op_test.cc
+++ b/paddle/fluid/operators/beam_search_op_test.cc
@@ -30,23 +30,23 @@ using std::endl;
 
 void CreateInput(LoDTensor* ids, LoDTensor* scores) {
   LoD lod;
-  vector<size_t> level0({0, 2, 4});
-  vector<size_t> level1({0, 1, 2, 3, 4});
+  vector<size_t> level0{0, 2, 4};
+  vector<size_t> level1{0, 1, 2, 3, 4};
   lod.push_back(level0);
   lod.push_back(level1);
   ids->set_lod(lod);
   scores->set_lod(lod);
 
-  auto dims = framework::make_ddim(vector<int64_t>({4, 3}));
+  auto dims = framework::make_ddim(vector<int64_t>{4, 3});
   ids->Resize(dims);
   scores->Resize(dims);
   CPUPlace place;
 
   auto* ids_data = ids->mutable_data<int64_t>(place);
   auto* scores_data = scores->mutable_data<float>(place);
-  vector<int64_t> _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1});
-  vector<float> _scores(
-      {0.5, 0.3, 0.2, 0.6, 0.3, 0.1, 0.9, 0.5, 0.1, 0.7, 0.5, 0.1});
+  vector<int64_t> _ids{4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1};
+  vector<float> _scores{0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f,
+                        0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f};
 
   for (int i = 0; i < 12; i++) {
     ids_data[i] = _ids[i];
@@ -79,8 +79,8 @@ TEST(DISABLED_beam_search_op, run) {
 
   ASSERT_EQ(sids.lod(), sscores.lod());
 
-  vector<int> tids({4, 2, 3, 8});
-  vector<float> tscores({0.5, 0.6, 0.9, 0.7});
+  vector<int> tids{4, 2, 3, 8};
+  vector<float> tscores{0.5f, 0.6f, 0.9f, 0.7f};
 
   for (int i = 0; i < 4; i++) {
     ASSERT_EQ(tids[i], sids.data<int64_t>()[i]);
diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index c28f86146d..3548d5d9fb 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <sys/time.h>
 #include <limits>
 
 #include "glog/logging.h"  // For VLOG
@@ -20,6 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/grpc_client.h"
 #include "paddle/fluid/operators/distributed/grpc_serde.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
+#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc
index f27b70a5a3..e6856676d4 100644
--- a/paddle/fluid/operators/distributed/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include <nccl.h>
 #endif
-#include <sys/time.h>
 #include <thread>  // NOLINT
 
 #include "google/protobuf/io/coded_stream.h"
@@ -26,6 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/grpc_variable_response.h"
 #include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
+#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed/grpc_serde.h b/paddle/fluid/operators/distributed/grpc_serde.h
index 7ec489e961..17290d3fb4 100644
--- a/paddle/fluid/operators/distributed/grpc_serde.h
+++ b/paddle/fluid/operators/distributed/grpc_serde.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include <sys/time.h>
+
 #include <iostream>
 #include <string>
 #include <vector>
@@ -25,6 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
+#include "paddle/fluid/platform/port.h"
 
 #include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
 #include "paddle/fluid/operators/distributed/send_recv.pb.h"
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
index 374fa680e3..0abebb9240 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
@@ -15,12 +15,12 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include <nccl.h>
 #endif
-#include <sys/time.h>
 #include <thread>  // NOLINT
 
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/operators/distributed/variable_response.h"
+#include "paddle/fluid/platform/port.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h
index 480fc59c42..523e56fe3e 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.h
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include <sys/time.h>
 #include <iostream>
 #include <string>
 #include <vector>
@@ -24,6 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/platform/port.h"
 
 #include "paddle/fluid/operators/distributed/send_recv.pb.h"
 
diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc
index 18a586f8dd..ad734bae42 100644
--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ b/paddle/fluid/operators/math/cpu_vec_test.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <sys/time.h>
 #include <cmath>
 #include <cstring>
 #include <random>
@@ -22,6 +21,7 @@ limitations under the License. */
 #include "gtest/gtest.h"
 
 #include "paddle/fluid/operators/math/cpu_vec.h"
+#include "paddle/fluid/platform/port.h"
 
 inline double GetCurrentUS() {
   struct timeval time;
diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc
index ae2c90b33a..521cd7801a 100644
--- a/paddle/fluid/operators/math/im2col_test.cc
+++ b/paddle/fluid/operators/math/im2col_test.cc
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/im2col.h"
 #include <gtest/gtest.h>
-#include <sys/time.h>
 #include <vector>
 #include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
+#include "paddle/fluid/platform/port.h"
 
 template <typename DeviceContext, typename Place>
 void testIm2col() {
diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc
index b6c62a2634..8662e1c50d 100644
--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/jit_kernel.h"
-#include <sys/time.h>
 #include <cmath>    // for exp
 #include <cstring>  // for memcpy
 #include <random>
@@ -22,6 +21,7 @@ limitations under the License. */
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 #include "gtest/gtest.h"
+#include "paddle/fluid/platform/port.h"
 
 #ifdef PADDLE_WITH_MKLML
 #include "paddle/fluid/platform/dynload/mklml.h"
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index 682b0c0ff3..61a25064d1 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -62,7 +62,7 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
 
 #define CUDNN_ENFORCE(condition)                                     \
   do {                                                               \
-    cudnnStatus_t status = condition;                                \
+    auto status = condition;                                         \
     if (UNLIKELY(status != CUDNN_STATUS_SUCCESS)) {                  \
       PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \
     }                                                                \
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 1a83ac7780..db62377898 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -48,13 +48,13 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
 
 #else
 
-#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)     \
-  struct DynLoad__##__name {                        \
-    template <typename... Args>                     \
-    inline cudnnStatus_t operator()(Args... args) { \
-      return ::__name(args...);                     \
-    }                                               \
-  };                                                \
+#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
+  struct DynLoad__##__name {                    \
+    template <typename... Args>                 \
+    inline auto operator()(Args... args) {      \
+      return ::__name(args...);                 \
+    }                                           \
+  };                                            \
   extern DynLoad__##__name __name
 
 #endif
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index c78f159ad2..e0d0051ad0 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -19,7 +19,10 @@ limitations under the License. */
 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/enforce.h"
 
-DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
+// fraction_of_gpu_memory_to_use cannot be too high on windows,
+// since the win32 graphic sub-system can occupy some GPU memory
+// which may lead to insufficient memory left for paddle
+DEFINE_double(fraction_of_gpu_memory_to_use, 0.5,
               "Allocate a trunk of gpu memory that is this fraction of the "
               "total gpu memory size. Future memory usage will be allocated "
               "from the trunk. If the trunk doesn't have enough gpu memory, "
diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h
index 11c68f3449..8dcfc4e748 100644
--- a/paddle/fluid/platform/stream_callback_manager.h
+++ b/paddle/fluid/platform/stream_callback_manager.h
@@ -18,7 +18,7 @@
 #include <cuda_runtime.h>
 #include <functional>
 #include <memory>
-#include "ThreadPool.h"
+#include <ThreadPool.h>
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/legacy/cuda/include/hl_warpctc_wrap.h b/paddle/legacy/cuda/include/hl_warpctc_wrap.h
index 0857bd1aa1..09cbd6d450 100644
--- a/paddle/legacy/cuda/include/hl_warpctc_wrap.h
+++ b/paddle/legacy/cuda/include/hl_warpctc_wrap.h
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#ifndef _WIN32
 #ifndef HL_WARPCTC_WRAP_H_
 #define HL_WARPCTC_WRAP_H_
-
 #include "ctc.h"
 #include "hl_base.h"
 
@@ -91,3 +91,4 @@ extern void hl_warpctc_get_workspace_size(const int* cpuLabelLengths,
                                           size_t* bytes);
 
 #endif  // HL_WARPCTC_WRAP_H_
+#endif
diff --git a/paddle/legacy/cuda/src/hl_cuda_device.cc b/paddle/legacy/cuda/src/hl_cuda_device.cc
index 501e3b0f3b..a6e27a37ff 100644
--- a/paddle/legacy/cuda/src/hl_cuda_device.cc
+++ b/paddle/legacy/cuda/src/hl_cuda_device.cc
@@ -132,11 +132,15 @@ inline pid_t gettid() {
   uint64_t tid;
   pthread_threadid_np(NULL, &tid);
 #else
+#ifndef _WIN32
 #ifndef __NR_gettid
 #define __NR_gettid 224
 #endif
   pid_t tid = syscall(__NR_gettid);
 #endif
+#else   // _WIN32
+  pid_t tid = _getpid();
+#endif  // _WIN32
   CHECK_NE((int)tid, -1);
   return tid;
 }
diff --git a/paddle/legacy/utils/ThreadLocal.h b/paddle/legacy/utils/ThreadLocal.h
index c5b07506d3..6268b73a85 100644
--- a/paddle/legacy/utils/ThreadLocal.h
+++ b/paddle/legacy/utils/ThreadLocal.h
@@ -14,10 +14,12 @@ limitations under the License. */
 
 #pragma once
 
+#ifndef _WIN32
 #include <pthread.h>
 #include <sys/syscall.h>
-#include <sys/types.h>
 #include <unistd.h>
+#endif
+#include <sys/types.h>
 #include <map>
 #include <mutex>
 #include <random>
diff --git a/paddle/legacy/utils/Util.h b/paddle/legacy/utils/Util.h
index e6f05e30d3..3a878b2b30 100644
--- a/paddle/legacy/utils/Util.h
+++ b/paddle/legacy/utils/Util.h
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#ifndef _WIN32
 #include <sys/syscall.h>  // for syscall()
+#endif
 #include <sys/types.h>
 #include <algorithm>
 #include <cmath>
@@ -40,6 +42,31 @@ inline int rand_r(unsigned int* seedp) {
 }
 #endif
 
+#ifdef _WIN32
+#define NOMINMAX  // msvc max/min macro conflict with std::min/max
+#include <windows.h>
+
+template <typename T>
+inline int __builtin_clz(const T& value) {
+  DWORD leadning_zero = 0;
+  if (_BitScanReverse(&leadning_zero, value)) {
+    return static_cast<int>(sizeof(T) * 8 - leadning_zero);
+  } else {
+    return static_cast<int>(0);
+  }
+}
+
+inline int __builtin_clzl(const unsigned long& value) {
+  return __builtin_clz(value);
+}
+
+inline int __builtin_clzll(const unsigned long long& value) {
+  return __builtin_clz(value);
+}
+
+#define pid_t int
+#endif
+
 /**
  * Loop over the elements in a container
  * TODO(yuyang18): It's this foreach useful? Why not use C++ 11 foreach,
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
index 2264481899..614596958e 100644
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -3,8 +3,10 @@
 if(WITH_TESTING)
   add_library(paddle_test_main STATIC TestMain.cpp)
   add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies})
-  add_library(paddle_test_util STATIC TestUtil.cpp)
-  add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
+  if(NOT WIN32)
+    add_library(paddle_test_util STATIC TestUtil.cpp)
+    add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
+  endif(NOT WIN32)
   if(NOT MOBILE_INFERENCE)
     cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS device_context memory gtest gflags)
   endif()
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index f65b37903a..829154f1b2 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -46,8 +46,8 @@ def _is_numpy_(var):
 
 
 def _is_number_(var):
-    return isinstance(var, int) or isinstance(var, float) or (isinstance(
-        var, np.ndarray) and var.shape == (1, ))
+    return isinstance(var, int) or isinstance(var, np.int64) or isinstance(
+        var, float) or (isinstance(var, np.ndarray) and var.shape == (1, ))
 
 
 def _is_number_or_matrix_(var):
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 510d0304f0..3fc12d584d 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -23,9 +23,11 @@ if(NOT WITH_DISTRIBUTE)
     LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification)
 endif(NOT WITH_DISTRIBUTE)
 
-if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
-    LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
-endif()
+if(WITH_GPU)
+    if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+        LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
+    endif()
+endif(WITH_GPU)
 
 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184

From e280c7a4db7f5765e7b3b5b2146204705b348e5b Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Thu, 22 Nov 2018 19:52:10 +0800
Subject: [PATCH 055/113] code style fix test=develop

---
 paddle/fluid/platform/stream_callback_manager.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h
index 8dcfc4e748..ed8734c98c 100644
--- a/paddle/fluid/platform/stream_callback_manager.h
+++ b/paddle/fluid/platform/stream_callback_manager.h
@@ -14,11 +14,11 @@
 
 #pragma once
 
+#include <ThreadPool.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <functional>
 #include <memory>
-#include <ThreadPool.h>
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {

From 00b9e9a1357bb3fa6e6adceb4e650d9f6424aa2a Mon Sep 17 00:00:00 2001
From: chengduo <zhaochengduo@baidu.com>
Date: Thu, 22 Nov 2018 20:40:56 +0800
Subject: [PATCH 056/113] Refine cublas to support CUBLAS_TENSOR_OP_MATH
 (#13929)

* refine cublase
test=develop

* code refine

* refine cublas

* add GEMME_EX

* add enable_cublas_tensor_op_math doc and add cublasCall
test=develop

* fix CublasCall for cuda version
test=develop

* fix error
test=develop

* fix GEMM_EX to be compatible with gcc 4.8
test=develop

* add GEMM_EX
test=develop

* to compatiable with gcc4.8
test=develop
---
 paddle/fluid/operators/math/blas_impl.cu.h | 206 +++++++++++++++++----
 paddle/fluid/platform/device_context.h     |  47 +++++
 paddle/fluid/platform/dynload/cublas.h     |  26 ++-
 paddle/fluid/platform/gpu_info.cc          |  20 ++
 paddle/fluid/platform/gpu_info.h           |   3 +
 python/paddle/fluid/__init__.py            |   3 +-
 6 files changed, 256 insertions(+), 49 deletions(-)

diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h
index d84c88cb3b..d35073029a 100644
--- a/paddle/fluid/operators/math/blas_impl.cu.h
+++ b/paddle/fluid/operators/math/blas_impl.cu.h
@@ -16,6 +16,9 @@
 
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/dynload/cublas.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+DECLARE_bool(enable_cublas_tensor_op_math);
 
 namespace paddle {
 namespace operators {
@@ -42,11 +45,44 @@ struct CUBlas<float> {
   }
 
   template <typename... ARGS>
-  static void GEMM_BATCH(ARGS... args) {
+  static void GEMM_STRIDED_BATCH(ARGS... args) {
 #if CUDA_VERSION >= 8000
     PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(args...));
 #else
     PADDLE_THROW("SgemmStridedBatched is not supported on cuda <= 7.5");
+#endif
+  }
+
+  // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
+  // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
+  template <typename... ARGS>
+  static void GEMM_EX(platform::CUDADeviceContext *dev_ctx,
+                      cublasOperation_t transa, cublasOperation_t transb, int m,
+                      int n, int k, const float *alpha, const void *A,
+                      cudaDataType_t Atype, int lda, const void *B,
+                      cudaDataType_t Btype, int ldb, const float *beta, void *C,
+                      cudaDataType_t Ctype, int ldc) {
+    // Because the gcc 4.8 doesn't expand template parameter pack that
+    // appears in a lambda-expression, I can not use template parameter pack
+    // here.
+    auto cublas_call = [&]() {
+#if CUDA_VERSION >= 8000
+      VLOG(5) << "use_tensor_op_math: "
+              << (platform::TensorCoreAvailable() ? "True" : "False");
+      PADDLE_ENFORCE(platform::dynload::cublasSgemmEx(
+          dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype,
+          lda, B, Btype, ldb, beta, C, Ctype, ldc));
+#else
+      PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0");
+#endif
+    };
+
+#if CUDA_VERSION >= 9000
+    // NOTES: To use Tensor Core, we should change the cublas config,
+    // but the cublas may be hold by multi-thread.
+    dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH);
+#else
+    cublas_call();
 #endif
   }
 };
@@ -69,13 +105,18 @@ struct CUBlas<double> {
   }
 
   template <typename... ARGS>
-  static void GEMM_BATCH(ARGS... args) {
+  static void GEMM_STRIDED_BATCH(ARGS... args) {
 #if CUDA_VERSION >= 8000
     PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(args...));
 #else
     PADDLE_THROW("DgemmStridedBatched is not supported on cuda <= 7.5");
 #endif
   }
+
+  template <typename... ARGS>
+  static void GEMM_EX(ARGS... args) {
+    PADDLE_THROW("Currently there are not cublasDgemmEx.");
+  }
 };
 
 template <>
@@ -96,14 +137,16 @@ struct CUBlas<platform::float16> {
                                        reinterpret_cast<__half *>(C), ldc));
   }
 
-  static void GEMM_BATCH(cublasHandle_t handle, cublasOperation_t transa,
-                         cublasOperation_t transb, int m, int n, int k,
-                         const float16 *alpha, const float16 *A, int lda,
-                         long long int strideA, const float16 *B,  // NOLINT
-                         int ldb, long long int strideB,           // NOLINT
-                         const float16 *beta, float16 *C, int ldc,
-                         long long int strideC,  // NOLINT
-                         int batchCount) {
+  static void GEMM_STRIDED_BATCH(cublasHandle_t handle,
+                                 cublasOperation_t transa,
+                                 cublasOperation_t transb, int m, int n, int k,
+                                 const float16 *alpha, const float16 *A,
+                                 int lda, long long int strideA,  // NOLINT
+                                 const float16 *B,                // NOLINT
+                                 int ldb, long long int strideB,  // NOLINT
+                                 const float16 *beta, float16 *C, int ldc,
+                                 long long int strideC,  // NOLINT
+                                 int batchCount) {
 #if CUDA_VERSION >= 8000
     PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
         handle, transa, transb, m, n, k,
@@ -114,6 +157,45 @@ struct CUBlas<platform::float16> {
         ldc, strideC, batchCount));
 #else
     PADDLE_THROW("HgemmStridedBatched is not supported on cuda <= 7.5");
+#endif
+  }
+
+  // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
+  // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
+  template <typename... ARGS>
+  static void GEMM_EX(platform::CUDADeviceContext *dev_ctx,
+                      cublasOperation_t transa, cublasOperation_t transb, int m,
+                      int n, int k, const void *alpha, const void *A,
+                      cudaDataType_t Atype, int lda, const void *B,
+                      cudaDataType_t Btype, int ldb, const void *beta, void *C,
+                      cudaDataType_t Ctype, int ldc,
+                      cudaDataType_t computeType) {
+    auto cublas_call = [&]() {
+#if CUDA_VERSION >= 8000
+      cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
+#if CUDA_VERSION >= 9000
+      bool use_tensor_op_math = platform::TensorCoreAvailable();
+      if (use_tensor_op_math) {
+        algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
+      }
+      VLOG(5) << "use_tensor_op_math: "
+              << (use_tensor_op_math ? "True" : "False");
+#endif  // CUDA_VERSION >= 9000
+
+      PADDLE_ENFORCE(platform::dynload::cublasGemmEx(
+          dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype,
+          lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo));
+#else
+      PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0");
+#endif
+    };
+
+#if CUDA_VERSION >= 9000
+    // NOTES: To use Tensor Core, we should change the cublas config,
+    // but the cublas may be hold by multi-thread.
+    dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH);
+#else
+    cublas_call();
 #endif
   }
 };
@@ -133,8 +215,21 @@ void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
   cublasOperation_t cuTransB =
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
 
-  CUBlas<T>::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha,
-                  B, ldb, A, lda, &beta, C, N);
+#if CUDA_VERSION >= 8000
+  if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
+    auto &cuda_ctx = const_cast<platform::CUDADeviceContext &>(context_);
+    CUBlas<T>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B,
+                       CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C,
+                       CUDA_R_32F, N);
+  } else {
+#endif  // CUDA_VERSION >= 8000
+
+    CUBlas<T>::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K,
+                    &alpha, B, ldb, A, lda, &beta, C, N);
+
+#if CUDA_VERSION >= 8000
+  }
+#endif  // CUDA_VERSION >= 8000
 }
 
 template <>
@@ -157,30 +252,18 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
   PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53,
                     "cublas fp16 gemm requires GPU compute capability >= 53");
 
-#if CUDA_VERSION >= 8000
   float h_alpha = static_cast<float>(alpha);
   float h_beta = static_cast<float>(beta);
 
-  cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
-#if CUDA_VERSION >= 9000
-  if (context_.GetComputeCapability() >= 70) {
-    PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(
-        context_.cublas_handle(), CUBLAS_TENSOR_OP_MATH));
-    algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
-  } else {
-    PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(
-        context_.cublas_handle(), CUBLAS_DEFAULT_MATH));
-  }
-#endif  // CUDA_VERSION >= 9000
-
+#if CUDA_VERSION >= 8000
   // cublasHgemm does true FP16 computation which is slow for non-Volta
   // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
   // input/output in fp16, computation in fp32, which can also be accelerated
   // using tensor cores in volta GPUs.
-  PADDLE_ENFORCE(platform::dynload::cublasGemmEx(
-      context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B,
-      CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N,
-      CUDA_R_32F, algo));
+  auto &cuda_ctx = const_cast<platform::CUDADeviceContext &>(context_);
+  CUBlas<platform::float16>::GEMM_EX(
+      &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16F, ldb, A,
+      CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F);
 #else
   // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
   CUBlas<platform::float16>::GEMM(context_.cublas_handle(), cuTransB, cuTransA,
@@ -199,8 +282,38 @@ void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
   // the cblas convention.
   cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
   cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
-  CUBlas<T>::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha,
-                  B, ldb, A, lda, &beta, C, ldc);
+
+#if CUDA_VERSION >= 8000
+  if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
+    auto &cuda_ctx = const_cast<platform::CUDADeviceContext &>(context_);
+    CUBlas<T>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B,
+                       CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C,
+                       CUDA_R_32F, ldc);
+  } else {
+#endif  // CUDA_VERSION >= 8000
+
+    CUBlas<T>::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K,
+                    &alpha, B, ldb, A, lda, &beta, C, ldc);
+
+#if CUDA_VERSION >= 8000
+  }
+#endif  // CUDA_VERSION >= 8000
+}
+
+template <>
+template <>
+inline void Blas<platform::CUDADeviceContext>::GEMM(
+    bool transA, bool transB, int M, int N, int K, platform::float16 alpha,
+    const platform::float16 *A, int lda, const platform::float16 *B, int ldb,
+    platform::float16 beta, platform::float16 *C, int ldc) const {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
+  cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
+
+  CUBlas<platform::float16>::GEMM(context_.cublas_handle(), cuTransB, cuTransA,
+                                  N, M, K, &alpha, B, ldb, A, lda, &beta, C,
+                                  ldc);
 }
 
 template <>
@@ -238,9 +351,34 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
   const int64_t strideC = M * N;
 
-  CUBlas<T>::GEMM_BATCH(context_.cublas_handle(), cuTransB, cuTransA, N, M, K,
-                        &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc,
-                        strideC, batchCount);
+#if CUDA_VERSION >= 9010
+  if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
+    auto cublas_call = [&]() {
+      cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
+      bool use_tensor_op_math = platform::TensorCoreAvailable();
+      if (use_tensor_op_math) {
+        algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
+      }
+      VLOG(5) << "use_tensor_op_math: "
+              << (use_tensor_op_math ? "True" : "False");
+
+      PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx(
+          context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B,
+          CUDA_R_32F, ldb, strideB, A, CUDA_R_32F, lda, strideA, &beta, C,
+          CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, algo));
+    };
+    auto &dev_ctx = const_cast<platform::CUDADeviceContext &>(context_);
+    dev_ctx.CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH);
+  } else {
+#endif  // CUDA_VERSION >= 9010
+
+    CUBlas<T>::GEMM_STRIDED_BATCH(context_.cublas_handle(), cuTransB, cuTransA,
+                                  N, M, K, &alpha, B, ldb, strideB, A, lda,
+                                  strideA, &beta, C, ldc, strideC, batchCount);
+
+#if CUDA_VERSION >= 9010
+  }
+#endif  // CUDA_VERSION >= 9010
 }
 
 }  // namespace math
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 9a9018cdea..3edd727978 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -143,6 +143,39 @@ class CudnnWorkspaceHandle {
   std::unique_ptr<std::lock_guard<std::mutex>> guard_;
 };
 
+#if CUDA_VERSION >= 9000
+class ScopedCublasMathMode {
+ public:
+  ScopedCublasMathMode(cublasHandle_t handle, cublasMath_t new_math_mode)
+      : handle_(handle) {
+    need_reset = false;
+    PADDLE_ENFORCE(
+        platform::dynload::cublasGetMathMode(handle_, &old_math_mode_),
+        "Failed to get old cublas math mode");
+    if (old_math_mode_ != new_math_mode) {
+      PADDLE_ENFORCE(
+          platform::dynload::cublasSetMathMode(handle_, new_math_mode),
+          "Failed to set old cublas math mode");
+      need_reset = true;
+    }
+  }
+
+  ~ScopedCublasMathMode() {
+    if (need_reset) {
+      PADDLE_ENFORCE(
+          platform::dynload::cublasSetMathMode(handle_, old_math_mode_),
+          "Failed to set old cublas math mode");
+    }
+  }
+
+ private:
+  cublasHandle_t handle_;
+  cublasMath_t old_math_mode_;
+  bool need_reset;
+};
+
+#endif
+
 class CUDADeviceContext : public DeviceContext {
  public:
   explicit CUDADeviceContext(CUDAPlace place);
@@ -199,6 +232,18 @@ class CUDADeviceContext : public DeviceContext {
     callback_manager_->Wait();
   }
 
+#if CUDA_VERSION >= 9000
+  /*! \brief CublasCall may need to change cublas's config,
+   *  but the cublas may be hold by multi-thread, so we should
+   *  add lock here. */
+  template <typename Callback>
+  void CublasCall(Callback callback, cublasMath_t new_math) {
+    std::lock_guard<std::mutex> guard(cublas_mtx_);
+    ScopedCublasMathMode scoped_cublas_math(cublas_handle_, new_math);
+    callback();
+  }
+#endif
+
  private:
   CUDAPlace place_;
 
@@ -220,6 +265,8 @@ class CUDADeviceContext : public DeviceContext {
   // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes
   mutable std::mutex callback_mtx_;
   std::unique_ptr<StreamCallbackManager> callback_manager_;
+
+  mutable std::mutex cublas_mtx_;
 };
 
 template <>
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
index 4ea0cd7283..ff80bd525c 100644
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -61,9 +61,6 @@ extern void *cublas_dso_handle;
   extern DynLoad__##__name __name
 #endif
 
-#define DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \
-  DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)
-
 #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
   __macro(cublasSaxpy_v2);                \
   __macro(cublasDaxpy_v2);                \
@@ -93,22 +90,23 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
 
 // APIs available after CUDA 8.0
 #if CUDA_VERSION >= 8000
-#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
-  __macro(cublasGemmEx);                     \
-  __macro(cublasSgemmStridedBatched);        \
-  __macro(cublasDgemmStridedBatched);        \
-  __macro(cublasCgemmStridedBatched);        \
-  __macro(cublasZgemmStridedBatched);        \
-  __macro(cublasHgemmStridedBatched);
-
-CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
+DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmEx);
+DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmStridedBatched);
+DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmStridedBatched);
+DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmStridedBatched);
+DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmStridedBatched);
+DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasHgemmStridedBatched);
 #endif
 
 // APIs available after CUDA 9.0
 #if CUDA_VERSION >= 9000
-#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) __macro(cublasSetMathMode);
+DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSetMathMode);
+DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGetMathMode);
+#endif
 
-CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
+#if CUDA_VERSION >= 9010
+DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmBatchedEx);
+DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmStridedBatchedEx);
 #endif
 
 #undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index c78f159ad2..833d48347f 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -26,6 +26,16 @@ DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
               "additional trunks of the same size will be requested from gpu "
               "until the gpu has no memory left for another trunk.");
 
+DEFINE_bool(
+    enable_cublas_tensor_op_math, false,
+    "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
+    "but it may loss precision. Currently, There are two CUDA libraries that"
+    " use Tensor Cores, cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up"
+    " GEMM computations(the matrices must be either half precision or single "
+    "precision); cuDNN uses Tensor Cores to speed up both convolutions(the "
+    "input and output must be half precision) and recurrent neural networks "
+    "(RNNs).");
+
 namespace paddle {
 namespace platform {
 
@@ -64,6 +74,16 @@ int GetCUDADriverVersion(int id) {
   return driver_version;
 }
 
+bool TensorCoreAvailable() {
+#if CUDA_VERSION >= 9000
+  int device = GetCurrentDeviceId();
+  int driver_version = GetCUDAComputeCapability(device);
+  return driver_version >= 70;
+#else
+  return false;
+#endif
+}
+
 int GetCUDAMultiProcessors(int id) {
   PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
   int count;
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index be44158431..6a0b3c8e02 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -35,6 +35,9 @@ int GetCUDARuntimeVersion(int id);
 //! Get the driver version of the ith GPU
 int GetCUDADriverVersion(int id);
 
+//! Wheter the current device support TensorCore
+bool TensorCoreAvailable();
+
 //! Get the MultiProcessors of the ith GPU.
 int GetCUDAMultiProcessors(int i);
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 543acf2d34..3c092dee34 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -133,7 +133,8 @@ def __bootstrap__():
     if core.is_compiled_with_cuda():
         read_env_flags += [
             'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
-            'conv_workspace_size_limit', 'cudnn_exhaustive_search'
+            'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
+            'cudnn_exhaustive_search'
         ]
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])

From 6cc6bf4074d69c5c0b02af612b94e438d596803a Mon Sep 17 00:00:00 2001
From: Krzysztof Binias <krzysztof.binias@intel.com>
Date: Thu, 22 Nov 2018 15:30:43 +0100
Subject: [PATCH 057/113] Bumped MKL-DNN version to 0.17

test=develop
---
 cmake/external/mkldnn.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 785148d4f9..b280db23b9 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -53,7 +53,7 @@ ExternalProject_Add(
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS             ${MKLDNN_DEPENDS}
     GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "21fb5f2af1dd14e132af4f1b79160977ee487818"
+    GIT_TAG             "830a10059a018cd2634d94195140cf2d8790a75a"
     PREFIX              ${MKLDNN_SOURCES_DIR}
     UPDATE_COMMAND      ""
     CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}

From a902b8b0f811f6837330385b95fa2f552393197c Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Fri, 23 Nov 2018 01:07:58 +0800
Subject: [PATCH 058/113] Add sqlite3 support

test=develop
---
 tools/manylinux1/Dockerfile.x64 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64
index e91216a5b8..48fd145e5f 100644
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64
@@ -16,7 +16,7 @@ ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
 RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz
 COPY build_scripts /build_scripts
 RUN bash build_scripts/build.sh && \
-  bash build_scripts/install_nccl2.sh && rm -r build_scripts
+  bash build_scripts/install_nccl2.sh && rm -rf build_scripts
 
 ENV SSL_CERT_FILE=/opt/_internal/certs.pem
 

From 81bd7eeff4f3581c67cb294f94a14c3b1e97e40d Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Fri, 23 Nov 2018 11:04:11 +0800
Subject: [PATCH 059/113] rollback the format

---
 paddle/fluid/operators/beam_search_op_test.cc | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc
index 80fdd22fbb..6e283866ff 100644
--- a/paddle/fluid/operators/beam_search_op_test.cc
+++ b/paddle/fluid/operators/beam_search_op_test.cc
@@ -30,23 +30,23 @@ using std::endl;
 
 void CreateInput(LoDTensor* ids, LoDTensor* scores) {
   LoD lod;
-  vector<size_t> level0{0, 2, 4};
-  vector<size_t> level1{0, 1, 2, 3, 4};
+  vector<size_t> level0({0, 2, 4});
+  vector<size_t> level1({0, 1, 2, 3, 4});
   lod.push_back(level0);
   lod.push_back(level1);
   ids->set_lod(lod);
   scores->set_lod(lod);
 
-  auto dims = framework::make_ddim(vector<int64_t>{4, 3});
+  auto dims = framework::make_ddim(vector<int64_t>({4, 3}));
   ids->Resize(dims);
   scores->Resize(dims);
   CPUPlace place;
 
   auto* ids_data = ids->mutable_data<int64_t>(place);
   auto* scores_data = scores->mutable_data<float>(place);
-  vector<int64_t> _ids{4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1};
-  vector<float> _scores{0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f,
-                        0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f};
+  vector<int64_t> _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1});
+  vector<float> _scores({0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f,
+                        0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f});
 
   for (int i = 0; i < 12; i++) {
     ids_data[i] = _ids[i];
@@ -79,8 +79,8 @@ TEST(DISABLED_beam_search_op, run) {
 
   ASSERT_EQ(sids.lod(), sscores.lod());
 
-  vector<int> tids{4, 2, 3, 8};
-  vector<float> tscores{0.5f, 0.6f, 0.9f, 0.7f};
+  vector<int> tids({4, 2, 3, 8});
+  vector<float> tscores({0.5f, 0.6f, 0.9f, 0.7f});
 
   for (int i = 0; i < 4; i++) {
     ASSERT_EQ(tids[i], sids.data<int64_t>()[i]);

From ff2a9786f3f8ba49daad21bd3d4550b394d46ca4 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Fri, 23 Nov 2018 03:08:33 +0000
Subject: [PATCH 060/113] test=develop

---
 python/paddle/fluid/__init__.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 543acf2d34..a6416326ed 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -91,6 +91,7 @@ def __bootstrap__():
     """
     import sys
     import os
+    import platform
     from . import core
 
     in_test = 'unittest' in sys.modules
@@ -110,14 +111,17 @@ def __bootstrap__():
         print('PLEASE USE OMP_NUM_THREADS WISELY.', file=sys.stderr)
 
     os.environ['OMP_NUM_THREADS'] = str(num_threads)
-
+    sysstr = platform.system()
     read_env_flags = [
-        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope',
-        'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb',
-        'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        "dist_threadpool_size", 'eager_delete_tensor_gb', 'allocator_strategy',
+        'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn',
+        'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
+        'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
+        'eager_delete_tensor_gb', 'allocator_strategy',
         'reader_queue_speed_test_mode', 'print_sub_graph_dir'
     ]
+    if sysstr != 'Darwin':
+        read_env_flags.append('use_pinned_memory')
+
     if os.name != 'nt':
         read_env_flags.append('warpctc_dir')
         read_env_flags.append('cpu_deterministic')

From 9ea1ce63192fee1a211aa5dcc6fecf4758434451 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Fri, 23 Nov 2018 11:51:07 +0800
Subject: [PATCH 061/113] Update issue templates

---
 .github/ISSUE_TEMPLATE/---feature-request-.md | 27 +++++++++++++
 .github/ISSUE_TEMPLATE/---inference-issue-.md | 40 +++++++++++++++++++
 .../ISSUE_TEMPLATE/---installation-issue-.md  | 40 +++++++++++++++++++
 .github/ISSUE_TEMPLATE/---model-issue-.md     | 36 +++++++++++++++++
 .github/ISSUE_TEMPLATE/---others-.md          | 33 +++++++++++++++
 .github/ISSUE_TEMPLATE/---training-issue-.md  | 38 ++++++++++++++++++
 6 files changed, 214 insertions(+)
 create mode 100644 .github/ISSUE_TEMPLATE/---feature-request-.md
 create mode 100644 .github/ISSUE_TEMPLATE/---inference-issue-.md
 create mode 100644 .github/ISSUE_TEMPLATE/---installation-issue-.md
 create mode 100644 .github/ISSUE_TEMPLATE/---model-issue-.md
 create mode 100644 .github/ISSUE_TEMPLATE/---others-.md
 create mode 100644 .github/ISSUE_TEMPLATE/---training-issue-.md

diff --git a/.github/ISSUE_TEMPLATE/---feature-request-.md b/.github/ISSUE_TEMPLATE/---feature-request-.md
new file mode 100644
index 0000000000..57708855dc
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/---feature-request-.md
@@ -0,0 +1,27 @@
+---
+name: 建议(Feature request)
+about: 您可以提出您的建议。 You could use this template for reporting a suggestion  issue.
+
+---
+
+欢迎您对PaddlePaddle提出建议，非常感谢您对PaddlePaddle的贡献！
+在留下您的建议时，辛苦您同步提供如下信息：
+- 版本、环境信息
+1）PaddlePaddle版本：请提供您的PaddlePaddle版本号，例如1.1
+2）CPU/GPU：您是否使用GPU进行训练，如是，请提供您的CUDA和cuDNN版本号
+3）系统环境：请您描述系统类型、版本，例如Mac OS 10.14
+- 复现信息：如为报错，请给出复现环境、复现步骤
+- 建议描述：请您详细描述，您认为需优化的功能
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before.
+Please make sure that this is a feature request. 
+**System information**
+-PaddlePaddle version （eg.1.1）or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/CUDNN version
+-OS Platform (eg.Mac OS 10.14)
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe the feature and the current behavior/state.**
+**Any Other info.**
diff --git a/.github/ISSUE_TEMPLATE/---inference-issue-.md b/.github/ISSUE_TEMPLATE/---inference-issue-.md
new file mode 100644
index 0000000000..37bdc8889e
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/---inference-issue-.md
@@ -0,0 +1,40 @@
+---
+name: 预测（Inference Issue）
+about: 您可以提问预测中报错、应用等问题。 You could use this template for reporting an inference issue.
+
+---
+
+为使您的问题得到快速解决，在建立Issue前，请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
+
+如果您没有查询到相似问题，为快速解决您的提问，建立issue时请提供如下细节信息：
+- 标题：简洁、精准描述您的问题，例如“最新预测库的API文档在哪儿 ”
+- 版本、环境信息：
+    1）PaddlePaddle版本：请提供您的PaddlePaddle版本号（如1.1）或CommitID
+    2）CPU：预测若用CPU，请提供CPU型号，MKL/OpenBlas/MKLDNN/等数学库使用情况
+    3）GPU：预测若用GPU，请提供GPU型号、CUDA和CUDNN版本号
+    4）系统环境：请您描述系统类型、版本（如Mac OS 10.14），Python版本
+-预测信息
+    1）C++预测：请您提供预测库安装包的版本信息，及其中的version.txt文件
+    2）CMake包含路径的完整命令
+    3）API信息（如调用请提供）
+    4）预测库来源：官网下载/特殊环境（如BCLOUD编译）
+- 复现信息：如为报错，请给出复现环境、复现步骤
+- 问题描述：请详细描述您的问题，同步贴出报错信息、日志/代码关键片段
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in the github in case that th
+If there is no solution,please make sure that this is an inference issue including the following details :
+**System information**
+-PaddlePaddle version （eg.1.1）or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/CUDNN version
+-OS Platform (eg.Mac OS 10.14)
+-Python version
+-Cmake orders
+-C++version.txt
+-API information
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe your current behavior**
+**Code to reproduce the issue**
+**Other info / logs**
diff --git a/.github/ISSUE_TEMPLATE/---installation-issue-.md b/.github/ISSUE_TEMPLATE/---installation-issue-.md
new file mode 100644
index 0000000000..ce4ba58932
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/---installation-issue-.md
@@ -0,0 +1,40 @@
+---
+name: 安装（Installation Issue）
+about: 您可以提问安装、编译出现报错等问题。 You could use this template for reporting an installation
+   issue.
+
+---
+
+为使您的问题得到快速解决，在建立Issue前，请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
+
+建立issue时，为快速解决问题，请您根据使用情况给出如下信息：
+- 标题：请包含关键词“安装错误”/“编译错误”，例如“Mac编译错误”
+- 版本、环境信息：
+    1）PaddlePaddle版本：请提供您的PaddlePaddle版本号（如1.1）或CommitID
+    2）CPU：请提供CPU型号，MKL/OpenBlas/MKLDNN/等数学库的使用情况
+    3）GPU：请提供GPU型号，CUDA和CUDNN版本号
+    4）系统环境：请说明系统类型、版本（如Mac OS 10.14）、Python版本
+- 安装方式信息：
+1）pip安装/docker安装
+2）本地编译：请提供cmake命令，编译命令
+3）docker编译：请提供docker镜像，编译命令            
+  特殊环境请注明：如离线安装等
+- 复现信息：如为报错，请给出复现环境、复现步骤
+- 问题描述：请详细描述您的问题，同步贴出报错信息、日志/代码关键片段
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in Github in case that there was a similar issue submitted or resolved before.
+If there is no solution,please make sure that this is an installation issue including the following details:
+**System information**
+-PaddlePaddle version （eg.1.1）or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/CUDNN version
+-OS Platform (eg. Mac OS 10.14)
+-Python version
+- Install method: pip install/install with docker/build from source(without docker)/build within docker
+- Other special cases that you think may be related to this problem, eg. offline install, special internet condition   
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe your current behavior**
+**Code to reproduce the issue**
+**Other info / logs**
diff --git a/.github/ISSUE_TEMPLATE/---model-issue-.md b/.github/ISSUE_TEMPLATE/---model-issue-.md
new file mode 100644
index 0000000000..7cb52f37b9
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/---model-issue-.md
@@ -0,0 +1,36 @@
+---
+name: 模型（Model Issue）
+about: 您可以提问模型、算法、数据集方向的使用报错等问题。You could use this template for reporting a model/
+  algorithm/dataset  issue.
+
+---
+
+为使您的问题得到快速解决，在建立Issue前，请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
+
+建立issue时，为快速解决问题，请您根据使用情况给出如下信息：
+- 标题：简洁、精准描述您的问题，例如“ssd 模型前置lstm报错  ”
+- 版本、环境信息：
+    1）PaddlePaddle版本：请提供PaddlePaddle版本号，例如1.1或CommitID
+    2）CPU：请提供CPU型号，MKL/OpenBlas/MKLDNN/等数学库的使用情况
+    3）GPU：请提供GPU型号，CUDA和CUDNN版本号
+    4）系统环境：请说明系统类型、版本（例如Mac OS 10.14），Python版本
+- 模型信息
+    1）模型名称 2）使用数据集名称 3）使用算法名称 4）模型链接
+- 复现信息：如为报错，请给出复现环境、复现步骤
+- 问题描述：请详细描述您的问题，同步贴出报错信息、日志/代码关键片段
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in the github.Probably there was a similar issue submitted or resolved before.
+If there is no solution,please make sure that this is a issue of models including the following details:
+**System information**
+-PaddlePaddle version （eg.1.1）or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/CUDNN version
+-OS Platform (eg.Mac OS 10.14)
+-Python version
+-Name of Models&Dataset/details of operator
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe your current behavior**
+**Code to reproduce the issue**
+**Other info / logs**
diff --git a/.github/ISSUE_TEMPLATE/---others-.md b/.github/ISSUE_TEMPLATE/---others-.md
new file mode 100644
index 0000000000..6a291153e4
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/---others-.md
@@ -0,0 +1,33 @@
+---
+name: 其他（Others）
+about: 如上述分类未包含您的问题，可在此提出。 You could use this template for reporting other issues
+
+---
+
+为使您的问题得到快速解决，在建立Issues前，请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
+
+如果您没有查询到相似问题，为快速解决您的提问，建立issue时请提供如下细节信息：
+- 标题：简洁、精准概括您的问题
+- 版本、环境信息：
+    1）PaddlePaddle版本：请提供您的PaddlePaddle版本号，例如1.1或CommitID
+    2）CPU/GPU：如果您使用GPU训练，请提供GPU驱动版本、CUDA和cuDNN版本号
+    3）系统环境：请您描述系统类型、版本，例如Mac OS 10.14
+    4）Python版本号
+    5）显存信息
+- 复现信息：如为报错，请给出复现环境、复现步骤
+- 问题描述：请详细描述您的问题，同步贴出报错信息、日志/代码关键片段
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before.
+If there is no solution,please provide us with the following details :
+**System information**
+-PaddlePaddle version （eg.1.1）or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/cuDNN version
+-OS Platform and Distribution(eg.Mac OS 10.14)
+-Python version 
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe your current behavior**
+**Code to reproduce the issue**
+**Other info / logs**
diff --git a/.github/ISSUE_TEMPLATE/---training-issue-.md b/.github/ISSUE_TEMPLATE/---training-issue-.md
new file mode 100644
index 0000000000..29e8383d97
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/---training-issue-.md
@@ -0,0 +1,38 @@
+---
+name: 训练（Training issue）
+about: 您可以提问训练中报错、应用、出core等问题。 You could use this template for reporting an training
+   issue.
+
+---
+
+为使您的问题得到快速解决，在建立Issues前，请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
+
+如果您没有查询到相似问题，为快速解决您的提问，建立issue时请提供如下细节信息：
+- 标题：简洁、精准概括您的问题，例如“Insufficient Memory xxx" ”
+- 版本、环境信息：
+    1）PaddlePaddle版本：请提供您的PaddlePaddle版本号，例如1.1或CommitID
+    2）CPU：预测若用CPU，请提供CPU型号，MKL/OpenBlas/MKLDNN/等数学库使用情况
+    3）GPU：预测若用GPU，请提供GPU型号、CUDA和CUDNN版本号
+    4）系统环境：请您描述系统类型、版本，例如Mac OS 10.14，Python版本
+- 训练信息
+    1）单机/多机，单卡/多卡
+    2）显存信息
+    3）Operator信息
+- 复现信息：如为报错，请给出复现环境、复现步骤
+- 问题描述：请详细描述您的问题，同步贴出报错信息、日志、可复现的代码片段
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before.
+If there is no solution,please make sure that this is a training issue including the following details:
+**System information**
+-PaddlePaddle version （eg.1.1）or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/CUDNN version
+-OS Platform (eg.Mac OS 10.14)
+-Other imformation: Distriuted training/informantion of operator/
+Graphics card storage
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe your current behavior**
+**Code to reproduce the issue**
+**Other info / logs**

From 6a7f83d45df2ff22c49867837c97f0773421ee0c Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Fri, 23 Nov 2018 04:11:28 +0000
Subject: [PATCH 062/113] enable gru jitcode and refine act and lstm jitcode

test=develop
---
 paddle/fluid/operators/math/jit_code.cc       | 183 ++++++++++--------
 paddle/fluid/operators/math/jit_code.h        |  90 ++++-----
 .../fluid/operators/math/jit_kernel_refer.h   |   4 +-
 paddle/fluid/operators/math/jit_kernel_rnn.cc |   6 +-
 .../fluid/operators/math/jit_kernel_test.cc   |   2 +
 5 files changed, 149 insertions(+), 136 deletions(-)

diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc
index 95247ce309..52cbdf685d 100644
--- a/paddle/fluid/operators/math/jit_code.cc
+++ b/paddle/fluid/operators/math/jit_code.cc
@@ -140,32 +140,10 @@ bool VActJitCode::init(int d, operand_type type) {
 }
 
 void VActJitCode::generate() {
-  xmm_t xmm_zero = xmm_t(2);
-  ymm_t ymm_zero = ymm_t(2);
-  if (type_ == operand_type::relu) {
-    vxorps(ymm_zero, ymm_zero, ymm_zero);
-  }
   int offset = 0;
   for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
     vmovups(ymm_src, ptr[param1 + offset]);
-    switch (type_) {
-      case operand_type::relu:
-        relu_jmm<ymm_t>(ymm_dst, ymm_src, ymm_zero);
-        break;
-      case operand_type::exp:
-        exp_jmm<ymm_t>(ymm_dst, ymm_src, 2, 3, 4, 5);
-        break;
-      case operand_type::sigmoid:
-        sigmoid_jmm<ymm_t>(ymm_dst, ymm_src, 2, 3, 4, 5);
-        break;
-      case operand_type::tanh:
-        tanh_jmm<ymm_t>(ymm_dst, ymm_src, 2, 3, 4, 5);
-        break;
-      case operand_type::identity:
-        break;
-      default:
-        break;
-    }
+    act<ymm_t>(ymm_dst, ymm_src, type_);
     vmovups(ptr[param2 + offset], ymm_dst);
     offset += sizeof(float) * YMM_FLOAT_BLOCK;
   }
@@ -182,22 +160,7 @@ void VActJitCode::generate() {
       block = 1;
       vmovss(xmm_src, ptr[param1 + offset]);
     }
-    switch (type_) {
-      case operand_type::relu:
-        relu_jmm<xmm_t>(xmm_dst, xmm_src, xmm_zero);
-        break;
-      case operand_type::exp:
-        exp_jmm<xmm_t>(xmm_dst, xmm_src, 2, 3, 4, 5);
-        break;
-      case operand_type::sigmoid:
-        sigmoid_jmm<xmm_t>(xmm_dst, xmm_src, 2, 3, 4, 5);
-        break;
-      case operand_type::tanh:
-        tanh_jmm<xmm_t>(xmm_dst, xmm_src, 2, 3, 4, 5);
-        break;
-      default:
-        break;
-    }
+    act<xmm_t>(xmm_dst, xmm_src, type_);
     if (rest >= 4) {
       vmovups(ptr[param2 + offset], xmm_dst);
     } else if (rest >= 2) {
@@ -233,52 +196,64 @@ void LSTMJitCode::generate() {
   int offset = 0;
   int d = num_ * sizeof(float);
   for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
-    /* C_t = C_t-1 * fgated + cand_gated * igated*/
-    // c
-    vmovups(ymm_src, ptr[reg_ptr_gates + offset]);
-    act<ymm_t>(ymm_c, ymm_src, act_cand_);
-    // i
-    vmovups(ymm_src, ptr[reg_ptr_gates + offset + d]);
-    if (!compute_c1h1_ && use_peephole_) {
-      ymm_t ymm_wp = ymm_t(2);
-      ymm_t ymm_ct_1 = ymm_t(3);
-      vmovups(ymm_wp, ptr[reg_ptr_wp + offset]);
+    /* gates: W_ch, W_ih, W_fh, W_oh */
+    ymm_t ymm_c = ymm_t(0);
+    ymm_t ymm_i = ymm_t(1);
+    ymm_t ymm_f = ymm_t(2);
+    ymm_t ymm_o = ymm_t(3);
+    ymm_t ymm_ct_1 = ymm_t(4);
+    ymm_t ymm_wp0 = ymm_t(5);
+    ymm_t ymm_wp1 = ymm_t(6);
+    ymm_t ymm_wp2 = ymm_t(7);
+    vmovups(ymm_c, ptr[reg_ptr_gates + offset]);
+    vmovups(ymm_i, ptr[reg_ptr_gates + offset + d]);
+    vmovups(ymm_f, ptr[reg_ptr_gates + offset + 2 * d]);
+    vmovups(ymm_o, ptr[reg_ptr_gates + offset + 3 * d]);
+    if (!compute_c1h1_) {
       vmovups(ymm_ct_1, ptr[reg_ptr_ct_1 + offset]);
-      vmulps(ymm_wp, ymm_ct_1, ymm_wp);
-      vaddps(ymm_src, ymm_src, ymm_wp);
     }
-    act<ymm_t>(ymm_i, ymm_src, act_gate_);
+    if (use_peephole_) {
+      vmovups(ymm_wp0, ptr[reg_ptr_wp + offset]);
+      vmovups(ymm_wp1, ptr[reg_ptr_wp + offset + d]);
+      vmovups(ymm_wp2, ptr[reg_ptr_wp + offset + 2 * d]);
+    }
+    /* C_t = act_cand(c) * act_gate(i) + C_t-1 * act_gate(f) */
+    // act_cand(c)
+    act<ymm_t>(ymm_c, ymm_c, act_cand_);
+    // act_gate(i) or act_gate(ct_1 * wp0 + i)
+    if (!compute_c1h1_ && use_peephole_) {
+      vmulps(ymm_wp0, ymm_ct_1, ymm_wp0);
+      vaddps(ymm_i, ymm_i, ymm_wp0);
+    }
+    act<ymm_t>(ymm_i, ymm_i, act_gate_);
     vmulps(ymm_c, ymm_c, ymm_i);
     if (!compute_c1h1_) {
-      // f
-      vmovups(ymm_src, ptr[reg_ptr_gates + offset + 2 * d]);
-      vmovups(ymm_i, ptr[reg_ptr_ct_1 + offset]);
+      // act_gate(f) or act_gate(ct_1 * wp1 + f)
       if (use_peephole_) {
-        ymm_t ymm_wp = ymm_t(3);
-        vmovups(ymm_wp, ptr[reg_ptr_wp + offset + d]);
-        vmulps(ymm_wp, ymm_i, ymm_wp);
-        vaddps(ymm_src, ymm_src, ymm_wp);
+        vmulps(ymm_wp1, ymm_ct_1, ymm_wp1);
+        vaddps(ymm_f, ymm_f, ymm_wp1);
       }
-      act<ymm_t>(ymm_f, ymm_src, act_gate_);
-      vmulps(ymm_f, ymm_f, ymm_i);
+      act<ymm_t>(ymm_f, ymm_f, act_gate_);
+      // ct
+      vmulps(ymm_f, ymm_f, ymm_ct_1);
       vaddps(ymm_f, ymm_f, ymm_c);
     }
-    /* H_t = act_cell(C_t) * ogated */
+    /* H_t = act_cell(C_t) * act_gate(o) */
+    // act_cell(C_t)
     ymm_t ymm_ct = compute_c1h1_ ? ymm_c : ymm_f;
-    ymm_t ymm_o = compute_c1h1_ ? ymm_f : ymm_c;
     ymm_t ymm_tmp = ymm_i;
-    vmovups(ptr[reg_ptr_ct + offset], ymm_ct);  // save ct
-    vmovups(ymm_src, ptr[reg_ptr_gates + offset + 3 * d]);
+    act<ymm_t>(ymm_tmp, ymm_ct, act_cell_);
+    // act_gate(o) or act_gate(ct * wp2 + o)
     if (use_peephole_) {
-      ymm_t ymm_wp = ymm_t(2);
-      vmovups(ymm_wp, ptr[reg_ptr_wp + offset + d * 2]);
-      vmulps(ymm_wp, ymm_ct, ymm_wp);
-      vaddps(ymm_src, ymm_src, ymm_wp);
+      vmulps(ymm_wp2, ymm_ct, ymm_wp2);
+      vaddps(ymm_o, ymm_o, ymm_wp2);
     }
-    act<ymm_t>(ymm_tmp, ymm_ct, act_cell_);
-    act<ymm_t>(ymm_o, ymm_src, act_gate_);
-    vmulps(ymm_o, ymm_tmp, ymm_o);
-    vmovups(ptr[reg_ptr_ht + offset], ymm_o);  // save ht
+    act<ymm_t>(ymm_o, ymm_o, act_gate_);
+    // ht
+    vmulps(ymm_o, ymm_o, ymm_tmp);
+    // save ct and ht
+    vmovups(ptr[reg_ptr_ct + offset], ymm_ct);
+    vmovups(ptr[reg_ptr_ht + offset], ymm_o);
     offset += sizeof(float) * YMM_FLOAT_BLOCK;
   }
 
@@ -293,13 +268,61 @@ bool GRUJitCode::init(int d) { return MayIUse(avx) && d % 8 == 0; }
 
 void GRUJitCode::generate() {
   reg64_t reg_ptr_gates = rax;
-  reg64_t reg_ptr_ct_1 = r9;
-  reg64_t reg_ptr_ct = r10;
-  reg64_t reg_ptr_ht = r11;
-  mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]);
-  mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]);
-  mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]);
-  mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]);
+  reg64_t reg_ptr_ht_1 = r9;
+  reg64_t reg_ptr_ht = r10;
+  mov(reg_ptr_gates, ptr[param1 + offsetof(gru_t, gates)]);
+  mov(reg_ptr_ht_1, ptr[param1 + offsetof(gru_t, ht_1)]);
+  mov(reg_ptr_ht, ptr[param1 + offsetof(gru_t, ht)]);
+  ymm_t ymm_one = ymm_t(0);
+
+  if (id_ == 2) {
+    reg64_t reg_ptr_tmp = r11;
+    mov(reg_ptr_tmp, reinterpret_cast<size_t>(exp_float_consts));
+    vmovaps(ymm_one, ptr[reg_ptr_tmp + OFFSET_EXP_ONE]);
+  }
+  int offset = 0;
+  int d = num_ * sizeof(float);
+  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
+    ymm_t ymm_u = ymm_t(1);
+    ymm_t ymm_r = ymm_t(2);
+    ymm_t ymm_s = ymm_t(3);
+    ymm_t ymm_ht_1 = ymm_t(4);
+    // W: {W_update, W_reset; W_state}
+    if (id_ == 0 || id_ == 2) {
+      vmovups(ymm_u, ptr[reg_ptr_gates + offset]);
+      vmovups(ymm_s, ptr[reg_ptr_gates + offset + 2 * d]);
+    }
+    if (id_ == 1) {
+      vmovups(ymm_r, ptr[reg_ptr_gates + offset + d]);
+    }
+    if (id_ == 1 || id_ == 2) {
+      vmovups(ymm_ht_1, ptr[reg_ptr_ht_1 + offset]);
+    }
+
+    if (id_ == 0) {
+      // ht = act_gate(u) * act_cand(s)
+      act<ymm_t>(ymm_u, ymm_u, act_gate_);
+      act<ymm_t>(ymm_s, ymm_s, act_cand_);
+      vmulps(ymm_s, ymm_s, ymm_u);
+      vmovups(ptr[reg_ptr_ht + offset], ymm_s);
+    } else if (id_ == 1) {
+      // ht = act_gate(r) * ht_1
+      act<ymm_t>(ymm_r, ymm_r, act_gate_);
+      vmulps(ymm_r, ymm_r, ymm_ht_1);
+      vmovups(ptr[reg_ptr_ht + offset], ymm_r);
+    } else if (id_ == 2) {
+      // ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1
+      ymm_t ymm_one_inner = ymm_t(ymm_one.getIdx());
+      act<ymm_t>(ymm_u, ymm_u, act_gate_);
+      act<ymm_t>(ymm_s, ymm_s, act_cand_);
+      vmulps(ymm_s, ymm_s, ymm_u);
+      vsubps(ymm_u, ymm_one_inner, ymm_u);
+      vmulps(ymm_u, ymm_ht_1, ymm_u);
+      vaddps(ymm_u, ymm_s, ymm_u);
+      vmovups(ptr[reg_ptr_ht + offset], ymm_u);
+    }
+    offset += sizeof(float) * YMM_FLOAT_BLOCK;
+  }
 
   ret();
 }
diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h
index 403cea3991..a921462129 100644
--- a/paddle/fluid/operators/math/jit_code.h
+++ b/paddle/fluid/operators/math/jit_code.h
@@ -169,31 +169,34 @@ class VActJitCode : public JitCode {
  protected:
   // compute relu with ymm, xmm
   template <typename JMM>
-  void relu_jmm(JMM& dst, JMM& src, JMM& zero) {  // NOLINT
+  void relu_jmm(JMM& dst, JMM& src, int zero_idx = 15) {  // NOLINT
+    JMM zero = JMM(zero_idx);
+    vxorps(zero, zero, zero);
     vmaxps(dst, src, zero);
   }
 
   // compute exp with ymm, xmm
   template <typename JMM>
-  void exp_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3,  // NOLINT
-               int mask_idx = 4, int tmp_idx = 5) {
-    using namespace platform::jit;         // NOLINT
-    assert(src.getIdx() != dst.getIdx());  // TODO(TJ): use enfore
+  void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12,  // NOLINT
+               int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) {
+    using namespace platform::jit;  // NOLINT
     // check all idx can not equal
+    JMM jmm_src = JMM(src_idx);
     JMM jmm_fx = JMM(fx_idx);
     JMM jmm_fy = JMM(fy_idx);
     JMM jmm_mask = JMM(mask_idx);
     JMM jmm_tmp = JMM(tmp_idx);
     reg64_t reg_ptr_global = rax;
     push(reg_ptr_global);
+    vmovaps(jmm_src, src);
     mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
     vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]);
-    vminps(src, src, jmm_tmp);
+    vminps(jmm_src, jmm_src, jmm_tmp);
     vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]);
-    vmaxps(src, src, jmm_tmp);
+    vmaxps(jmm_src, jmm_src, jmm_tmp);
     // express exp(x) as exp(g + n*log(2))
     vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]);
-    vmulps(jmm_fx, src, jmm_tmp);
+    vmulps(jmm_fx, jmm_src, jmm_tmp);
     vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]);
     vaddps(jmm_fx, jmm_fx, jmm_tmp);
     vroundps(jmm_fy, jmm_fx, 0x01);
@@ -207,21 +210,21 @@ class VActJitCode : public JitCode {
     vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]);
     JMM ymm_z = JMM(jmm_mask.getIdx());
     vmulps(ymm_z, jmm_fx, jmm_tmp);
-    vsubps(src, src, jmm_fy);
-    vsubps(src, src, ymm_z);
-    vmulps(ymm_z, src, src);
+    vsubps(jmm_src, jmm_src, jmm_fy);
+    vsubps(jmm_src, jmm_src, ymm_z);
+    vmulps(ymm_z, jmm_src, jmm_src);
     vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]);
-    vmulps(dst, src, jmm_tmp);
+    vmulps(dst, jmm_src, jmm_tmp);
     for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5;
          i += (YMM_FLOAT_BLOCK * sizeof(float))) {
       vmovaps(jmm_tmp, ptr[reg_ptr_global + i]);  // P1~P4
       vaddps(dst, dst, jmm_tmp);
-      vmulps(dst, dst, src);
+      vmulps(dst, dst, jmm_src);
     }
     vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]);
     vaddps(dst, dst, jmm_tmp);
     vmulps(dst, dst, ymm_z);
-    vaddps(dst, dst, src);
+    vaddps(dst, dst, jmm_src);
     vmovaps(jmm_tmp, ptr[reg_ptr_global]);
     vaddps(dst, dst, jmm_tmp);
     // build 2^n
@@ -258,20 +261,23 @@ class VActJitCode : public JitCode {
 
   // compute sigmoid with ymm, xmm
   template <typename JMM>
-  void sigmoid_jmm(JMM& dst, JMM& src, int fx_idx = 2,  // NOLINT
-                   int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5) {
+  void sigmoid_jmm(JMM& dst, JMM& src, int src_idx = 11,  // NOLINT
+                   int fx_idx = 12, int fy_idx = 13, int mask_idx = 14,
+                   int tmp_idx = 15) {
     // y = 1 / (1 + e^-x)
     JMM jmm_tmp = JMM(tmp_idx);
+    JMM jmm_src = JMM(src_idx);
     reg64_t reg_ptr_global = rax;
     push(reg_ptr_global);
+    vmovaps(jmm_src, src);
     mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
     vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]);
-    vminps(src, src, jmm_tmp);
+    vminps(jmm_src, jmm_src, jmm_tmp);
     vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]);
-    vmaxps(src, src, jmm_tmp);
+    vmaxps(jmm_src, jmm_src, jmm_tmp);
     vxorps(jmm_tmp, jmm_tmp, jmm_tmp);
-    vsubps(src, jmm_tmp, src);
-    exp_jmm<JMM>(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx);
+    vsubps(jmm_src, jmm_tmp, jmm_src);
+    exp_jmm<JMM>(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, tmp_idx);
     vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
     vaddps(dst, dst, jmm_tmp);
     vdivps(dst, jmm_tmp, dst);
@@ -280,19 +286,22 @@ class VActJitCode : public JitCode {
 
   // compute tanh with ymm, xmm
   template <typename JMM>
-  void tanh_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3,  // NOLINT
-                int mask_idx = 4, int tmp_idx = 5) {
+  void tanh_jmm(JMM& dst, JMM& src, int src_idx = 11,  // NOLINT
+                int fx_idx = 12, int fy_idx = 13, int mask_idx = 14,
+                int tmp_idx = 15) {
     // y = 2 / (1 + e^(-2x)) - 1
+    JMM jmm_src = JMM(src_idx);
     JMM jmm_tmp = JMM(tmp_idx);
     JMM jmm_zero = JMM(mask_idx);
     reg64_t reg_ptr_global = rax;
     push(reg_ptr_global);
+    vmovaps(jmm_src, src);
     mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
     vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]);
     vxorps(jmm_zero, jmm_zero, jmm_zero);
     vsubps(jmm_tmp, jmm_zero, jmm_tmp);
-    vmulps(src, src, jmm_tmp);
-    exp_jmm<JMM>(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx);
+    vmulps(jmm_src, jmm_src, jmm_tmp);
+    exp_jmm<JMM>(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, tmp_idx);
     vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
     vaddps(dst, dst, jmm_tmp);
     vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]);
@@ -304,23 +313,19 @@ class VActJitCode : public JitCode {
 
   template <typename JMM>
   void act(JMM& dst, JMM& src, operand_type type) {  // NOLINT
-    // use 15
-    JMM zero = JMM(15);
-    if (type_ == operand_type::relu) {
-      vxorps(zero, zero, zero);
-    }
+    // use 11~15
     switch (type) {
       case operand_type::relu:
-        relu_jmm<JMM>(dst, src, zero);
+        relu_jmm<JMM>(dst, src, 15);
         break;
       case operand_type::exp:
-        exp_jmm<JMM>(dst, src, 2, 3, 4, 5);
+        exp_jmm<JMM>(dst, src, 11, 12, 13, 14, 15);
         break;
       case operand_type::sigmoid:
-        sigmoid_jmm<JMM>(dst, src, 2, 3, 4, 5);
+        sigmoid_jmm<JMM>(dst, src, 11, 12, 13, 14, 15);
         break;
       case operand_type::tanh:
-        tanh_jmm<JMM>(dst, src, 2, 3, 4, 5);
+        tanh_jmm<JMM>(dst, src, 11, 12, 13, 14, 15);
         break;
       case operand_type::identity:
         break;
@@ -414,15 +419,6 @@ class LSTMJitCode : public VActJitCode {
   operand_type act_cand_;
   operand_type act_cell_;
   reg64_t param1{abi_param1};
-  xmm_t xmm_src = xmm_t(0);
-  xmm_t xmm_c = xmm_t(1);
-  xmm_t xmm_i = xmm_t(6);
-  xmm_t xmm_f = xmm_t(7);
-
-  ymm_t ymm_src = ymm_t(0);
-  ymm_t ymm_c = ymm_t(1);  // 2~5 for act
-  ymm_t ymm_i = ymm_t(6);
-  ymm_t ymm_f = ymm_t(7);
 };
 
 class GRUJitCode : public VActJitCode {
@@ -492,16 +488,6 @@ class GRUJitCode : public VActJitCode {
   operand_type act_gate_;
   operand_type act_cand_;
   reg64_t param1{abi_param1};
-
-  xmm_t xmm_src = xmm_t(0);
-  xmm_t xmm_c = xmm_t(1);
-  xmm_t xmm_i = xmm_t(6);
-  xmm_t xmm_f = xmm_t(7);
-
-  ymm_t ymm_src = ymm_t(0);
-  ymm_t ymm_c = ymm_t(1);
-  ymm_t ymm_i = ymm_t(6);
-  ymm_t ymm_f = ymm_t(7);
 };
 
 #ifdef PADDLE_WITH_MKLDNN
diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h
index 2e1a7f22db..bcb6615df8 100644
--- a/paddle/fluid/operators/math/jit_kernel_refer.h
+++ b/paddle/fluid/operators/math/jit_kernel_refer.h
@@ -206,7 +206,7 @@ void GRUHtPart1(gru_t* step, const gru_attr_t* attr) {
   T* ht = reinterpret_cast<T*>(step->ht);
   const T* ht_1 = reinterpret_cast<const T*>(step->ht_1);
   auto act_gate = getActFunc<T>(attr->act_gate);
-  act_gate(gates, gates, attr->d * 2);
+  act_gate(gates + attr->d, gates + attr->d, attr->d);
   VMul(ht_1, gates + attr->d, ht, attr->d);
 }
 
@@ -215,9 +215,11 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr) {
   T* gates = reinterpret_cast<T*>(step->gates);
   T* ht = reinterpret_cast<T*>(step->ht);
   const T* ht_1 = reinterpret_cast<const T*>(step->ht_1);
+  auto act_gate = getActFunc<T>(attr->act_gate);
   auto act_cand = getActFunc<T>(attr->act_cand);
   int d = attr->d;
   T* y = gates + d * 2;
+  act_gate(gates, gates, d);
   act_cand(y, y, d);
   // out = zt*ht~ + (1-zt)*ht_1
   for (int i = 0; i < d; ++i) {
diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc
index 85ea95cfcc..2db3274a45 100644
--- a/paddle/fluid/operators/math/jit_kernel_rnn.cc
+++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc
@@ -177,7 +177,7 @@ class GRUKernelImpl : public GRUKernel<T> {
   explicit GRUKernelImpl(const gru_attr_t& attr) : GRUKernel<T>() {
 #ifdef PADDLE_WITH_XBYAK
     if (useJIT(attr.d)) {
-      size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 84 * 8;  // should change
+      size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8;
       jitcode0_.reset(new gen::GRUJitCode(0, attr, sz > 4096 ? sz : 4096));
       this->ComputeH1 =
           jitcode0_->getCode<void (*)(gru_t*, const gru_attr_t*)>();
@@ -188,7 +188,7 @@ class GRUKernelImpl : public GRUKernel<T> {
 
       jitcode2_.reset(new gen::GRUJitCode(2, attr, sz > 4096 ? sz : 4096));
       this->ComputeHtPart2 =
-          jitcode1_->getCode<void (*)(gru_t*, const gru_attr_t*)>();
+          jitcode2_->getCode<void (*)(gru_t*, const gru_attr_t*)>();
       return;
     }
 #endif
@@ -207,7 +207,7 @@ class GRUKernelImpl : public GRUKernel<T> {
 #ifdef PADDLE_WITH_XBYAK
 template <>
 bool GRUKernelImpl<float>::useJIT(int d) {
-  return false;  // jitcode not ready yet
+  return gen::GRUJitCode::init(d);
 }
 #endif
 
diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc
index 1cbe1b5d95..cc8a5d4d86 100644
--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
@@ -714,6 +714,8 @@ TEST(JitKernel, pool) {
   std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh";
   jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false);
 
+  // empty call it to avoid unknown flag 'use_pinned_memory' on Mac
+  paddle::platform::jit::MayIUse(paddle::platform::jit::avx);
   const auto& plstm1 =
       jit::KernelPool::Instance()
           .template Get<jit::LSTMKernel<float>, const jit::lstm_attr_t&>(attr);

From 36f08eef3b466001f339e2c33f47dac60bbc6821 Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Fri, 23 Nov 2018 13:04:41 +0800
Subject: [PATCH 063/113] CUDA kernel for density_prior_box_op. (#14513)

* CUDA kernel for density_prior_box_op.
* Support flatten to 2D.
---
 paddle/fluid/API.spec                         |   2 +-
 paddle/fluid/framework/op_desc.cc             |   6 +
 .../fluid/operators/detection/CMakeLists.txt  |   2 +-
 .../detection/density_prior_box_op.cc         |  36 ++--
 .../detection/density_prior_box_op.cu         | 170 ++++++++++++++++++
 .../detection/density_prior_box_op.h          |  73 ++++----
 python/paddle/fluid/layers/detection.py       |  43 +++--
 python/paddle/fluid/tests/test_detection.py   |  60 ++++---
 .../unittests/test_density_prior_box_op.py    |  30 ++--
 9 files changed, 305 insertions(+), 117 deletions(-)
 create mode 100644 paddle/fluid/operators/detection/density_prior_box_op.cu

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 541c4db1fa..50114bf3df 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -276,7 +276,7 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k
 paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False))
-paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, None))
+paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None))
 paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False))
 paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index fbaa169df6..362cda3f23 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -252,6 +252,12 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
         this->attrs_[name] = std::vector<int>();
         break;
       }
+      case proto::AttrType::LONGS: {
+        VLOG(110) << "SetAttr: " << Type() << ", " << name
+                  << " from LONGS to LONGS";
+        this->attrs_[name] = std::vector<int64_t>();
+        break;
+      }
       case proto::AttrType::FLOATS: {
         VLOG(110) << "SetAttr: " << Type() << ", " << name
                   << " from INTS to FLOATS";
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index 58f6f48467..6c85f1577e 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -22,7 +22,7 @@ iou_similarity_op.cu)
 detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc)
 detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc)
 detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
-detection_library(density_prior_box_op SRCS density_prior_box_op.cc)
+detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu)
 detection_library(anchor_generator_op SRCS anchor_generator_op.cc
 anchor_generator_op.cu)
 detection_library(target_assign_op SRCS target_assign_op.cc
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cc b/paddle/fluid/operators/detection/density_prior_box_op.cc
index 99df15c322..1012ba3652 100644
--- a/paddle/fluid/operators/detection/density_prior_box_op.cc
+++ b/paddle/fluid/operators/detection/density_prior_box_op.cc
@@ -39,24 +39,27 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel {
     auto fixed_sizes = ctx->Attrs().Get<std::vector<float>>("fixed_sizes");
     auto fixed_ratios = ctx->Attrs().Get<std::vector<float>>("fixed_ratios");
     auto densities = ctx->Attrs().Get<std::vector<int>>("densities");
+    bool flatten = ctx->Attrs().Get<bool>("flatten_to_2d");
 
     PADDLE_ENFORCE_EQ(fixed_sizes.size(), densities.size(),
                       "The number of fixed_sizes and densities must be equal.");
     size_t num_priors = 0;
-    if ((fixed_sizes.size() > 0) && (densities.size() > 0)) {
-      for (size_t i = 0; i < densities.size(); ++i) {
-        if (fixed_ratios.size() > 0) {
-          num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
-        }
-      }
+    for (size_t i = 0; i < densities.size(); ++i) {
+      num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
+    }
+    if (!flatten) {
+      std::vector<int64_t> dim_vec(4);
+      dim_vec[0] = input_dims[2];
+      dim_vec[1] = input_dims[3];
+      dim_vec[2] = num_priors;
+      dim_vec[3] = 4;
+      ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
+      ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
+    } else {
+      int64_t dim0 = input_dims[2] * input_dims[3] * num_priors;
+      ctx->SetOutputDim("Boxes", {dim0, 4});
+      ctx->SetOutputDim("Variances", {dim0, 4});
     }
-    std::vector<int64_t> dim_vec(4);
-    dim_vec[0] = input_dims[2];
-    dim_vec[1] = input_dims[3];
-    dim_vec[2] = num_priors;
-    dim_vec[3] = 4;
-    ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
-    ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
   }
 
  protected:
@@ -64,7 +67,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
-        platform::CPUPlace());
+        ctx.GetPlace());
   }
 };
 
@@ -101,7 +104,10 @@ class DensityPriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
         });
     AddAttr<bool>("clip", "(bool) Whether to clip out-of-boundary boxes.")
         .SetDefault(true);
-
+    AddAttr<bool>("flatten_to_2d",
+                  "(bool) Whether to flatten to 2D and "
+                  "the second dim is 4.")
+        .SetDefault(false);
     AddAttr<float>(
         "step_w",
         "Density prior boxes step across width, 0.0 for auto calculation.")
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu
new file mode 100644
index 0000000000..3b7c781795
--- /dev/null
+++ b/paddle/fluid/operators/detection/density_prior_box_op.cu
@@ -0,0 +1,170 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/density_prior_box_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+static __device__ inline T Clip(T in) {
+  return min(max(in, 0.), 1.);
+}
+
+template <typename T>
+static __global__ void GenDensityPriorBox(
+    const int height, const int width, const int im_height, const int im_width,
+    const T offset, const T step_width, const T step_height,
+    const int num_priors, const T* ratios_shift, bool is_clip, const T var_xmin,
+    const T var_ymin, const T var_xmax, const T var_ymax, T* out, T* var) {
+  int gidx = blockIdx.x * blockDim.x + threadIdx.x;
+  int gidy = blockIdx.y * blockDim.y + threadIdx.y;
+  int step_x = blockDim.x * gridDim.x;
+  int step_y = blockDim.y * gridDim.y;
+
+  const T* width_ratio = ratios_shift;
+  const T* height_ratio = ratios_shift + num_priors;
+  const T* width_shift = ratios_shift + 2 * num_priors;
+  const T* height_shift = ratios_shift + 3 * num_priors;
+
+  for (int j = gidy; j < height; j += step_y) {
+    for (int i = gidx; i < width * num_priors; i += step_x) {
+      int h = j;
+      int w = i / num_priors;
+      int k = i % num_priors;
+
+      T center_x = (w + offset) * step_width;
+      T center_y = (h + offset) * step_height;
+
+      T center_x_temp = center_x + width_shift[k];
+      T center_y_temp = center_y + height_shift[k];
+
+      T box_width_ratio = width_ratio[k] / 2.;
+      T box_height_ratio = height_ratio[k] / 2.;
+
+      T xmin = max((center_x_temp - box_width_ratio) / im_width, 0.);
+      T ymin = max((center_y_temp - box_height_ratio) / im_height, 0.);
+      T xmax = min((center_x_temp + box_width_ratio) / im_width, 1.);
+      T ymax = min((center_y_temp + box_height_ratio) / im_height, 1.);
+
+      int out_offset = (j * width * num_priors + i) * 4;
+      out[out_offset] = is_clip ? Clip<T>(xmin) : xmin;
+      out[out_offset + 1] = is_clip ? Clip<T>(ymin) : ymin;
+      out[out_offset + 2] = is_clip ? Clip<T>(xmax) : xmax;
+      out[out_offset + 3] = is_clip ? Clip<T>(ymax) : ymax;
+
+      var[out_offset] = var_xmin;
+      var[out_offset + 1] = var_ymin;
+      var[out_offset + 2] = var_xmax;
+      var[out_offset + 3] = var_ymax;
+    }
+  }
+}
+
+template <typename T>
+class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
+    auto* image = ctx.Input<paddle::framework::Tensor>("Image");
+    auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
+    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
+
+    auto variances = ctx.Attr<std::vector<float>>("variances");
+    auto is_clip = ctx.Attr<bool>("clip");
+
+    auto fixed_sizes = ctx.Attr<std::vector<float>>("fixed_sizes");
+    auto fixed_ratios = ctx.Attr<std::vector<float>>("fixed_ratios");
+    auto densities = ctx.Attr<std::vector<int>>("densities");
+
+    T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
+    T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
+    T offset = static_cast<T>(ctx.Attr<float>("offset"));
+
+    auto img_width = image->dims()[3];
+    auto img_height = image->dims()[2];
+
+    auto feature_width = input->dims()[3];
+    auto feature_height = input->dims()[2];
+
+    T step_width, step_height;
+    if (step_w == 0 || step_h == 0) {
+      step_width = static_cast<T>(img_width) / feature_width;
+      step_height = static_cast<T>(img_height) / feature_height;
+    } else {
+      step_width = step_w;
+      step_height = step_h;
+    }
+
+    int num_priors = 0;
+    for (size_t i = 0; i < densities.size(); ++i) {
+      num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
+    }
+    int step_average = static_cast<int>((step_width + step_height) * 0.5);
+
+    framework::Tensor h_temp;
+    T* tdata = h_temp.mutable_data<T>({num_priors * 4}, platform::CPUPlace());
+    int idx = 0;
+    for (size_t s = 0; s < fixed_sizes.size(); ++s) {
+      auto fixed_size = fixed_sizes[s];
+      int density = densities[s];
+      for (size_t r = 0; r < fixed_ratios.size(); ++r) {
+        float ar = fixed_ratios[r];
+        int shift = step_average / density;
+        float box_width_ratio = fixed_size * sqrt(ar);
+        float box_height_ratio = fixed_size / sqrt(ar);
+        for (int di = 0; di < density; ++di) {
+          for (int dj = 0; dj < density; ++dj) {
+            float center_x_temp = shift / 2. + dj * shift - step_average / 2.;
+            float center_y_temp = shift / 2. + di * shift - step_average / 2.;
+            tdata[idx] = box_width_ratio;
+            tdata[num_priors + idx] = box_height_ratio;
+            tdata[2 * num_priors + idx] = center_x_temp;
+            tdata[3 * num_priors + idx] = center_y_temp;
+            idx++;
+          }
+        }
+      }
+    }
+
+    boxes->mutable_data<T>(ctx.GetPlace());
+    vars->mutable_data<T>(ctx.GetPlace());
+
+    framework::Tensor d_temp;
+    framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp);
+
+    // At least use 32 threads, at most 512 threads.
+    // blockx is multiple of 32.
+    int blockx = std::min(((feature_width * num_priors + 31) >> 5) << 5, 512L);
+    int gridx = (feature_width * num_priors + blockx - 1) / blockx;
+    dim3 threads(blockx, 1);
+    dim3 grids(gridx, feature_height);
+
+    auto stream =
+        ctx.template device_context<platform::CUDADeviceContext>().stream();
+    GenDensityPriorBox<T><<<grids, threads, 0, stream>>>(
+        feature_height, feature_width, img_height, img_width, offset,
+        step_width, step_height, num_priors, d_temp.data<T>(), is_clip,
+        variances[0], variances[1], variances[2], variances[3],
+        boxes->data<T>(), vars->data<T>());
+  }
+};  // namespace operators
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(density_prior_box,
+                        ops::DensityPriorBoxOpCUDAKernel<float>,
+                        ops::DensityPriorBoxOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h
index 9a52077e9c..ed2f5df80c 100644
--- a/paddle/fluid/operators/detection/density_prior_box_op.h
+++ b/paddle/fluid/operators/detection/density_prior_box_op.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -52,18 +52,16 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
       step_height = step_h;
     }
     int num_priors = 0;
-    if (fixed_sizes.size() > 0 && densities.size() > 0) {
-      for (size_t i = 0; i < densities.size(); ++i) {
-        if (fixed_ratios.size() > 0) {
-          num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
-        }
-      }
+    for (size_t i = 0; i < densities.size(); ++i) {
+      num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
     }
 
     boxes->mutable_data<T>(ctx.GetPlace());
     vars->mutable_data<T>(ctx.GetPlace());
-    auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes).setConstant(0.0);
 
+    auto box_dim = vars->dims();
+    boxes->Resize({feature_height, feature_width, num_priors, 4});
+    auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes).setConstant(0.0);
     int step_average = static_cast<int>((step_width + step_height) * 0.5);
 
     for (int h = 0; h < feature_height; ++h) {
@@ -76,36 +74,34 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
           auto fixed_size = fixed_sizes[s];
           int density = densities[s];
           // Generate density prior boxes with fixed ratios.
-          if (fixed_ratios.size() > 0) {
-            for (size_t r = 0; r < fixed_ratios.size(); ++r) {
-              float ar = fixed_ratios[r];
-              int shift = step_average / density;
-              float box_width_ratio = fixed_size * sqrt(ar);
-              float box_height_ratio = fixed_size / sqrt(ar);
-              for (int di = 0; di < density; ++di) {
-                for (int dj = 0; dj < density; ++dj) {
-                  float center_x_temp =
-                      center_x - step_average / 2. + shift / 2. + dj * shift;
-                  float center_y_temp =
-                      center_y - step_average / 2. + shift / 2. + di * shift;
-                  e_boxes(h, w, idx, 0) =
-                      (center_x_temp - box_width_ratio / 2.) / img_width >= 0
-                          ? (center_x_temp - box_width_ratio / 2.) / img_width
-                          : 0;
-                  e_boxes(h, w, idx, 1) =
-                      (center_y_temp - box_height_ratio / 2.) / img_height >= 0
-                          ? (center_y_temp - box_height_ratio / 2.) / img_height
-                          : 0;
-                  e_boxes(h, w, idx, 2) =
-                      (center_x_temp + box_width_ratio / 2.) / img_width <= 1
-                          ? (center_x_temp + box_width_ratio / 2.) / img_width
-                          : 1;
-                  e_boxes(h, w, idx, 3) =
-                      (center_y_temp + box_height_ratio / 2.) / img_height <= 1
-                          ? (center_y_temp + box_height_ratio / 2.) / img_height
-                          : 1;
-                  idx++;
-                }
+          for (size_t r = 0; r < fixed_ratios.size(); ++r) {
+            float ar = fixed_ratios[r];
+            int shift = step_average / density;
+            float box_width_ratio = fixed_size * sqrt(ar);
+            float box_height_ratio = fixed_size / sqrt(ar);
+            for (int di = 0; di < density; ++di) {
+              for (int dj = 0; dj < density; ++dj) {
+                float center_x_temp =
+                    center_x - step_average / 2. + shift / 2. + dj * shift;
+                float center_y_temp =
+                    center_y - step_average / 2. + shift / 2. + di * shift;
+                e_boxes(h, w, idx, 0) =
+                    (center_x_temp - box_width_ratio / 2.) / img_width >= 0
+                        ? (center_x_temp - box_width_ratio / 2.) / img_width
+                        : 0;
+                e_boxes(h, w, idx, 1) =
+                    (center_y_temp - box_height_ratio / 2.) / img_height >= 0
+                        ? (center_y_temp - box_height_ratio / 2.) / img_height
+                        : 0;
+                e_boxes(h, w, idx, 2) =
+                    (center_x_temp + box_width_ratio / 2.) / img_width <= 1
+                        ? (center_x_temp + box_width_ratio / 2.) / img_width
+                        : 1;
+                e_boxes(h, w, idx, 3) =
+                    (center_y_temp + box_height_ratio / 2.) / img_height <= 1
+                        ? (center_y_temp + box_height_ratio / 2.) / img_height
+                        : 1;
+                idx++;
               }
             }
           }
@@ -139,6 +135,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
     e_vars = var_et.broadcast(Eigen::DSizes<int, 2>(box_num, 1));
 
     vars->Resize(var_dim);
+    boxes->Resize(box_dim);
   }
 };  // namespace operators
 
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 3f17400a14..4843af8340 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -1029,6 +1029,7 @@ def density_prior_box(input,
                       clip=False,
                       steps=[0.0, 0.0],
                       offset=0.5,
+                      flatten_to_2d=False,
                       name=None):
     """
     **Density Prior Box Operator**
@@ -1065,22 +1066,24 @@ def density_prior_box(input,
             height/weight of the input will be automatically calculated.
             Default: [0., 0.]
        offset(float): Prior boxes center offset. Default: 0.5
+       flatten_to_2d(bool): Whether to flatten output prior boxes and variance
+           to 2D shape, the second dim is 4. Default: False.
        name(str): Name of the density prior box op. Default: None.
 
     Returns:
         tuple: A tuple with two Variable (boxes, variances)
 
         boxes: the output density prior boxes of PriorBox.
-        The layout is [H, W, num_priors, 4].
-        H is the height of input, W is the width of input,
-        num_priors is the total
-        box count of each position of input.
+            The layout is [H, W, num_priors, 4] when flatten_to_2d is False.
+            The layout is [H * W * num_priors, 4] when flatten_to_2d is True.
+            H is the height of input, W is the width of input,
+            num_priors is the total box count of each position of input.
 
         variances: the expanded variances of PriorBox.
-        The layout is [H, W, num_priors, 4].
-        H is the height of input, W is the width of input
-        num_priors is the total
-        box count of each position of input
+            The layout is [H, W, num_priors, 4] when flatten_to_2d is False.
+            The layout is [H * W * num_priors, 4] when flatten_to_2d is True.
+            H is the height of input, W is the width of input
+            num_priors is the total box count of each position of input.
 
 
     Examples:
@@ -1089,14 +1092,11 @@ def density_prior_box(input,
             box, var = fluid.layers.density_prior_box(
                 input=conv1,
                 image=images,
-                min_sizes=[100.],
-                max_sizes=[200.],
-                aspect_ratios=[1.0, 1.0 / 2.0, 2.0],
-                densities=[3, 4],
-                fixed_sizes=[50., 60.],
-                fixed_ratios=[1.0, 3.0, 1.0 / 3.0],
-                flip=True,
-                clip=True)
+                densities=[4, 2, 1],
+                fixed_sizes=[32.0, 64.0, 128.0],
+                fixed_ratios=[1.],
+                clip=True,
+                flatten_to_2d=True)
     """
     helper = LayerHelper("density_prior_box", **locals())
     dtype = helper.input_dtype()
@@ -1127,14 +1127,11 @@ def density_prior_box(input,
         'step_w': steps[0],
         'step_h': steps[1],
         'offset': offset,
+        'densities': densities,
+        'fixed_sizes': fixed_sizes,
+        'fixed_ratios': fixed_ratios,
+        'flatten_to_2d': flatten_to_2d,
     }
-    if densities is not None and len(densities) > 0:
-        attrs['densities'] = densities
-    if fixed_sizes is not None and len(fixed_sizes) > 0:
-        attrs['fixed_sizes'] = fixed_sizes
-    if fixed_ratios is not None and len(fixed_ratios) > 0:
-        attrs['fixed_ratios'] = fixed_ratios
-
     box = helper.create_variable_for_type_inference(dtype)
     var = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index 982d291801..a2eca5541a 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -112,38 +112,42 @@ class TestDetection(unittest.TestCase):
 
 class TestPriorBox(unittest.TestCase):
     def test_prior_box(self):
-        data_shape = [3, 224, 224]
-        images = fluid.layers.data(
-            name='pixel', shape=data_shape, dtype='float32')
-        conv1 = fluid.layers.conv2d(images, 3, 3, 2)
-        box, var = layers.prior_box(
-            input=conv1,
-            image=images,
-            min_sizes=[100.0],
-            aspect_ratios=[1.],
-            flip=True,
-            clip=True)
-        assert len(box.shape) == 4
-        assert box.shape == var.shape
-        assert box.shape[3] == 4
+        program = Program()
+        with program_guard(program):
+            data_shape = [3, 224, 224]
+            images = fluid.layers.data(
+                name='pixel', shape=data_shape, dtype='float32')
+            conv1 = fluid.layers.conv2d(images, 3, 3, 2)
+            box, var = layers.prior_box(
+                input=conv1,
+                image=images,
+                min_sizes=[100.0],
+                aspect_ratios=[1.],
+                flip=True,
+                clip=True)
+            assert len(box.shape) == 4
+            assert box.shape == var.shape
+            assert box.shape[3] == 4
 
 
 class TestDensityPriorBox(unittest.TestCase):
     def test_density_prior_box(self):
-        data_shape = [3, 224, 224]
-        images = fluid.layers.data(
-            name='pixel', shape=data_shape, dtype='float32')
-        conv1 = fluid.layers.conv2d(images, 3, 3, 2)
-        box, var = layers.density_prior_box(
-            input=conv1,
-            image=images,
-            densities=[3, 4],
-            fixed_sizes=[50., 60.],
-            fixed_ratios=[1.0],
-            clip=True)
-        assert len(box.shape) == 4
-        assert box.shape == var.shape
-        assert box.shape[3] == 4
+        program = Program()
+        with program_guard(program):
+            data_shape = [3, 224, 224]
+            images = fluid.layers.data(
+                name='pixel', shape=data_shape, dtype='float32')
+            conv1 = fluid.layers.conv2d(images, 3, 3, 2)
+            box, var = layers.density_prior_box(
+                input=conv1,
+                image=images,
+                densities=[3, 4],
+                fixed_sizes=[50., 60.],
+                fixed_ratios=[1.0],
+                clip=True)
+            assert len(box.shape) == 4
+            assert box.shape == var.shape
+            assert box.shape[-1] == 4
 
 
 class TestAnchorGenerator(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
index 79d1fd3d71..4b0bc1dcf8 100644
--- a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
+++ b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
@@ -36,7 +36,8 @@ class TestDensityPriorBoxOp(OpTest):
             'offset': self.offset,
             'densities': self.densities,
             'fixed_sizes': self.fixed_sizes,
-            'fixed_ratios': self.fixed_ratios
+            'fixed_ratios': self.fixed_ratios,
+            'flatten_to_2d': self.flatten_to_2d
         }
         self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var}
 
@@ -48,16 +49,17 @@ class TestDensityPriorBoxOp(OpTest):
         self.set_data()
 
     def set_density(self):
-        self.densities = []
-        self.fixed_sizes = []
-        self.fixed_ratios = []
+        self.densities = [4, 2, 1]
+        self.fixed_sizes = [32.0, 64.0, 128.0]
+        self.fixed_ratios = [1.0]
+        self.layer_w = 17
+        self.layer_h = 17
+        self.image_w = 533
+        self.image_h = 533
+        self.flatten_to_2d = False
 
     def init_test_params(self):
-        self.layer_w = 32
-        self.layer_h = 32
-
-        self.image_w = 40
-        self.image_h = 40
+        self.set_density()
 
         self.step_w = float(self.image_w) / float(self.layer_w)
         self.step_h = float(self.image_h) / float(self.layer_h)
@@ -69,8 +71,6 @@ class TestDensityPriorBoxOp(OpTest):
         self.variances = [0.1, 0.1, 0.2, 0.2]
         self.variances = np.array(self.variances, dtype=np.float).flatten()
 
-        self.set_density()
-
         self.clip = True
         self.num_priors = 0
         if len(self.fixed_sizes) > 0 and len(self.densities) > 0:
@@ -129,6 +129,9 @@ class TestDensityPriorBoxOp(OpTest):
                           (self.layer_h, self.layer_w, self.num_priors, 1))
         self.out_boxes = out_boxes.astype('float32')
         self.out_var = out_var.astype('float32')
+        if self.flatten_to_2d:
+            self.out_boxes = self.out_boxes.reshape((-1, 4))
+            self.out_var = self.out_var.reshape((-1, 4))
 
 
 class TestDensityPriorBox(TestDensityPriorBoxOp):
@@ -136,6 +139,11 @@ class TestDensityPriorBox(TestDensityPriorBoxOp):
         self.densities = [3, 4]
         self.fixed_sizes = [1.0, 2.0]
         self.fixed_ratios = [1.0]
+        self.layer_w = 32
+        self.layer_h = 32
+        self.image_w = 40
+        self.image_h = 40
+        self.flatten_to_2d = True
 
 
 if __name__ == '__main__':

From e950ce7148759472d67856d7c1ba7ec35fe364cd Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Fri, 23 Nov 2018 05:54:20 +0000
Subject: [PATCH 064/113] test for mac

---
 python/paddle/fluid/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index a6416326ed..ac2a7ea47e 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -119,7 +119,8 @@ def __bootstrap__():
         'eager_delete_tensor_gb', 'allocator_strategy',
         'reader_queue_speed_test_mode', 'print_sub_graph_dir'
     ]
-    if sysstr != 'Darwin':
+    if 'Darwin' not in sysstr:
+        print("aaaaa")
         read_env_flags.append('use_pinned_memory')
 
     if os.name != 'nt':

From 61c5f13fcf92c18f30c05a90e3d3badd884f9340 Mon Sep 17 00:00:00 2001
From: sabreshao <sabre.shao@amd.com>
Date: Fri, 23 Nov 2018 14:27:39 +0800
Subject: [PATCH 065/113] Fix cmake for AMDGPU platform (#13801)

* HIP cmake.
Enable whole archieve build for pybind library.

Disable two warning.

Rollback to C++11.

Link RCCL to WA gpu kernel loading issue.

Update eigen to fix build failure.

Add more include directories.

Fix O3 build failure.

Update eigen.

fix tensor_util_test segment fault issue

add more macro check in hip.cmake.
we may consider refine hip.cmake to inherit all add_definitions() in parrent scope, in the future.

Fix rocRAND load.

Update eigen to fix gru_unit_op and reduce_op.

Add HIP support to testing.

Update eigen to support int16 and int8 in arg min and arg max.

* add rocprim as cub library used by nv implementation

* Reduce build time in rocprim.

* Add rocprim introduction, remove useless cmake code.

* Remove useless flags and format cmake file.
---
 CMakeLists.txt                      |  1 +
 cmake/external/eigen.cmake          |  2 +-
 cmake/external/rocprim.cmake        | 44 +++++++++++++++++++++++++++++
 cmake/flags.cmake                   |  3 ++
 cmake/generic.cmake                 | 26 +++++++++--------
 cmake/hip.cmake                     | 32 +++++++++++++++++----
 paddle/fluid/pybind/CMakeLists.txt  |  4 +--
 paddle/testing/paddle_gtest_main.cc |  2 +-
 8 files changed, 94 insertions(+), 20 deletions(-)
 create mode 100644 cmake/external/rocprim.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3059ab7e0e..8dcf9786e3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -204,6 +204,7 @@ include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/cub)
+include(external/rocprim)
 include(external/xxhash)    # download xxhash
 include(external/dlpack)
 include(external/snappy)    # download snappy
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 573ad5e5f0..6aef97f212 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -17,7 +17,7 @@ if(WITH_AMD_GPU)
         extern_eigen3
         ${EXTERNAL_PROJECT_LOG_ARGS}
         GIT_REPOSITORY  "https://github.com/sabreshao/hipeigen.git"
-        GIT_TAG         0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
+        GIT_TAG         7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e
         PREFIX          ${EIGEN_SOURCE_DIR}
         UPDATE_COMMAND  ""
         CONFIGURE_COMMAND ""
diff --git a/cmake/external/rocprim.cmake b/cmake/external/rocprim.cmake
new file mode 100644
index 0000000000..914c064918
--- /dev/null
+++ b/cmake/external/rocprim.cmake
@@ -0,0 +1,44 @@
+if (NOT WITH_AMD_GPU)
+    return()
+endif()
+
+# rocprim is "ROCm Parallel Primitives" for short.
+# It is a header-only library providing HIP and HC parallel primitives
+# for developing performant GPU-accelerated code on AMD ROCm platform.
+
+if("x${HCC_HOME}" STREQUAL "x")
+  set(HCC_HOME "/opt/rocm/hcc")
+endif()
+
+INCLUDE(ExternalProject)
+
+SET(ROCPRIM_SOURCE_DIR ${THIRD_PARTY_PATH}/rocprim)
+SET(ROCPRIM_INSTALL_DIR  ${THIRD_PARTY_PATH}/install/rocprim)
+SET(ROCPRIM_INCLUDE_DIR ${ROCPRIM_INSTALL_DIR}/include)
+
+ExternalProject_Add(
+    extern_rocprim
+    GIT_REPOSITORY "https://github.com/ROCmSoftwarePlatform/rocPRIM.git"
+    GIT_TAG        5bd41b96ab8d8343330fb2c3e1b96775bde3b3fc 
+    PREFIX         ${ROCPRIM_SOURCE_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS     -DCMAKE_CXX_COMPILER=${HCC_HOME}/bin/hcc
+    CMAKE_ARGS     -DONLY_INSTALL=ON
+    CMAKE_ARGS     -DBUILD_TEST=OFF
+    CMAKE_ARGS     -DCMAKE_INSTALL_PREFIX=${ROCPRIM_INSTALL_DIR}
+
+    INSTALL_DIR    ${ROCPRIM_INSTALL_DIR}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+)
+
+INCLUDE_DIRECTORIES(${ROCPRIM_INCLUDE_DIR})
+
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/rocprim_dummy.c)
+    file(WRITE ${dummyfile} "const char *dummy_rocprim = \"${dummyfile}\";")
+    add_library(rocprim STATIC ${dummyfile})
+else()
+    add_library(rocprim INTERFACE)
+endif()
+
+add_dependencies(rocprim extern_rocprim)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 343e44ab4b..c4472040ce 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -129,6 +129,9 @@ set(COMMON_FLAGS
     -Wno-error=parentheses-equality # Warnings in pybind11
     -Wno-error=ignored-attributes  # Warnings in Eigen, gcc 6.3
     -Wno-error=terminate  # Warning in PADDLE_ENFORCE
+    -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2
+    -Wimplicit-fallthrough=0 # Warning in tinyformat.h
+    -Wno-error=maybe-uninitialized # Warning in boost gcc 7.2
 )
 
 set(GPU_COMMON_FLAGS
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 111627a932..7d803d00ef 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -454,25 +454,29 @@ function(hip_library TARGET_NAME)
       else()
         add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources})
         set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
-        target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a)
-	find_fluid_modules(${TARGET_NAME})
+        target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a /opt/rocm/rccl/lib/librccl.so /opt/rocm/hiprand/lib/libhiprand.so)
+        find_fluid_modules(${TARGET_NAME})
       endif()
-      if (hip_library_DEPS)
-	add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
-	target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+      if("${hip_library_DEPS}" MATCHES "ARCHIVE_START")
+        # Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
+        # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
+        target_circle_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+        list(REMOVE_ITEM hip_library_DEPS ARCHIVE_START ARCHIVE_END)
+      else()
+        target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
       endif()
       # cpplint code style
       foreach(source_file ${hip_library_SRCS})
-	string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
-	if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-	  list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-	endif()
+        string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+        if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+          list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+        endif()
       endforeach()
     else(hip_library_SRCS)
       if (hip_library_DEPS)
-	merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
+        merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
       else()
-	message(FATAL "Please specify source file or library in nv_library.")
+        message(FATAL "Please specify source file or library in nv_library.")
       endif()
     endif(hip_library_SRCS)
   endif()
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index bfe491bd6b..4276bc5b08 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -3,6 +3,8 @@ if(NOT WITH_AMD_GPU)
 endif()
 
 include_directories("/opt/rocm/include")
+include_directories("/opt/rocm/hip/include")
+include_directories("/opt/rocm/miopen/include")
 include_directories("/opt/rocm/hipblas/include")
 include_directories("/opt/rocm/hiprand/include")
 include_directories("/opt/rocm/rocrand/include")
@@ -11,20 +13,40 @@ include_directories("/opt/rocm/thrust")
 
 list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")
 
-set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" )
+set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" )
 
 if(WITH_DSO)
   set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
 endif(WITH_DSO)
 
-if(WITH_DOUBLE)
-  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE")
-endif(WITH_DOUBLE)
-
 if(WITH_TESTING)
   set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING")
 endif(WITH_TESTING)
 
+if(WITH_DISTRIBUTE)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_DISTRIBUTE")
+endif(WITH_DISTRIBUTE)
+
+if(WITH_GRPC)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC")
+endif(WITH_GRPC)
+
+if(NOT WITH_GOLANG)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITHOUT_GOLANG")
+endif(NOT WITH_GOLANG)
+
+if(WITH_MKLDNN)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN")
+endif(WITH_MKLDNN)
+
+set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE")
+
+if(NOT WITH_RDMA)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_DISABLE_RDMA")
+endif(NOT WITH_RDMA)
+
+
+
 if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
     list(APPEND HIP_HCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
 elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index fb6ee2f4a5..25d241d976 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -5,8 +5,8 @@ if(WITH_PYTHON)
   if(WITH_AMD_GPU)
     hip_library(paddle_pybind SHARED
       SRCS ${PYBIND_SRCS}
-      DEPS ${PYBIND_DEPS}
-      ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
+      DEPS ARCHIVE_START ${PYBIND_DEPS}
+      ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ARCHIVE_END)
   else()
     cc_library(paddle_pybind SHARED
       SRCS ${PYBIND_SRCS}
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index 598f435461..babb862122 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -28,7 +28,7 @@ int main(int argc, char** argv) {
   for (int i = 0; i < argc; ++i) {
     new_argv.push_back(argv[i]);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   new_argv.push_back(
       strdup("--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy"));
 #else

From 445fff24dcbdce6c4b98b5631bc6c34831276fca Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Fri, 23 Nov 2018 14:40:04 +0800
Subject: [PATCH 066/113] add the bigobj option to NVCC compile fix code style

---
 cmake/cuda.cmake                                | 4 ++--
 paddle/fluid/operators/beam_search_op_test.cc   | 4 ++--
 paddle/fluid/platform/stream_callback_manager.h | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 964d5fd45b..4c7e0fd3f6 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -200,9 +200,9 @@ elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
 endif()
 else(NOT WIN32)
 if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
-    list(APPEND CUDA_NVCC_FLAGS  "-g -G")
+    list(APPEND CUDA_NVCC_FLAGS  "-g -G --compiler-options;/bigobj")
 elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
-  list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
+  list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG --compiler-options;/bigobj")
 else()
   message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.")
 endif()
diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc
index 6e283866ff..40b46781da 100644
--- a/paddle/fluid/operators/beam_search_op_test.cc
+++ b/paddle/fluid/operators/beam_search_op_test.cc
@@ -45,8 +45,8 @@ void CreateInput(LoDTensor* ids, LoDTensor* scores) {
   auto* ids_data = ids->mutable_data<int64_t>(place);
   auto* scores_data = scores->mutable_data<float>(place);
   vector<int64_t> _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1});
-  vector<float> _scores({0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f,
-                        0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f});
+  vector<float> _scores(
+      {0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f});
 
   for (int i = 0; i < 12; i++) {
     ids_data[i] = _ids[i];
diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h
index 8dcfc4e748..ed8734c98c 100644
--- a/paddle/fluid/platform/stream_callback_manager.h
+++ b/paddle/fluid/platform/stream_callback_manager.h
@@ -14,11 +14,11 @@
 
 #pragma once
 
+#include <ThreadPool.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <functional>
 #include <memory>
-#include <ThreadPool.h>
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {

From 5cd2fc9fd020444257ec6522504a3b244134439c Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Fri, 23 Nov 2018 07:05:48 +0000
Subject: [PATCH 067/113] just for test

---
 paddle/testing/paddle_gtest_main.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index 598f435461..6f10a51d18 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -31,6 +31,11 @@ int main(int argc, char** argv) {
 #ifdef PADDLE_WITH_CUDA
   new_argv.push_back(
       strdup("--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy"));
+#elif __clang__
+  new_argv.push_back(
+      strdup("--tryfromenv=use_mkldnn,initial_cpu_memory_in_"
+             "mb,allocator_strategy"));
+  new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb"));
 #else
   new_argv.push_back(
       strdup("--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_"

From e21edb26f6e7fb364597c31a26f128c3c2710516 Mon Sep 17 00:00:00 2001
From: luotao1 <luotao02@baidu.com>
Date: Thu, 22 Nov 2018 17:53:13 +0800
Subject: [PATCH 068/113] add Set/GetCPUNumThreads api

---
 paddle/fluid/inference/api/analysis_config.cc            | 1 +
 paddle/fluid/inference/api/analysis_predictor.cc         | 3 +--
 paddle/fluid/inference/api/api_impl.cc                   | 3 +--
 paddle/fluid/inference/api/paddle_api.h                  | 9 +++++++++
 .../inference/tests/api/analyzer_resnet50_tester.cc      | 1 +
 paddle/fluid/inference/tests/api/config_printer.h        | 2 ++
 paddle/fluid/inference/tests/api/tester_helper.h         | 1 +
 paddle/fluid/operators/math/fc_compute.h                 | 4 +---
 paddle/fluid/platform/cpu_helper.cc                      | 2 +-
 9 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 5ccd2dc5ab..100ee0c9d3 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -46,6 +46,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
   prog_file = other.prog_file;
   param_file = other.param_file;
   specify_input_name = other.specify_input_name;
+  cpu_num_threads_ = other.cpu_num_threads_;
   // fields from this.
   enable_ir_optim = other.enable_ir_optim;
   use_feed_fetch_ops = other.use_feed_fetch_ops;
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index cb14d2a260..9162ccefd8 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -35,7 +35,6 @@
 #include "paddle/fluid/platform/profiler.h"
 
 DECLARE_bool(profile);
-DECLARE_int32(paddle_num_threads);
 
 namespace paddle {
 
@@ -67,7 +66,7 @@ bool AnalysisPredictor::Init(
 #endif
 
   // no matter with or without MKLDNN
-  paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
+  paddle::platform::SetNumThreads(config_.GetCPUNumThreads());
 
   if (!PrepareScope(parent_scope)) {
     return false;
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index fcbc3803d0..c3d17edea4 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -28,7 +28,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/profiler.h"
 
 DEFINE_bool(profile, false, "Turn on profiler for fluid");
-DECLARE_int32(paddle_num_threads);
 
 namespace paddle {
 namespace {
@@ -76,7 +75,7 @@ bool NativePaddlePredictor::Init(
 #endif
 
   // no matter with or without MKLDNN
-  paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
+  paddle::platform::SetNumThreads(config_.GetCPUNumThreads());
 
   if (config_.use_gpu) {
     place_ = paddle::platform::CUDAPlace(config_.device);
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index 0a2a2a1a23..b7f7781d06 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -186,6 +186,15 @@ struct NativeConfig : public PaddlePredictor::Config {
   // Specify the variable's name of each input if input tensors don't follow the
   // `feeds` and `fetches` of the phase `save_inference_model`.
   bool specify_input_name{false};
+
+  // Set and get the number of cpu threads.
+  void SetCPUNumThreads(int cpu_num_threads) {
+    cpu_num_threads_ = cpu_num_threads;
+  }
+  int GetCPUNumThreads() const { return cpu_num_threads_; }
+
+ protected:
+  int cpu_num_threads_{1};  // number of cpu threads for each instance.
 };
 
 // A factory to help create different predictors.
diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
index 2b936175ed..308a794ca3 100644
--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
@@ -27,6 +27,7 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->device = 0;
   cfg->enable_ir_optim = true;
   cfg->specify_input_name = true;
+  cfg->SetCPUNumThreads(FLAGS_paddle_num_threads);
 }
 
 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h
index aa0c4b1d04..a803f5b3f4 100644
--- a/paddle/fluid/inference/tests/api/config_printer.h
+++ b/paddle/fluid/inference/tests/api/config_printer.h
@@ -53,6 +53,8 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
   os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
   os << GenSpaces(num_spaces)
      << "specify_input_name: " << config.specify_input_name << "\n";
+  os << GenSpaces(num_spaces)
+     << "cpu_num_threads: " << config.GetCPUNumThreads() << "\n";
   num_spaces--;
   os << GenSpaces(num_spaces) << "}\n";
   return os;
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 7b686045a5..fdadd59049 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -42,6 +42,7 @@ DEFINE_bool(use_analysis, true,
             "Running the inference program in analysis mode.");
 
 DECLARE_bool(profile);
+DECLARE_int32(paddle_num_threads);
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h
index b072b4c20a..5b9953a5aa 100644
--- a/paddle/fluid/operators/math/fc_compute.h
+++ b/paddle/fluid/operators/math/fc_compute.h
@@ -17,8 +17,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/jit_kernel.h"
 
-DECLARE_int32(paddle_num_threads);
-
 namespace paddle {
 namespace operators {
 namespace math {
@@ -43,7 +41,7 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
                            .template Get<jitkernel::VAddKernel<T>>(N);
 
 #ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
+#pragma omp parallel for
 #endif
     for (int i = 0; i < M; i++) {
       T* dst = Y + i * N;
diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc
index f2d691b293..b737a6c38d 100644
--- a/paddle/fluid/platform/cpu_helper.cc
+++ b/paddle/fluid/platform/cpu_helper.cc
@@ -41,7 +41,7 @@ void SetNumThreads(int num_threads) {
 #elif defined(PADDLE_WITH_MKLML)
   int real_num_threads = num_threads > 1 ? num_threads : 1;
   platform::dynload::MKL_Set_Num_Threads(real_num_threads);
-  omp_set_num_threads(num_threads);
+  omp_set_num_threads(real_num_threads);
 #else
   PADDLE_ENFORCE(false, "To be implemented.");
 #endif

From a5c4b463c962bed48fba89d459adf82f4899d6c3 Mon Sep 17 00:00:00 2001
From: luotao1 <luotao02@baidu.com>
Date: Thu, 22 Nov 2018 18:37:33 +0800
Subject: [PATCH 069/113] add SetMKLDNNThreadId api

---
 paddle/fluid/inference/api/analysis_predictor.cc    | 8 ++++++++
 paddle/fluid/inference/api/analysis_predictor.h     | 2 ++
 paddle/fluid/inference/api/paddle_analysis_config.h | 2 +-
 paddle/fluid/inference/tests/api/tester_helper.h    | 9 ++++++---
 4 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 9162ccefd8..4633a75e5e 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -159,6 +159,14 @@ bool AnalysisPredictor::PrepareExecutor() {
   return true;
 }
 
+void AnalysisPredictor::SetMKLDNNThreadId(int tid) {
+#ifdef PADDLE_WITH_MKLDNN
+  platform::set_cur_thread_id(tid);
+#else
+  LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
+#endif
+}
+
 bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
                             std::vector<PaddleTensor> *output_data,
                             int batch_size) {
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index cf81b7db73..9191970a3a 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -69,6 +69,8 @@ class AnalysisPredictor : public PaddlePredictor {
   framework::Scope *scope() { return scope_.get(); }
   framework::ProgramDesc &program() { return *inference_program_; }
 
+  void SetMKLDNNThreadId(int tid);
+
  protected:
   bool PrepareProgram(const std::shared_ptr<framework::ProgramDesc> &program);
   bool PrepareScope(const std::shared_ptr<framework::Scope> &parent_scope);
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 2ac736df7c..a09bd1cac2 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -51,9 +51,9 @@ struct AnalysisConfig : public NativeConfig {
                             int max_batch_size = 1);
   bool use_tensorrt() const { return use_tensorrt_; }
 
+  void EnableMKLDNN();
   // NOTE this is just for internal development, please not use it.
   // NOT stable yet.
-  void EnableMKLDNN();
   bool use_mkldnn() const { return use_mkldnn_; }
 
   friend class ::paddle::AnalysisPredictor;
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index fdadd59049..72703bc80b 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -216,13 +216,16 @@ void TestMultiThreadPrediction(
   size_t total_time{0};
   for (int tid = 0; tid < num_threads; ++tid) {
     threads.emplace_back([&, tid]() {
-#ifdef PADDLE_WITH_MKLDNN
-      platform::set_cur_thread_id(static_cast<int>(tid) + 1);
-#endif
       // Each thread should have local inputs and outputs.
       // The inputs of each thread are all the same.
       std::vector<PaddleTensor> outputs_tid;
       auto &predictor = predictors[tid];
+#ifdef PADDLE_WITH_MKLDNN
+      if (use_analysis) {
+        static_cast<AnalysisPredictor *>(predictor.get())
+            ->SetMKLDNNThreadId(static_cast<int>(tid) + 1);
+      }
+#endif
 
       // warmup run
       LOG(INFO) << "Running thread " << tid << ", warm up run...";

From e66b4c6bff74231898cbbb013627b0eb86eced0f Mon Sep 17 00:00:00 2001
From: luotao1 <luotao02@baidu.com>
Date: Thu, 22 Nov 2018 18:49:59 +0800
Subject: [PATCH 070/113] adjust tester_helper to make multi-instance
 multi-thread work

test=develop
---
 paddle/fluid/inference/tests/api/tester_helper.h | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 72703bc80b..d21567ac19 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -207,11 +207,7 @@ void TestMultiThreadPrediction(
   int batch_size = FLAGS_batch_size;
   int num_times = FLAGS_repeat;
   std::vector<std::thread> threads;
-  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
-  predictors.emplace_back(CreateTestPredictor(config, use_analysis));
-  for (int tid = 1; tid < num_threads; ++tid) {
-    predictors.emplace_back(predictors.front()->Clone());
-  }
+  auto main_predictor = CreateTestPredictor(config, use_analysis);
 
   size_t total_time{0};
   for (int tid = 0; tid < num_threads; ++tid) {
@@ -219,7 +215,9 @@ void TestMultiThreadPrediction(
       // Each thread should have local inputs and outputs.
       // The inputs of each thread are all the same.
       std::vector<PaddleTensor> outputs_tid;
-      auto &predictor = predictors[tid];
+      // To ensure the thread binding correctly,
+      // please clone inside the threadpool.
+      auto predictor = main_predictor->Clone();
 #ifdef PADDLE_WITH_MKLDNN
       if (use_analysis) {
         static_cast<AnalysisPredictor *>(predictor.get())

From 116979a40adf7fe7788f8cd50b9f03c57bcbba7b Mon Sep 17 00:00:00 2001
From: luotao1 <luotao02@baidu.com>
Date: Fri, 23 Nov 2018 16:17:56 +0800
Subject: [PATCH 071/113] refine api name

test=develop
---
 paddle/fluid/inference/api/analysis_config.cc      |  3 ++-
 paddle/fluid/inference/api/analysis_predictor.cc   |  4 ++--
 paddle/fluid/inference/api/analysis_predictor.h    |  2 +-
 paddle/fluid/inference/api/api_impl.cc             |  2 +-
 paddle/fluid/inference/api/paddle_api.h            | 14 +++++++++-----
 .../tests/api/analyzer_resnet50_tester.cc          |  2 +-
 paddle/fluid/inference/tests/api/config_printer.h  |  2 +-
 paddle/fluid/inference/tests/api/tester_helper.h   |  2 +-
 8 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 100ee0c9d3..dd75f0d9a6 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -46,7 +46,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
   prog_file = other.prog_file;
   param_file = other.param_file;
   specify_input_name = other.specify_input_name;
-  cpu_num_threads_ = other.cpu_num_threads_;
+  cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
   // fields from this.
   enable_ir_optim = other.enable_ir_optim;
   use_feed_fetch_ops = other.use_feed_fetch_ops;
@@ -73,6 +73,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
   prog_file = other.prog_file;
   param_file = other.param_file;
   specify_input_name = other.specify_input_name;
+  cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
   // fields from this.
   enable_ir_optim = other.enable_ir_optim;
   use_feed_fetch_ops = other.use_feed_fetch_ops;
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 4633a75e5e..c132ce326c 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -66,7 +66,7 @@ bool AnalysisPredictor::Init(
 #endif
 
   // no matter with or without MKLDNN
-  paddle::platform::SetNumThreads(config_.GetCPUNumThreads());
+  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
 
   if (!PrepareScope(parent_scope)) {
     return false;
@@ -159,7 +159,7 @@ bool AnalysisPredictor::PrepareExecutor() {
   return true;
 }
 
-void AnalysisPredictor::SetMKLDNNThreadId(int tid) {
+void AnalysisPredictor::SetMkldnnThreadID(int tid) {
 #ifdef PADDLE_WITH_MKLDNN
   platform::set_cur_thread_id(tid);
 #else
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 9191970a3a..db57812bc3 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -69,7 +69,7 @@ class AnalysisPredictor : public PaddlePredictor {
   framework::Scope *scope() { return scope_.get(); }
   framework::ProgramDesc &program() { return *inference_program_; }
 
-  void SetMKLDNNThreadId(int tid);
+  void SetMkldnnThreadID(int tid);
 
  protected:
   bool PrepareProgram(const std::shared_ptr<framework::ProgramDesc> &program);
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index c3d17edea4..66a8e51396 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -75,7 +75,7 @@ bool NativePaddlePredictor::Init(
 #endif
 
   // no matter with or without MKLDNN
-  paddle::platform::SetNumThreads(config_.GetCPUNumThreads());
+  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
 
   if (config_.use_gpu) {
     place_ = paddle::platform::CUDAPlace(config_.device);
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index b7f7781d06..1513a4b3b4 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -187,14 +187,18 @@ struct NativeConfig : public PaddlePredictor::Config {
   // `feeds` and `fetches` of the phase `save_inference_model`.
   bool specify_input_name{false};
 
-  // Set and get the number of cpu threads.
-  void SetCPUNumThreads(int cpu_num_threads) {
-    cpu_num_threads_ = cpu_num_threads;
+  // Set and get the number of cpu math library threads.
+  void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) {
+    cpu_math_library_num_threads_ = cpu_math_library_num_threads;
+  }
+  int cpu_math_library_num_threads() const {
+    return cpu_math_library_num_threads_;
   }
-  int GetCPUNumThreads() const { return cpu_num_threads_; }
 
  protected:
-  int cpu_num_threads_{1};  // number of cpu threads for each instance.
+  // number of cpu math library (such as MKL, OpenBlas) threads for each
+  // instance.
+  int cpu_math_library_num_threads_{1};
 };
 
 // A factory to help create different predictors.
diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
index 308a794ca3..abc63577b7 100644
--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
@@ -27,7 +27,7 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->device = 0;
   cfg->enable_ir_optim = true;
   cfg->specify_input_name = true;
-  cfg->SetCPUNumThreads(FLAGS_paddle_num_threads);
+  cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
 }
 
 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h
index a803f5b3f4..4231eef722 100644
--- a/paddle/fluid/inference/tests/api/config_printer.h
+++ b/paddle/fluid/inference/tests/api/config_printer.h
@@ -54,7 +54,7 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
   os << GenSpaces(num_spaces)
      << "specify_input_name: " << config.specify_input_name << "\n";
   os << GenSpaces(num_spaces)
-     << "cpu_num_threads: " << config.GetCPUNumThreads() << "\n";
+     << "cpu_num_threads: " << config.cpu_math_library_num_threads() << "\n";
   num_spaces--;
   os << GenSpaces(num_spaces) << "}\n";
   return os;
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index d21567ac19..1dc1678406 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -221,7 +221,7 @@ void TestMultiThreadPrediction(
 #ifdef PADDLE_WITH_MKLDNN
       if (use_analysis) {
         static_cast<AnalysisPredictor *>(predictor.get())
-            ->SetMKLDNNThreadId(static_cast<int>(tid) + 1);
+            ->SetMkldnnThreadID(static_cast<int>(tid) + 1);
       }
 #endif
 

From 47c4e65d608083cb4b75222bcd14c1db8bc40333 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Fri, 23 Nov 2018 08:34:31 +0000
Subject: [PATCH 072/113] test=develop

---
 paddle/fluid/memory/allocation/retry_allocator_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc
index a0ce2875cb..f0b215dac2 100644
--- a/paddle/fluid/memory/allocation/retry_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc
@@ -41,7 +41,7 @@ TEST(RetryAllocator, RetryAllocator) {
 
   size_t thread_num = 32;
   size_t sleep_time = 40;
-  size_t extra_time = 2;
+  size_t extra_time = 10;
 
   // Reserve to perform more tests in the future
   std::vector<std::shared_ptr<Allocator>> allocators;

From c35bf3d34b43dd6cc5b96e963f8990d60a68d749 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Fri, 23 Nov 2018 16:36:54 +0800
Subject: [PATCH 073/113] Fix multiclass_nms_op unit test fail in python3.6

test=develop
---
 .../fluid/tests/unittests/test_multiclass_nms_op.py       | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
index df0562dcc7..e35be54b63 100644
--- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
@@ -145,10 +145,16 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold,
         lod.append(nmsed_num)
         if nmsed_num == 0: continue
 
+        tmp_det_out = []
         for c, indices in nmsed_outs.items():
             for idx in indices:
                 xmin, ymin, xmax, ymax = boxes[n][idx][:]
                 det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax])
+                tmp_det_out.append(
+                    [c, scores[n][c][idx], xmin, ymin, xmax, ymax])
+        sorted_det_out = sorted(
+            tmp_det_out, key=lambda tup: tup[0], reverse=False)
+        det_outs.extend(sorted_det_out)
 
     return det_outs, lod
 
@@ -210,7 +216,7 @@ class TestMulticlassNMSOp(OpTest):
 class TestMulticlassNMSOpNoOutput(TestMulticlassNMSOp):
     def set_argument(self):
         # Here set 2.0 to test the case there is no outputs.
-        # In practical use, 0.0 < score_threshold < 1.0 
+        # In practical use, 0.0 < score_threshold < 1.0
         self.score_threshold = 2.0
 
 

From 5ca56cad1f3fdb50f7d019ae5e658b538f98aecc Mon Sep 17 00:00:00 2001
From: sneaxiy <sneaxiy@126.com>
Date: Fri, 23 Nov 2018 08:54:09 +0000
Subject: [PATCH 074/113] test=develop

---
 python/paddle/fluid/layers/control_flow.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 9730fbf510..05138bf945 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -896,9 +896,10 @@ def array_to_lod_tensor(x, table):
 
 def increment(x, value=1.0, in_place=True):
     """
-    This function performs an operation that increments each value in the
+    This function performs an operation that increments the value in the
     input :math:`x` by an amount: :math:`value` as mentioned in the input
-    parameter. This operation is performed in-place by default.
+    parameter. This operation is performed in-place by default. Notice that
+    the number of elements in :math:`x` must be equal to 1.
 
     Args:
         x (Variable|list): The tensor that has the input values.
@@ -911,7 +912,8 @@ def increment(x, value=1.0, in_place=True):
     Examples:
         .. code-block:: python
 
-          data = fluid.layers.data(name='data', shape=[32, 32], dtype='float32')
+          data = fluid.layers.data(name='data', shape=[1], dtype='float32',
+                                   append_batch_size=False)
           data = fluid.layers.increment(x=data, value=3.0, in_place=True)
     """
     helper = LayerHelper("increment", **locals())

From f7847ca6a304a649982c04bff4f3eec846a06c5d Mon Sep 17 00:00:00 2001
From: chengduozh <zhaochengduo@baidu.com>
Date: Fri, 23 Nov 2018 17:09:14 +0800
Subject: [PATCH 075/113] fix cublas warp error test=develop

---
 paddle/fluid/platform/dynload/cublas.cc |  3 +++
 paddle/fluid/platform/dynload/cublas.h  | 30 ++++++++++++++++---------
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/platform/dynload/cublas.cc b/paddle/fluid/platform/dynload/cublas.cc
index 361d3439b8..41648c32fe 100644
--- a/paddle/fluid/platform/dynload/cublas.cc
+++ b/paddle/fluid/platform/dynload/cublas.cc
@@ -32,6 +32,9 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
 CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP);
 #endif
 
+#ifdef CUBLAS_BLAS_ROUTINE_EACH_R4
+CUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP);
+#endif
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
index ff80bd525c..ced789b90d 100644
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -90,23 +90,33 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
 
 // APIs available after CUDA 8.0
 #if CUDA_VERSION >= 8000
-DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmEx);
-DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmStridedBatched);
-DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmStridedBatched);
-DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmStridedBatched);
-DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmStridedBatched);
-DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasHgemmStridedBatched);
+#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
+  __macro(cublasGemmEx);                     \
+  __macro(cublasSgemmStridedBatched);        \
+  __macro(cublasDgemmStridedBatched);        \
+  __macro(cublasCgemmStridedBatched);        \
+  __macro(cublasZgemmStridedBatched);        \
+  __macro(cublasHgemmStridedBatched);
+
+CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
 #endif
 
 // APIs available after CUDA 9.0
 #if CUDA_VERSION >= 9000
-DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSetMathMode);
-DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGetMathMode);
+#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \
+  __macro(cublasSetMathMode);                \
+  __macro(cublasGetMathMode);
+
+CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
 #endif
 
+// APIs available after CUDA 9.1
 #if CUDA_VERSION >= 9010
-DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmBatchedEx);
-DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmStridedBatchedEx);
+#define CUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
+  __macro(cublasGemmBatchedEx);              \
+  __macro(cublasGemmStridedBatchedEx);
+
+CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
 #endif
 
 #undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP

From 5431d5c471d32a5ea3be049a339e57262bd3b483 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Fri, 23 Nov 2018 18:15:06 +0800
Subject: [PATCH 076/113] Polish code

test=develop
---
 python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
index e35be54b63..9778bd694d 100644
--- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
@@ -149,7 +149,6 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold,
         for c, indices in nmsed_outs.items():
             for idx in indices:
                 xmin, ymin, xmax, ymax = boxes[n][idx][:]
-                det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax])
                 tmp_det_out.append(
                     [c, scores[n][c][idx], xmin, ymin, xmax, ymax])
         sorted_det_out = sorted(

From 3d100b0c927b6326e75b3e493d545ee2b0ff4f4b Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Fri, 23 Nov 2018 19:14:46 +0800
Subject: [PATCH 077/113] Add Python3.6 Python3.7 compile process

test=develop
---
 Dockerfile | 75 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 40 insertions(+), 35 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index b36102175c..eb7bb2549e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -33,44 +33,49 @@ RUN apt-get update && \
     automake locales clang-format swig cmake  \
     liblapack-dev liblapacke-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
+    build-essential checkinstall \
+    libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \
     net-tools libtool ccache && \
     apt-get clean -y
 
-# Install Go and glide
-RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
-    tar -xz -C /usr/local && \
-    mkdir /root/gopath && \
-    mkdir /root/gopath/bin && \
-    mkdir /root/gopath/src
-ENV GOROOT=/usr/local/go GOPATH=/root/gopath
-# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
-ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
-# install glide
-RUN curl -s -q https://glide.sh/get | sh
-
-# Install TensorRT
-# following TensorRT.tar.gz is not the default official one, we do two miny changes:
-# 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now,
-#    and its size is only one-third of the official one.
-# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
-#    See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
-RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
-    tar -xz -C /usr/local && \
-    cp -rf /usr/local/TensorRT/include /usr && \
-    cp -rf /usr/local/TensorRT/lib /usr
-
-# git credential to skip password typing
-RUN git config --global credential.helper store
-
-# Fix locales to en_US.UTF-8
-RUN localedef -i en_US -f UTF-8 en_US.UTF-8
-
-# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
-# version util jupyter fixes this issue.
-
-# specify sphinx version as 1.5.6 and remove -U option for [pip install -U
-# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
-# version(1.7.1 for now), which causes building documentation failed.
+COPY tools/manylinux1/build_scripts/* /root/python/
+RUN cd /root/python/ && source build_utils && MY_DIR=/root/python/ build_cpythons 3.6.0 3.7.0
+
+# # Install Go and glide
+# RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
+    # tar -xz -C /usr/local && \
+    # mkdir /root/gopath && \
+    # mkdir /root/gopath/bin && \
+    # mkdir /root/gopath/src
+# ENV GOROOT=/usr/local/go GOPATH=/root/gopath
+# # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
+# ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
+# # install glide
+# RUN curl -s -q https://glide.sh/get | sh
+
+# # Install TensorRT
+# # following TensorRT.tar.gz is not the default official one, we do two miny changes:
+# # 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now,
+# #    and its size is only one-third of the official one.
+# # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
+# #    See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
+# RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
+    # tar -xz -C /usr/local && \
+    # cp -rf /usr/local/TensorRT/include /usr && \
+    # cp -rf /usr/local/TensorRT/lib /usr
+
+# # git credential to skip password typing
+# RUN git config --global credential.helper store
+
+# # Fix locales to en_US.UTF-8
+# RUN localedef -i en_US -f UTF-8 en_US.UTF-8
+
+# # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
+# # version util jupyter fixes this issue.
+
+# # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
+# # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
+# # version(1.7.1 for now), which causes building documentation failed.
 # RUN pip3 install -U wheel && \
     # pip3 install -U docopt PyYAML sphinx==1.5.6 && \
     # pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \

From 64ca3d176cc1348f0735e3e6f4fd2c18e902f43b Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Fri, 23 Nov 2018 19:18:20 +0800
Subject: [PATCH 078/113] Add bias_attr in sequence_conv_pool API. (#14553)

---
 paddle/fluid/API.spec       | 2 +-
 python/paddle/fluid/nets.py | 9 ++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 50114bf3df..8397ae093b 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -342,7 +342,7 @@ paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], va
 paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ 
 paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True))
-paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max'))
+paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None))
 paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,))
 paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
 paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True))
diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py
index 00d33b36fc..fb75ef62d0 100644
--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
@@ -250,7 +250,8 @@ def sequence_conv_pool(input,
                        filter_size,
                        param_attr=None,
                        act="sigmoid",
-                       pool_type="max"):
+                       pool_type="max",
+                       bias_attr=None):
     """
     The sequence_conv_pool is composed with Sequence Convolution and Pooling.
 
@@ -266,6 +267,11 @@ def sequence_conv_pool(input,
         pool_type (str): Pooling type can be :math:`max` for max-pooling, :math:`average` for
             average-pooling, :math:`sum` for sum-pooling, :math:`sqrt` for sqrt-pooling.
             Default :math:`max`.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, sequence_conv
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
 
     Return:
         Variable: The final result after Sequence Convolution and Pooling.
@@ -289,6 +295,7 @@ def sequence_conv_pool(input,
         num_filters=num_filters,
         filter_size=filter_size,
         param_attr=param_attr,
+        bias_attr=bias_attr,
         act=act)
 
     pool_out = layers.sequence_pool(input=conv_out, pool_type=pool_type)

From e8a8f2626cc65b8dc3a91507eb233581e8e7e0e2 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Sat, 24 Nov 2018 16:58:57 +0800
Subject: [PATCH 079/113] Add Python3.6 and Python3.7 support in Ubuntu
 Dockerfile

test=develop
---
 Dockerfile | 200 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 118 insertions(+), 82 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index eb7bb2549e..6f45c79f3a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -22,6 +22,27 @@ ENV HOME /root
 # Add bash enhancements
 COPY ./paddle/scripts/docker/root/ /root/
 
+# Prepare packages for Python
+RUN apt-get update && \
+    apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
+    libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \
+    xz-utils tk-dev libffi-dev liblzma-dev
+
+# Install Python3.6
+RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \
+    tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \
+    ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \
+    wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \
+    tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \
+    CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.6 --enable-shared > /dev/null && \
+    make -j8 > /dev/null && make altinstall > /dev/null
+
+# Install Python3.7
+RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \
+    tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \
+    CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.7 --enable-shared > /dev/null && \
+    make -j8 > /dev/null && make altinstall > /dev/null
+
 RUN apt-get update && \
     apt-get install -y --allow-downgrades patchelf \
     python3 python3-dev python3-pip \
@@ -34,88 +55,103 @@ RUN apt-get update && \
     liblapack-dev liblapacke-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
     build-essential checkinstall \
-    libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \
+    libreadline-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \
     net-tools libtool ccache && \
     apt-get clean -y
 
-COPY tools/manylinux1/build_scripts/* /root/python/
-RUN cd /root/python/ && source build_utils && MY_DIR=/root/python/ build_cpythons 3.6.0 3.7.0
-
-# # Install Go and glide
-# RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
-    # tar -xz -C /usr/local && \
-    # mkdir /root/gopath && \
-    # mkdir /root/gopath/bin && \
-    # mkdir /root/gopath/src
-# ENV GOROOT=/usr/local/go GOPATH=/root/gopath
-# # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
-# ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
-# # install glide
-# RUN curl -s -q https://glide.sh/get | sh
-
-# # Install TensorRT
-# # following TensorRT.tar.gz is not the default official one, we do two miny changes:
-# # 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now,
-# #    and its size is only one-third of the official one.
-# # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
-# #    See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
-# RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
-    # tar -xz -C /usr/local && \
-    # cp -rf /usr/local/TensorRT/include /usr && \
-    # cp -rf /usr/local/TensorRT/lib /usr
-
-# # git credential to skip password typing
-# RUN git config --global credential.helper store
-
-# # Fix locales to en_US.UTF-8
-# RUN localedef -i en_US -f UTF-8 en_US.UTF-8
-
-# # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
-# # version util jupyter fixes this issue.
-
-# # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
-# # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
-# # version(1.7.1 for now), which causes building documentation failed.
-# RUN pip3 install -U wheel && \
-    # pip3 install -U docopt PyYAML sphinx==1.5.6 && \
-    # pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
-    # easy_install -U pip && \
-    # pip install -U pip setuptools wheel && \
-    # pip install -U docopt PyYAML sphinx==1.5.6 && \
-    # pip install sphinx-rtd-theme==0.1.9 recommonmark
-
-# RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    # pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    # pip3 install opencv-python && \
-    # pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    # pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    # pip install opencv-python
-
-# #For docstring checker
-# RUN pip3 install pylint pytest astroid isort
-# RUN pip install pylint pytest astroid isort LinkChecker
-
-# COPY ./python/requirements.txt /root/
-# RUN pip3 install -r /root/requirements.txt
-# RUN pip install -r /root/requirements.txt
-
-# # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
-# # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
-# RUN apt-get install -y libssl-dev libffi-dev
-# RUN pip3 install certifi urllib3[secure]
-# RUN pip install certifi urllib3[secure]
-
-
-# # Install woboq_codebrowser to /woboq
-# RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
-    # (cd /woboq \
-     # cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-           # -DCMAKE_BUILD_TYPE=Release . \
-     # make)
-
-# # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
-# RUN mkdir /var/run/sshd
-# RUN echo 'root:root' | chpasswd
-# RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
-# RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
-# EXPOSE 22
+# Install Go and glide
+RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
+    tar -xz -C /usr/local && \
+    mkdir /root/gopath && \
+    mkdir /root/gopath/bin && \
+    mkdir /root/gopath/src
+ENV GOROOT=/usr/local/go GOPATH=/root/gopath
+# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
+# install glide
+RUN curl -s -q https://glide.sh/get | sh
+
+# Install TensorRT
+# following TensorRT.tar.gz is not the default official one, we do two miny changes:
+# 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now,
+#    and its size is only one-third of the official one.
+# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
+#    See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
+RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
+    tar -xz -C /usr/local && \
+    cp -rf /usr/local/TensorRT/include /usr && \
+    cp -rf /usr/local/TensorRT/lib /usr
+
+# git credential to skip password typing
+RUN git config --global credential.helper store
+
+# Fix locales to en_US.UTF-8
+RUN localedef -i en_US -f UTF-8 en_US.UTF-8
+
+# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
+# version util jupyter fixes this issue.
+
+# specify sphinx version as 1.5.6 and remove -U option for [pip install -U
+# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
+# version(1.7.1 for now), which causes building documentation failed.
+RUN pip3.5 install -U wheel && \
+    pip3.5 install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3.5 install sphinx-rtd-theme==0.1.9 recommonmark && \
+    pip3.6 install -U wheel && \
+    pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \
+    pip3.7 install -U wheel && \
+    pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \
+    easy_install -U pip && \
+    pip install -U pip setuptools wheel && \
+    pip install -U docopt PyYAML sphinx==1.5.6 && \
+    pip install sphinx-rtd-theme==0.1.9 recommonmark
+
+RUN pip3.5 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip3.5 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3.5 install opencv-python && \
+    pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3.6 install opencv-python && \
+    pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3.7 install opencv-python && \
+    pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip install opencv-python
+
+#For docstring checker
+RUN pip3.5 install pylint pytest astroid isort
+RUN pip3.6 install pylint pytest astroid isort
+RUN pip3.7 install pylint pytest astroid isort
+RUN pip install pylint pytest astroid isort LinkChecker
+
+COPY ./python/requirements.txt /root/
+RUN pip3.5 install -r /root/requirements.txt
+RUN pip3.6 install -r /root/requirements.txt
+RUN pip3.7 install -r /root/requirements.txt
+RUN pip install -r /root/requirements.txt
+
+# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
+# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
+RUN apt-get install -y libssl-dev libffi-dev
+RUN pip3.5 install certifi urllib3[secure]
+RUN pip3.6 install certifi urllib3[secure]
+RUN pip3.7 install certifi urllib3[secure]
+RUN pip install certifi urllib3[secure]
+
+
+# Install woboq_codebrowser to /woboq
+RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
+    (cd /woboq \
+     cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
+           -DCMAKE_BUILD_TYPE=Release . \
+     make)
+
+# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
+RUN mkdir /var/run/sshd
+RUN echo 'root:root' | chpasswd
+RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
+RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
+EXPOSE 22

From 155a0f78e6f8688a68dae765dee4d25e08bc614b Mon Sep 17 00:00:00 2001
From: Min <minqiyang@baidu.com>
Date: Sat, 24 Nov 2018 17:17:14 +0800
Subject: [PATCH 080/113] Polish code

test=develop
---
 Dockerfile | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 6f45c79f3a..9459552890 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -34,13 +34,13 @@ RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-a
     ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \
     wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \
     tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \
-    CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.6 --enable-shared > /dev/null && \
+    CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
     make -j8 > /dev/null && make altinstall > /dev/null
 
 # Install Python3.7
 RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \
     tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \
-    CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.7 --enable-shared > /dev/null && \
+    CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
     make -j8 > /dev/null && make altinstall > /dev/null
 
 RUN apt-get update && \
@@ -54,8 +54,6 @@ RUN apt-get update && \
     automake locales clang-format swig cmake  \
     liblapack-dev liblapacke-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
-    build-essential checkinstall \
-    libreadline-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \
     net-tools libtool ccache && \
     apt-get clean -y
 
@@ -94,9 +92,9 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
 # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
 # version(1.7.1 for now), which causes building documentation failed.
-RUN pip3.5 install -U wheel && \
-    pip3.5 install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3.5 install sphinx-rtd-theme==0.1.9 recommonmark && \
+RUN pip3 install -U wheel && \
+    pip3 install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
     pip3.6 install -U wheel && \
     pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \
     pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \
@@ -108,9 +106,9 @@ RUN pip3.5 install -U wheel && \
     pip install -U docopt PyYAML sphinx==1.5.6 && \
     pip install sphinx-rtd-theme==0.1.9 recommonmark
 
-RUN pip3.5 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    pip3.5 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.5 install opencv-python && \
+RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3 install opencv-python && \
     pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
     pip3.6 install opencv-python && \
@@ -122,13 +120,13 @@ RUN pip3.5 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip install opencv-python
 
 #For docstring checker
-RUN pip3.5 install pylint pytest astroid isort
+RUN pip3 install pylint pytest astroid isort
 RUN pip3.6 install pylint pytest astroid isort
 RUN pip3.7 install pylint pytest astroid isort
 RUN pip install pylint pytest astroid isort LinkChecker
 
 COPY ./python/requirements.txt /root/
-RUN pip3.5 install -r /root/requirements.txt
+RUN pip3 install -r /root/requirements.txt
 RUN pip3.6 install -r /root/requirements.txt
 RUN pip3.7 install -r /root/requirements.txt
 RUN pip install -r /root/requirements.txt
@@ -136,7 +134,7 @@ RUN pip install -r /root/requirements.txt
 # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
 RUN apt-get install -y libssl-dev libffi-dev
-RUN pip3.5 install certifi urllib3[secure]
+RUN pip3 install certifi urllib3[secure]
 RUN pip3.6 install certifi urllib3[secure]
 RUN pip3.7 install certifi urllib3[secure]
 RUN pip install certifi urllib3[secure]

From 05e6a7141717f1eb1e73836c7aec67faea4d4db5 Mon Sep 17 00:00:00 2001
From: Min <minqiyang@baidu.com>
Date: Sat, 24 Nov 2018 17:19:22 +0800
Subject: [PATCH 081/113] Polish code

test=develop
---
 .dockerignore | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.dockerignore b/.dockerignore
index 49adfe4f0a..2b2e74053d 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,6 +1,5 @@
 *.DS_Store
 build/
-build*
 *.user
 .vscode
 .idea

From 8038cd10a93c66405bc7221f3d6cf1605c25df0d Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Sat, 24 Nov 2018 19:19:44 +0800
Subject: [PATCH 082/113] Upgrade pybind11 to v2.2.4 to support Python3.7

test=develop
---
 cmake/external/pybind11.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake
index c885877a2b..3a10ea945d 100644
--- a/cmake/external/pybind11.cmake
+++ b/cmake/external/pybind11.cmake
@@ -26,7 +26,7 @@ ExternalProject_Add(
         extern_pybind
         ${EXTERNAL_PROJECT_LOG_ARGS}
         GIT_REPOSITORY  "https://github.com/pybind/pybind11.git"
-        GIT_TAG         "v2.1.1"
+        GIT_TAG         "v2.2.4"
         PREFIX          ${PYBIND_SOURCE_DIR}
         UPDATE_COMMAND  ""
         CONFIGURE_COMMAND ""

From bc4923321ce286f357792c5a28884a665fcb87b7 Mon Sep 17 00:00:00 2001
From: Jiabin Yang <marsyang199376@gmail.com>
Date: Sat, 24 Nov 2018 19:32:46 +0800
Subject: [PATCH 083/113] Update __init__.py

---
 python/paddle/fluid/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index ac2a7ea47e..ff651db043 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -120,7 +120,6 @@ def __bootstrap__():
         'reader_queue_speed_test_mode', 'print_sub_graph_dir'
     ]
     if 'Darwin' not in sysstr:
-        print("aaaaa")
         read_env_flags.append('use_pinned_memory')
 
     if os.name != 'nt':

From 81994e84e055cba8b4d3fe0b1ecb94b12d731661 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Sat, 24 Nov 2018 19:37:37 +0800
Subject: [PATCH 084/113] Change the include files because the version changes
 of pybind11

test=develop
---
 paddle/fluid/pybind/tensor_py.h | 1 -
 paddle/scripts/paddle_build.sh  | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index b39323f843..02a75236f6 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -21,7 +21,6 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/float16.h"
-#include "pybind11/common.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 9632eaec00..86925b26e7 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -149,7 +149,7 @@ function cmake_gen() {
             elif [ "$1" == "cp37-cp37m" ]; then
                 export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH}
                 export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH}
-                export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3
+                export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3.7
             -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m
             -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so"
            fi

From b67229187e67b04f6f6517cf8c0ceb7fcd8629f4 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Sat, 24 Nov 2018 19:47:50 +0800
Subject: [PATCH 085/113] Change to PYBIND11_MODULE because the deprecation of
 PYBIND11_PLUGIN

test=develop
---
 paddle/fluid/pybind/pybind.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 795800fd51..bf86b83d4e 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -86,7 +86,7 @@ bool IsCompiledWithDIST() {
 #endif
 }
 
-PYBIND11_PLUGIN(core) {
+PYBIND11_MODULE(core) {
   // Not used, just make sure cpu_info.cc is linked.
   paddle::platform::CpuTotalPhysicalMemory();
 

From d2045260a5cd907238e594483daf0d2fbfa51314 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Sat, 24 Nov 2018 20:07:53 +0800
Subject: [PATCH 086/113] Change visibilities of variant_visitor of pybind11

test=develop
---
 paddle/fluid/pybind/protobuf.cc | 13 +++++++------
 paddle/fluid/pybind/pybind.cc   |  5 ++---
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index 586e92c2b3..0443ff3fc3 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -30,11 +30,12 @@ namespace pybind11 {
 namespace detail {
 
 // Can be replaced by a generic lambda in C++14
-struct variant_caster_visitor : public boost::static_visitor<handle> {
+struct __attribute__((visibility("hidden"))) paddle_variant_caster_visitor
+    : public boost::static_visitor<handle> {
   return_value_policy policy;
   handle parent;
 
-  variant_caster_visitor(return_value_policy policy, handle parent)
+  paddle_variant_caster_visitor(return_value_policy policy, handle parent)
       : policy(policy), parent(parent) {}
 
   template <class T>
@@ -44,10 +45,10 @@ struct variant_caster_visitor : public boost::static_visitor<handle> {
 };
 
 template <class Variant>
-struct variant_caster;
+struct paddle_variant_caster;
 
 template <template <class...> class V, class... Ts>
-struct variant_caster<V<Ts...>> {
+struct paddle_variant_caster<V<Ts...>> {
   using Type = V<Ts...>;
 
   template <typename T>
@@ -90,7 +91,7 @@ struct variant_caster<V<Ts...>> {
 
   static handle cast(Type const &src, return_value_policy policy,
                      handle parent) {
-    variant_caster_visitor visitor(policy, parent);
+    paddle_variant_caster_visitor visitor(policy, parent);
     return boost::apply_visitor(visitor, src);
   }
 
@@ -101,7 +102,7 @@ struct variant_caster<V<Ts...>> {
 // Add specialization for concrete variant type
 template <class... Args>
 struct type_caster<boost::variant<Args...>>
-    : variant_caster<boost::variant<Args...>> {};
+    : paddle_variant_caster<boost::variant<Args...>> {};
 
 }  // namespace detail
 }  // namespace pybind11
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index bf86b83d4e..a2a629acdf 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -86,12 +86,12 @@ bool IsCompiledWithDIST() {
 #endif
 }
 
-PYBIND11_MODULE(core) {
+PYBIND11_MODULE(core, m) {
   // Not used, just make sure cpu_info.cc is linked.
   paddle::platform::CpuTotalPhysicalMemory();
 
   paddle::memory::allocation::UseAllocatorStrategyGFlag();
-  py::module m("core", "C++ core of PaddlePaddle");
+  m.doc() = "C++ core of PaddlePaddle";
 
   // using framework in this function. Since it is inside a function, it will
   // not cause namespace pollution.
@@ -907,7 +907,6 @@ All parameter, weight, gradient are variables in Paddle.
       });
 
   BindRecordIOWriter(&m);
-  return m.ptr();
 }
 }  // namespace pybind
 }  // namespace paddle

From abbf1eb7e2efcdee65d044a6e84ed15192ef5bcb Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Sat, 24 Nov 2018 23:08:06 +0800
Subject: [PATCH 087/113] enable profiler features on windows

---
 paddle/fluid/framework/lod_tensor.cc            | 17 +----------------
 paddle/fluid/framework/lod_tensor_test.cc       |  2 --
 paddle/fluid/framework/operator.cc              |  5 +----
 .../fluid/inference/api/analysis_predictor.cc   |  4 ----
 paddle/fluid/inference/api/api_impl.cc          |  4 ----
 .../fluid/inference/tests/api/tester_helper.h   |  4 ----
 6 files changed, 2 insertions(+), 34 deletions(-)

diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index 669d08c70c..9b2eeaf59a 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -26,10 +26,8 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"
 
-#if !defined(_WIN32)
 #include "paddle/fluid/recordio/scanner.h"
 #include "paddle/fluid/recordio/writer.h"
-#endif  // _WIN32
 
 namespace paddle {
 namespace framework {
@@ -305,7 +303,6 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
   TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
 }
 
-#if !defined(_WIN32)
 void WriteToRecordIO(recordio::Writer *writer,
                      const std::vector<LoDTensor> &tensor,
                      const platform::DeviceContext &dev_ctx) {
@@ -335,19 +332,7 @@ bool ReadFromRecordIO(recordio::Scanner *scanner,
 
   return true;
 }
-#else
-class Writer {};
-class Scanner {};
-void WriteToRecordIO(recordio::Writer *writer,
-                     const std::vector<LoDTensor> &tensor,
-                     const platform::DeviceContext &dev_ctx) {}
-bool ReadFromRecordIO(recordio::Scanner *scanner,
-                      const platform::DeviceContext &dev_ctx,
-                      std::vector<LoDTensor> *result_ptr) {
-  PADDLE_ENFORCE("windows didn't supported recordio!.");
-  return true;
-}
-#endif  // _WIN32
+
 std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
     const std::vector<platform::Place> places) const {
   check_memory_size();
diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
index cbf5fd04d7..cd50aaa260 100644
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -274,7 +274,6 @@ TEST(LoD, ConvertToOffsetBasedLoD) {
   EXPECT_EQ(offset_lod, expected);
 }
 
-#if !defined(_WIN32)
 template <typename T>
 static void TestRecordIO() {
   LoDTensor tensor;
@@ -321,7 +320,6 @@ TEST(LoDTensor, RecordIO) {
   TestRecordIO<float>();
   TestRecordIO<double>();
 }
-#endif  // !defined(_WIN32)
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 1ec170b6f6..1f82a57769 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -156,14 +156,11 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 // The profile has a process-wide mutex, results in serious performance issue
 // in concurrency scenerio. Here use an `if` to fix this issue.
 // Please not remove the `if`, ask @Superjomn if there are any concern.
-#ifndef _WIN32
   if (platform::IsProfileEnabled()) {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     platform::RecordEvent record_event(Type(), pool.Get(place));
     RunImpl(scope, place);
-  } else
-#endif
-  {
+  } else  {
     RunImpl(scope, place);
   }
   VLOG(30) << place << " " << DebugStringEx(&scope);
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index cb14d2a260..8b9f872530 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -56,7 +56,6 @@ bool AnalysisPredictor::Init(
     const std::shared_ptr<framework::Scope> &parent_scope,
     const std::shared_ptr<framework::ProgramDesc> &program) {
   VLOG(30) << "Predictor::init()";
-#if !defined(_WIN32)
   if (FLAGS_profile) {
     LOG(WARNING) << "Profiler is actived, might affect the performance";
     LOG(INFO) << "You can turn off by set gflags '-profile false'";
@@ -64,7 +63,6 @@ bool AnalysisPredictor::Init(
                                            : platform::ProfilerState::kCPU;
     platform::EnableProfiler(tracking_device);
   }
-#endif
 
   // no matter with or without MKLDNN
   paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
@@ -501,12 +499,10 @@ bool AnalysisPredictor::LoadParameters() {
 }
 
 AnalysisPredictor::~AnalysisPredictor() {
-#if !defined(_WIN32)
   if (FLAGS_profile) {
     platform::DisableProfiler(platform::EventSortingKey::kTotal,
                               "./profile.log");
   }
-#endif
   if (sub_scope_) {
     scope_->DeleteScope(sub_scope_);
   }
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index fcbc3803d0..d80d8097ff 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -64,7 +64,6 @@ void NativePaddlePredictor::PrepareFeedFetch() {
 bool NativePaddlePredictor::Init(
     std::shared_ptr<framework::Scope> parent_scope) {
   VLOG(3) << "Predictor::init()";
-#if !defined(_WIN32)
   if (FLAGS_profile) {
     LOG(WARNING) << "Profiler is actived, might affect the performance";
     LOG(INFO) << "You can turn off by set gflags '-profile false'";
@@ -73,7 +72,6 @@ bool NativePaddlePredictor::Init(
                                            : platform::ProfilerState::kCPU;
     platform::EnableProfiler(tracking_device);
   }
-#endif
 
   // no matter with or without MKLDNN
   paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
@@ -121,12 +119,10 @@ bool NativePaddlePredictor::Init(
 }
 
 NativePaddlePredictor::~NativePaddlePredictor() {
-#if !defined(_WIN32)
   if (FLAGS_profile) {
     platform::DisableProfiler(platform::EventSortingKey::kTotal,
                               "./profile.log");
   }
-#endif
   if (sub_scope_) {
     scope_->DeleteScope(sub_scope_);
   }
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 7b686045a5..23507cf9ab 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -177,11 +177,9 @@ void TestOneThreadPrediction(
     warmup_timer.tic();
     predictor->Run(inputs[0], outputs, batch_size);
     PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1);
-#if !defined(_WIN32)
     if (FLAGS_profile) {
       paddle::platform::ResetProfiler();
     }
-#endif
   }
 
   LOG(INFO) << "Run " << num_times << " times...";
@@ -230,11 +228,9 @@ void TestMultiThreadPrediction(
         warmup_timer.tic();
         predictor->Run(inputs[0], outputs, batch_size);
         PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1);
-#if !defined(_WIN32)
         if (FLAGS_profile) {
           paddle::platform::ResetProfiler();
         }
-#endif
       }
 
       LOG(INFO) << "Thread " << tid << " run " << num_times << " times...";

From 23604231dad1e048797e309978b84a15b7b9fd2d Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Sat, 24 Nov 2018 23:21:53 +0800
Subject: [PATCH 088/113] code style fix test=develop

---
 paddle/fluid/framework/operator.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 1f82a57769..013b2e536d 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -160,7 +160,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     platform::RecordEvent record_event(Type(), pool.Get(place));
     RunImpl(scope, place);
-  } else  {
+  } else {
     RunImpl(scope, place);
   }
   VLOG(30) << place << " " << DebugStringEx(&scope);

From ba562bc1255c6302e3d2f44527e3fdcf735a6a81 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Sat, 24 Nov 2018 23:53:17 +0800
Subject: [PATCH 089/113] code style fix test=develop

---
 paddle/fluid/framework/operator.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 013b2e536d..60fe838601 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -153,9 +153,9 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 #endif
   }
 
-// The profile has a process-wide mutex, results in serious performance issue
-// in concurrency scenerio. Here use an `if` to fix this issue.
-// Please not remove the `if`, ask @Superjomn if there are any concern.
+  // The profile has a process-wide mutex, results in serious performance issue
+  // in concurrency scenerio. Here use an `if` to fix this issue.
+  // Please not remove the `if`, ask @Superjomn if there are any concern.
   if (platform::IsProfileEnabled()) {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     platform::RecordEvent record_event(Type(), pool.Get(place));

From ee73810fd5686ac497e2a0fb659639c6db6d0a05 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Sun, 25 Nov 2018 14:07:44 +0800
Subject: [PATCH 090/113] Fix API.spec

test=develop
---
 paddle/fluid/API.spec | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 8397ae093b..0a71f15343 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -26,10 +26,10 @@ paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], vara
 paddle.fluid.DistributeTranspilerConfig.__init__ 
 paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None))
 paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
-paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
-paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None
-paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ReduceStrategy, arg0: int) -> None
-paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.BuildStrategy) -> None
+paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None
+paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None
+paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None
+paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None
 paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))

From c1bf9664cdf1187c3e01750ef68cfbbd8d788b7d Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Sun, 25 Nov 2018 14:11:23 +0800
Subject: [PATCH 091/113] Add options to disable SO_REUSEPORT of grpc. (#14269)

---
 .../operators/distributed/grpc_client.cc      |  5 +++++
 .../operators/distributed/grpc_server.cc      | 20 +++++++++++++++++++
 .../operators/distributed/sendrecvop_utils.cc |  2 ++
 python/paddle/fluid/__init__.py               |  1 +
 4 files changed, 28 insertions(+)

diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index c28f86146d..0bd76b3f6c 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -22,6 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/platform/profiler.h"
 
+DECLARE_bool(rpc_disable_reuse_port);
+
 namespace paddle {
 namespace operators {
 namespace distributed {
@@ -383,6 +385,9 @@ std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
   // Channel configurations:
   grpc::ChannelArguments args;
   args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000);
+  if (FLAGS_rpc_disable_reuse_port) {
+    args.SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0);
+  }
   args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE);
   args.SetMaxSendMessageSize(std::numeric_limits<int>::max());
   args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc
index ffd2b1707b..77bf67be25 100644
--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
@@ -20,6 +20,8 @@ limitations under the License. */
 
 using ::grpc::ServerAsyncResponseWriter;
 
+DECLARE_bool(rpc_disable_reuse_port);
+
 namespace paddle {
 namespace operators {
 namespace distributed {
@@ -252,6 +254,20 @@ void AsyncGRPCServer::WaitServerReady() {
   VLOG(40) << "AsyncGRPCServer WaitSeverReady";
 }
 
+// Define an option subclass in order to disable SO_REUSEPORT for the
+// server socket.
+// Come from:
+// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+class NoReusePortOption : public ::grpc::ServerBuilderOption {
+ public:
+  void UpdateArguments(::grpc::ChannelArguments* args) override {
+    args->SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0);
+  }
+
+  void UpdatePlugins(std::vector<std::unique_ptr<::grpc::ServerBuilderPlugin>>*
+                         plugins) override {}
+};
+
 void AsyncGRPCServer::StartServer() {
   ::grpc::ServerBuilder builder;
   builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(),
@@ -259,6 +275,10 @@ void AsyncGRPCServer::StartServer() {
 
   builder.SetMaxSendMessageSize(std::numeric_limits<int>::max());
   builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
+  if (FLAGS_rpc_disable_reuse_port) {
+    builder.SetOption(
+        std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption));
+  }
   builder.RegisterService(&service_);
 
   for (auto t : rpc_call_map_) {
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
index 374fa680e3..df5af3476f 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
@@ -22,6 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/operators/distributed/variable_response.h"
 
+DEFINE_bool(rpc_disable_reuse_port, false, "Disable SO_REUSEPORT or not.");
+
 namespace paddle {
 namespace operators {
 namespace distributed {
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 3c092dee34..d851b9dfaa 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -129,6 +129,7 @@ def __bootstrap__():
         read_env_flags.append('rpc_send_thread_num')
         read_env_flags.append('rpc_get_thread_num')
         read_env_flags.append('rpc_prefetch_thread_num')
+        read_env_flags.append('rpc_disable_reuse_port')
 
     if core.is_compiled_with_cuda():
         read_env_flags += [

From a7188d5bc7a8990ce228902e0361ae7880bb1927 Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Sun, 25 Nov 2018 17:01:49 +0800
Subject: [PATCH 092/113] fix executor transfer cache bug (#14518)

---
 paddle/fluid/framework/CMakeLists.txt         |  8 ++-
 paddle/fluid/framework/executor.cc            |  1 +
 paddle/fluid/framework/naive_executor.cc      |  1 +
 paddle/fluid/framework/operator.cc            | 49 +++++--------
 paddle/fluid/framework/operator.h             |  4 ++
 .../fluid/framework/transfer_scope_cache.cc   | 72 +++++++++++++++++++
 paddle/fluid/framework/transfer_scope_cache.h | 41 +++++++++++
 7 files changed, 143 insertions(+), 33 deletions(-)
 create mode 100644 paddle/fluid/framework/transfer_scope_cache.cc
 create mode 100644 paddle/fluid/framework/transfer_scope_cache.h

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 281d073166..83c8478685 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -116,8 +116,14 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
 
+if (NOT WIN32)
+cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto)
+cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
+    shape_inference data_transform lod_tensor profiler transfer_scope_cache)
+else()
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
-    shape_inference data_transform lod_tensor profiler)
+    shape_inference data_transform lod_tensor)
+endif(NOT WIN32)
 
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
 
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 7ce08b728d..f6c82995e1 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ngraph_operator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/operators/detail/macros.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index e8e53f988f..e829563952 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -83,6 +83,7 @@ void NaiveExecutor::Run() {
   for (auto &op : ops_) {
     VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
             << " on scope " << scope_;
+    op->SetIsCalledByExecutor(false);
     op->Run(*scope_, place_);
   }
 }
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 1ec170b6f6..0084573cd0 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
+#include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -33,11 +34,6 @@ DEFINE_bool(check_nan_inf, false,
 namespace paddle {
 namespace framework {
 
-// Combine two hash values to a single hash.
-inline size_t CombineHash(size_t seed, size_t a) {
-  return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
-}
-
 std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
     std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN),
     std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain),
@@ -797,17 +793,6 @@ void OperatorWithKernel::TransferInplaceVarsBack(
 Scope* OperatorWithKernel::TryTransferData(
     const Scope& scope, const OpKernelType& expected_kernel_key,
     std::vector<std::string>* transfered_inplace_vars) const {
-// In the inference scenerio, the scopes will be reused across the batches, so
-// the `new_scope` here will result in GPU memroy explosion over the running of
-// operators.
-// We use a thread_local cache to fix that issue, the key in the cache is the
-// combination of the `scope` argument, from_kernel_type, target_kernel_type.
-// Have a discussion with @Superjomn or the inference developers if some changes
-// on this logic for this macro might not tested on the other scenerios.
-#ifdef PADDLE_ON_INFERENCE
-  thread_local std::unordered_map<size_t, Scope*> infer_transfer_scope_cache;
-#endif
-
   Scope* new_scope = nullptr;
   for (auto& var_name_item : Inputs()) {
     for (auto& var_name : var_name_item.second) {
@@ -838,23 +823,23 @@ Scope* OperatorWithKernel::TryTransferData(
       VLOG(30) << "Transform Variable " << var_name << " from "
                << kernel_type_for_var << " to " << expected_kernel_key;
 
-#ifdef PADDLE_ON_INFERENCE
-      size_t infer_cache_key =
-          CombineHash(OpKernelType::Hash()(kernel_type_for_var),
-                      OpKernelType::Hash()(expected_kernel_key));
-      infer_cache_key =
-          CombineHash(infer_cache_key, std::hash<const Scope*>()(&scope));
-
-      auto it = infer_transfer_scope_cache.find(infer_cache_key);
-      if (it != infer_transfer_scope_cache.end()) {
-        new_scope = infer_transfer_scope_cache[infer_cache_key];
-      } else {
-        new_scope = &scope.NewScope();
-        infer_transfer_scope_cache[infer_cache_key] = new_scope;
+      // In the inference scenerio, the scopes will be reused across the
+      // batches, so the `new_scope` here will result in GPU memroy explosion
+      // over the  running of operators.
+      // We use a thread_local cache to fix that issue, the key in the cache is
+      // the combination of the `scope` argument, from_kernel_type,
+      // target_kernel_type.
+      // Have a discussion with @Superjomn or the inference developers if some
+      // changes on this logic for this macro might not tested on the other
+      // scenerios.
+      // If this op is not called by an Executor or ParallelExecutor, it should
+      // called by a NaiveExecutor, the NaiveExecutor will cache the scopes and
+      // variables, that behavior a lot different.
+      if (!run_by_executor_) {
+        new_scope = TryCreateTransferScope(kernel_type_for_var,
+                                           expected_kernel_key, &scope);
       }
-#endif
-
-      if (new_scope == nullptr) {
+      if (!new_scope) {
         new_scope = &scope.NewScope();
       }
 
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index ef83833217..bfdfdc56b3 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -127,6 +127,8 @@ class OperatorBase {
   //! Get all outputs variable names
   virtual std::vector<std::string> OutputVars(bool has_intermediate) const;
 
+  void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
+
  protected:
   std::string type_;
   // NOTE: in case of OpGrad, inputs_ contains:
@@ -139,6 +141,8 @@ class OperatorBase {
   // IG (Inputs Gradients)
   VariableNameMap outputs_;
   AttributeMap attrs_;
+  // Whether this operator executes in an Executor.
+  bool run_by_executor_{true};
 
  private:
   void GenerateTemporaryNames();
diff --git a/paddle/fluid/framework/transfer_scope_cache.cc b/paddle/fluid/framework/transfer_scope_cache.cc
new file mode 100644
index 0000000000..e52a8317e2
--- /dev/null
+++ b/paddle/fluid/framework/transfer_scope_cache.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/transfer_scope_cache.h"
+
+namespace paddle {
+namespace framework {
+
+std::unordered_map<size_t, Scope*>& global_transfer_data_cache() {
+  thread_local auto* x = new std::unordered_map<size_t, Scope*>;
+  return *x;
+}
+
+std::unordered_set<Scope*>& global_transfer_scope_cache() {
+  thread_local auto* x = new std::unordered_set<Scope*>;
+  return *x;
+}
+
+Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
+                              const Scope* scope) {
+  Scope* new_scope{nullptr};
+  size_t infer_cache_key =
+      CombineHash(OpKernelType::Hash()(type0), OpKernelType::Hash()(type1));
+  infer_cache_key =
+      CombineHash(infer_cache_key, std::hash<const Scope*>()(scope));
+
+  auto it = global_transfer_data_cache().find(infer_cache_key);
+  if (it != global_transfer_data_cache().end()) {
+    new_scope = global_transfer_data_cache()[infer_cache_key];
+  } else {
+    new_scope = &scope->NewScope();
+    global_transfer_data_cache()[infer_cache_key] = new_scope;
+  }
+  global_transfer_scope_cache().insert(new_scope);
+  return new_scope;
+}
+
+void RemoveKidsFromTransferScopeCache(Scope* scope) {
+  auto it = global_transfer_scope_cache().find(scope);
+  if (it != global_transfer_scope_cache().end()) {
+    global_transfer_scope_cache().erase(it);
+  }
+  for (auto* s : scope->kids()) {
+    auto it = global_transfer_scope_cache().find(s);
+    if (it != global_transfer_scope_cache().end()) {
+      global_transfer_scope_cache().erase(it);
+    }
+  }
+
+  // remove global transfer data cache
+  auto& cache = global_transfer_data_cache();
+  for (auto it = cache.begin(); it != cache.end();) {
+    if (it->second == scope)
+      it = cache.erase(it);
+    else
+      it++;
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/transfer_scope_cache.h b/paddle/fluid/framework/transfer_scope_cache.h
new file mode 100644
index 0000000000..86fc0bf529
--- /dev/null
+++ b/paddle/fluid/framework/transfer_scope_cache.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <thread>  // NOLINT
+#include <unordered_map>
+#include <unordered_set>
+#include "paddle/fluid/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+
+std::unordered_map<size_t, Scope*>& global_transfer_data_cache();
+
+std::unordered_set<Scope*>& global_transfer_scope_cache();
+
+// Combine two hash values to a single hash.
+static size_t CombineHash(size_t seed, size_t a) {
+  return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+}
+
+Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
+                              const Scope* scope);
+
+void RemoveKidsFromTransferScopeCache(Scope* scope);
+
+}  // namespace framework
+}  // namespace paddle

From c52f65e0711cf5bb061df9743a631e9e63694c68 Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Sun, 25 Nov 2018 17:33:31 +0800
Subject: [PATCH 093/113] Fix the random fail in test_image_classification.py
 (#14551)

test=develop
---
 python/paddle/fluid/tests/book/test_image_classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py
index cba486cf59..c91bd27895 100644
--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -239,7 +239,7 @@ def infer(use_cuda, save_dirname=None):
         assert len(results[0]) == len(transpiler_results[0])
         for i in range(len(results[0])):
             np.testing.assert_almost_equal(
-                results[0][i], transpiler_results[0][i], decimal=5)
+                results[0][i], transpiler_results[0][i], decimal=4)
 
         print("infer results: ", results[0])
 

From c92c440fa16ebc0d442f56a1860acf43167726f6 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Sun, 25 Nov 2018 22:24:14 +0800
Subject: [PATCH 094/113] Add python3.6 and python3.7 support to production
 generated Dockerfile

test=develop
---
 paddle/scripts/paddle_build.sh | 49 ++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 9632eaec00..e69492da90 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -671,6 +671,55 @@ EOF
     ${DOCKERFILE_CUBLAS_DSO}
     ${DOCKERFILE_GPU_ENV}
     ENV NCCL_LAUNCH_MODE PARALLEL
+EOF
+    elif [ "$1" == "cp36-cp36m" ]; then
+        cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+    ADD python/dist/*.whl /
+    # run paddle version to install python packages first
+    RUN apt-get update && ${NCCL_DEPS}
+    RUN apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
+        libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \
+        xz-utils tk-dev libffi-dev liblzma-dev
+    RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \
+        tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \
+        ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \
+        wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \
+        tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \
+        CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
+        make -j8 > /dev/null && make altinstall > /dev/null
+    RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && \
+        pip3.6 install opencv-python && pip3.6 install /*.whl; apt-get install -f -y && \
+        apt-get clean -y && \
+        rm -f /*.whl && \
+        ${PADDLE_VERSION} && \
+        ldconfig
+    ${DOCKERFILE_CUDNN_DSO}
+    ${DOCKERFILE_CUBLAS_DSO}
+    ${DOCKERFILE_GPU_ENV}
+    ENV NCCL_LAUNCH_MODE PARALLEL
+EOF
+    elif [ "$1" == "cp37-cp37m" ]; then
+        cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+    ADD python/dist/*.whl /
+    # run paddle version to install python packages first
+    RUN apt-get update && ${NCCL_DEPS}
+    RUN apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
+        libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \
+        xz-utils tk-dev libffi-dev liblzma-dev
+    RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \
+        tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \
+        CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
+        make -j8 > /dev/null && make altinstall > /dev/null
+    RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && \
+        pip3.7 install opencv-python && pip3.7 install /*.whl; apt-get install -f -y && \
+        apt-get clean -y && \
+        rm -f /*.whl && \
+        ${PADDLE_VERSION} && \
+        ldconfig
+    ${DOCKERFILE_CUDNN_DSO}
+    ${DOCKERFILE_CUBLAS_DSO}
+    ${DOCKERFILE_GPU_ENV}
+    ENV NCCL_LAUNCH_MODE PARALLEL
 EOF
     else
         cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF

From 923c8e33325690d304cf6f0e13f29a4ea1611544 Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Sun, 25 Nov 2018 22:40:24 +0800
Subject: [PATCH 095/113] add benchmark for inference (#14571)

---
 paddle/fluid/inference/CMakeLists.txt         |  1 +
 paddle/fluid/inference/utils/CMakeLists.txt   |  2 +
 paddle/fluid/inference/utils/benchmark.cc     | 49 +++++++++++++++++
 paddle/fluid/inference/utils/benchmark.h      | 52 +++++++++++++++++++
 .../fluid/inference/utils/benchmark_tester.cc | 39 ++++++++++++++
 5 files changed, 143 insertions(+)
 create mode 100644 paddle/fluid/inference/utils/CMakeLists.txt
 create mode 100644 paddle/fluid/inference/utils/benchmark.cc
 create mode 100644 paddle/fluid/inference/utils/benchmark.h
 create mode 100644 paddle/fluid/inference/utils/benchmark_tester.cc

diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 2c5364b724..058a5b5f46 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -4,6 +4,7 @@ endif()
 # analysis and tensorrt must be added before creating static library,
 # otherwise, there would be undefined reference to them in static library.
 add_subdirectory(analysis)
+add_subdirectory(utils)
 if (TENSORRT_FOUND)
   add_subdirectory(tensorrt)
 endif()
diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt
new file mode 100644
index 0000000000..2104e4ac72
--- /dev/null
+++ b/paddle/fluid/inference/utils/CMakeLists.txt
@@ -0,0 +1,2 @@
+cc_library(benchmark SRCS benchmark.cc DEPS enforce)
+cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc
new file mode 100644
index 0000000000..021edc2de5
--- /dev/null
+++ b/paddle/fluid/inference/utils/benchmark.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/utils/benchmark.h"
+#include <sstream>
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+
+std::string Benchmark::SerializeToString() const {
+  std::stringstream ss;
+  ss << "-----------------------------------------------------\n";
+  ss << "name\t";
+  ss << "batch_size\t";
+  ss << "num_threads\t";
+  ss << "latency\t";
+  ss << "qps";
+  ss << '\n';
+
+  ss << name_ << "\t";
+  ss << batch_size_ << "\t";
+  ss << num_threads_ << "\t";
+  ss << latency_ << "\t";
+  ss << 1000 / latency_;
+  ss << '\n';
+  return ss.str();
+}
+void Benchmark::PersistToFile(const std::string &path) const {
+  std::ofstream file(path, std::ios::app);
+  PADDLE_ENFORCE(file.is_open(), "Can not open %s to add benchmark", path);
+  file << SerializeToString();
+  file.flush();
+  file.close();
+}
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/utils/benchmark.h b/paddle/fluid/inference/utils/benchmark.h
new file mode 100644
index 0000000000..80e8f77adb
--- /dev/null
+++ b/paddle/fluid/inference/utils/benchmark.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fstream>
+#include <iostream>
+
+namespace paddle {
+namespace inference {
+
+/*
+ * Helper class to calculate the performance.
+ */
+struct Benchmark {
+  int batch_size() const { return batch_size_; }
+  void SetBatchSize(int x) { batch_size_ = x; }
+
+  int num_threads() const { return num_threads_; }
+  void SetNumThreads(int x) { num_threads_ = x; }
+
+  bool use_gpu() const { return use_gpu_; }
+  void SetUseGpu() { use_gpu_ = true; }
+
+  int latency() const { return latency_; }
+  void SetLatency(int x) { latency_ = x; }
+
+  const std::string& name() const { return name_; }
+  void SetName(const std::string& name) { name_ = name; }
+
+  std::string SerializeToString() const;
+  void PersistToFile(const std::string& path) const;
+
+ private:
+  bool use_gpu_{false};
+  int batch_size_{0};
+  int latency_;
+  int num_threads_{1};
+  std::string name_;
+};
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc b/paddle/fluid/inference/utils/benchmark_tester.cc
new file mode 100644
index 0000000000..eb25547408
--- /dev/null
+++ b/paddle/fluid/inference/utils/benchmark_tester.cc
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/utils/benchmark.h"
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+using namespace paddle::inference;
+TEST(Benchmark, basic) {
+  Benchmark benchmark;
+  benchmark.SetName("key0");
+  benchmark.SetBatchSize(10);
+  benchmark.SetUseGpu();
+  benchmark.SetLatency(220);
+  LOG(INFO) << "benchmark:\n" << benchmark.SerializeToString();
+}
+
+TEST(Benchmark, PersistToFile) {
+  Benchmark benchmark;
+  benchmark.SetName("key0");
+  benchmark.SetBatchSize(10);
+  benchmark.SetUseGpu();
+  benchmark.SetLatency(220);
+
+  benchmark.PersistToFile("1.log");
+  benchmark.PersistToFile("1.log");
+  benchmark.PersistToFile("1.log");
+}
\ No newline at end of file

From 6274c0f7fc4821fdd5df30a9dded7b3919fc5cbe Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Sun, 25 Nov 2018 22:53:05 +0800
Subject: [PATCH 096/113] Remove build content of Python3 in Dockerfile

test=develop
---
 Dockerfile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index 9459552890..84e1edbee9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -43,6 +43,8 @@ RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \
     CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
     make -j8 > /dev/null && make altinstall > /dev/null
 
+RUN rm -r /root/python_build
+
 RUN apt-get update && \
     apt-get install -y --allow-downgrades patchelf \
     python3 python3-dev python3-pip \

From 840c1b29adb1168f8ff84d1151462b0d171c741b Mon Sep 17 00:00:00 2001
From: dzhwinter <dongzhihong01@baidu.com>
Date: Mon, 26 Nov 2018 10:30:25 +0800
Subject: [PATCH 097/113] test=develop (#14562)

* test=develop

remove code.

* test=develop
---
 paddle/fluid/inference/api/demo_ci/CMakeLists.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index 49683eab07..8fb464c0f5 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -46,8 +46,6 @@ if(WITH_GPU)
     endif()
   endif(NOT WIN32)
 endif()
-
-include_directories("D:/Paddle/")
 include_directories("${PADDLE_LIB}")
 include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
 include_directories("${PADDLE_LIB}/third_party/install/glog/include")

From 83444de41c9ec0d446bf867c1ad7340bb36fd859 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Mon, 26 Nov 2018 03:12:41 +0000
Subject: [PATCH 098/113] polish init.py, test=develop

---
 python/paddle/fluid/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index ac2a7ea47e..ff651db043 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -120,7 +120,6 @@ def __bootstrap__():
         'reader_queue_speed_test_mode', 'print_sub_graph_dir'
     ]
     if 'Darwin' not in sysstr:
-        print("aaaaa")
         read_env_flags.append('use_pinned_memory')
 
     if os.name != 'nt':

From 4b40c0013b53759e42dfc06525cfd1239945fc82 Mon Sep 17 00:00:00 2001
From: superjomn <yanchunwei@outlook.com>
Date: Mon, 26 Nov 2018 11:19:15 +0800
Subject: [PATCH 099/113] fix compile

test=develop
---
 paddle/fluid/framework/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 83c8478685..0975497a17 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -117,7 +117,7 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
 
 if (NOT WIN32)
-cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto)
+cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
     shape_inference data_transform lod_tensor profiler transfer_scope_cache)
 else()

From bf222f197d0ac5f7416d5e70d5ddca1009b96559 Mon Sep 17 00:00:00 2001
From: Yiqun Liu <liuyiqun01@baidu.com>
Date: Mon, 26 Nov 2018 11:21:26 +0800
Subject: [PATCH 100/113] Use sub scope in tensor_array_to_tensor op. (#14524)

test=develop
---
 paddle/fluid/framework/executor.cc            |  4 +-
 .../fluid/inference/api/analysis_predictor.cc |  1 -
 paddle/fluid/inference/api/api_impl.cc        |  1 -
 .../fluid/inference/tests/api/CMakeLists.txt  | 50 +++++++++----------
 .../operators/tensor_array_to_tensor_op.cc    | 10 ++--
 5 files changed, 32 insertions(+), 34 deletions(-)

diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index f6c82995e1..3dc571d757 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -392,8 +392,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
 
   int64_t max_memory_size = GetEagerDeletionThreshold();
   std::unique_ptr<GarbageCollector<Tensor>> gc;
-  // WhileOp would set keep_kids to false
-  // WhileGradOp would need the scopes created in WhileOp
+  // WhileOp would set keep_kids to true,
+  // because WhileGradOp needs the scopes created in WhileOp.
   // Perhaps, we should not perform eager deletion in WhileOp
   // The scopes and variables created by WhileOp would be deleted
   // in WhileGradOp.
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index c132ce326c..d111c0cfd9 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -174,7 +174,6 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
   inference::Timer timer;
   timer.tic();
   // set feed variable
-  std::vector<framework::LoDTensor> feeds;
   framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
   if (!SetFeed(inputs, scope)) {
     LOG(ERROR) << "fail to set feed";
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 66a8e51396..ff375c73a7 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -138,7 +138,6 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
   Timer timer;
   timer.tic();
   // set feed variable
-  std::vector<framework::LoDTensor> feeds;
   framework::Scope *scope = sub_scope_ != nullptr ? sub_scope_ : scope_.get();
   if (!SetFeed(inputs, scope)) {
     LOG(ERROR) << "fail to set feed";
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index e8bd13037e..7dc88d9dd0 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -74,7 +74,7 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
 # ocr
 set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
 if (NOT EXISTS ${OCR_INSTALL_DIR})
-  inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
+    inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
 endif()
 inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
 
@@ -88,31 +88,31 @@ inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet
 
 # anakin
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
-   # anakin rnn1
-   set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin")
-   set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1")
-   inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin")
-   inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt")
-   cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc
-           ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
-                --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt
-           DEPS inference_anakin_api_shared SERIAL)
-   # anakin mobilenet
-   if(WITH_GPU)
-       set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet")
-       inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin")
-       cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc
-               ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin
-               DEPS inference_anakin_api_shared dynload_cuda SERIAL)
-   endif()
+    # anakin rnn1
+    set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin")
+    set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1")
+    inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin")
+    inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt")
+    cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc
+            ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
+                 --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt
+            DEPS inference_anakin_api_shared SERIAL)
+    # anakin mobilenet
+    if(WITH_GPU)
+        set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet")
+        inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin")
+        cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc
+                ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin
+                DEPS inference_anakin_api_shared dynload_cuda SERIAL)
+    endif()
 endif()
 
 if(WITH_GPU AND TENSORRT_FOUND)
-   set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
-   if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
-       inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
-   endif()
-   inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
-      EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
+    set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
+    if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
+        inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
+    endif()
+    inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
+            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
 endif()
diff --git a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc
index 96dc123f6a..58a74ec2c1 100644
--- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc
+++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc
@@ -106,9 +106,9 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase {
     out_inx_dim[0] = inx.size();
     out_inx.Resize(out_inx_dim);
 
+    auto &local_scope = scope.NewScope();
     std::string var_name = "out_index";
-    framework::Variable *tmp_index_var =
-        const_cast<framework::Scope &>(scope).Var(var_name);
+    framework::Variable *tmp_index_var = local_scope.Var(var_name);
     auto &tmp_index_tensor =
         *(tmp_index_var->GetMutable<paddle::framework::LoDTensor>());
     tmp_index_tensor.Resize(out_inx_dim);
@@ -128,12 +128,12 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase {
     out_dims[axis] = out_dim_sum;
     out.Resize(out_dims);
 
-    LodTensorArray2LodTensorVector(scope, base_name, Input("X"), &names);
-    // Invoke Reshape Op
+    LodTensorArray2LodTensorVector(local_scope, base_name, Input("X"), &names);
+    // Invoke concat Op
     auto concat_op = framework::OpRegistry::CreateOp(
         "concat", {{"X", names}}, {{"Out", {Output("Out")}}}, attrs);
 
-    concat_op->Run(scope, place);
+    concat_op->Run(local_scope, place);
   }
 };
 

From 1afa9492af7b76d15d38bc2f49d28069832c2cc9 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Mon, 26 Nov 2018 11:33:12 +0800
Subject: [PATCH 101/113] Recover the profiler

---
 cmake/cuda.cmake                                |  7 +++++--
 paddle/fluid/framework/lod_tensor.cc            | 17 +----------------
 paddle/fluid/framework/lod_tensor_test.cc       |  2 --
 paddle/fluid/framework/operator.cc              | 11 ++++-------
 .../fluid/inference/api/analysis_predictor.cc   |  4 ----
 paddle/fluid/inference/api/api_impl.cc          |  4 ----
 .../fluid/inference/tests/api/tester_helper.h   |  4 ----
 7 files changed, 10 insertions(+), 39 deletions(-)

diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 4c7e0fd3f6..414e92eb27 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -199,10 +199,13 @@ elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
     list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
 endif()
 else(NOT WIN32)
+list(APPEND CUDA_NVCC_FLAGS  "--compiler-options;/bigobj")
 if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
-    list(APPEND CUDA_NVCC_FLAGS  "-g -G --compiler-options;/bigobj")
+  list(APPEND CUDA_NVCC_FLAGS  "-g -G")
+  # match the cl's _ITERATOR_DEBUG_LEVEL
+  list(APPEND CUDA_NVCC_FLAGS  "-D_DEBUG")
 elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
-  list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG --compiler-options;/bigobj")
+  list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
 else()
   message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.")
 endif()
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index 669d08c70c..9b2eeaf59a 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -26,10 +26,8 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"
 
-#if !defined(_WIN32)
 #include "paddle/fluid/recordio/scanner.h"
 #include "paddle/fluid/recordio/writer.h"
-#endif  // _WIN32
 
 namespace paddle {
 namespace framework {
@@ -305,7 +303,6 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
   TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
 }
 
-#if !defined(_WIN32)
 void WriteToRecordIO(recordio::Writer *writer,
                      const std::vector<LoDTensor> &tensor,
                      const platform::DeviceContext &dev_ctx) {
@@ -335,19 +332,7 @@ bool ReadFromRecordIO(recordio::Scanner *scanner,
 
   return true;
 }
-#else
-class Writer {};
-class Scanner {};
-void WriteToRecordIO(recordio::Writer *writer,
-                     const std::vector<LoDTensor> &tensor,
-                     const platform::DeviceContext &dev_ctx) {}
-bool ReadFromRecordIO(recordio::Scanner *scanner,
-                      const platform::DeviceContext &dev_ctx,
-                      std::vector<LoDTensor> *result_ptr) {
-  PADDLE_ENFORCE("windows didn't supported recordio!.");
-  return true;
-}
-#endif  // _WIN32
+
 std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
     const std::vector<platform::Place> places) const {
   check_memory_size();
diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
index cbf5fd04d7..cd50aaa260 100644
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -274,7 +274,6 @@ TEST(LoD, ConvertToOffsetBasedLoD) {
   EXPECT_EQ(offset_lod, expected);
 }
 
-#if !defined(_WIN32)
 template <typename T>
 static void TestRecordIO() {
   LoDTensor tensor;
@@ -321,7 +320,6 @@ TEST(LoDTensor, RecordIO) {
   TestRecordIO<float>();
   TestRecordIO<double>();
 }
-#endif  // !defined(_WIN32)
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 1ec170b6f6..60fe838601 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -153,17 +153,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 #endif
   }
 
-// The profile has a process-wide mutex, results in serious performance issue
-// in concurrency scenerio. Here use an `if` to fix this issue.
-// Please not remove the `if`, ask @Superjomn if there are any concern.
-#ifndef _WIN32
+  // The profile has a process-wide mutex, results in serious performance issue
+  // in concurrency scenerio. Here use an `if` to fix this issue.
+  // Please not remove the `if`, ask @Superjomn if there are any concern.
   if (platform::IsProfileEnabled()) {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     platform::RecordEvent record_event(Type(), pool.Get(place));
     RunImpl(scope, place);
-  } else
-#endif
-  {
+  } else {
     RunImpl(scope, place);
   }
   VLOG(30) << place << " " << DebugStringEx(&scope);
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index cb14d2a260..8b9f872530 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -56,7 +56,6 @@ bool AnalysisPredictor::Init(
     const std::shared_ptr<framework::Scope> &parent_scope,
     const std::shared_ptr<framework::ProgramDesc> &program) {
   VLOG(30) << "Predictor::init()";
-#if !defined(_WIN32)
   if (FLAGS_profile) {
     LOG(WARNING) << "Profiler is actived, might affect the performance";
     LOG(INFO) << "You can turn off by set gflags '-profile false'";
@@ -64,7 +63,6 @@ bool AnalysisPredictor::Init(
                                            : platform::ProfilerState::kCPU;
     platform::EnableProfiler(tracking_device);
   }
-#endif
 
   // no matter with or without MKLDNN
   paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
@@ -501,12 +499,10 @@ bool AnalysisPredictor::LoadParameters() {
 }
 
 AnalysisPredictor::~AnalysisPredictor() {
-#if !defined(_WIN32)
   if (FLAGS_profile) {
     platform::DisableProfiler(platform::EventSortingKey::kTotal,
                               "./profile.log");
   }
-#endif
   if (sub_scope_) {
     scope_->DeleteScope(sub_scope_);
   }
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index fcbc3803d0..d80d8097ff 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -64,7 +64,6 @@ void NativePaddlePredictor::PrepareFeedFetch() {
 bool NativePaddlePredictor::Init(
     std::shared_ptr<framework::Scope> parent_scope) {
   VLOG(3) << "Predictor::init()";
-#if !defined(_WIN32)
   if (FLAGS_profile) {
     LOG(WARNING) << "Profiler is actived, might affect the performance";
     LOG(INFO) << "You can turn off by set gflags '-profile false'";
@@ -73,7 +72,6 @@ bool NativePaddlePredictor::Init(
                                            : platform::ProfilerState::kCPU;
     platform::EnableProfiler(tracking_device);
   }
-#endif
 
   // no matter with or without MKLDNN
   paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
@@ -121,12 +119,10 @@ bool NativePaddlePredictor::Init(
 }
 
 NativePaddlePredictor::~NativePaddlePredictor() {
-#if !defined(_WIN32)
   if (FLAGS_profile) {
     platform::DisableProfiler(platform::EventSortingKey::kTotal,
                               "./profile.log");
   }
-#endif
   if (sub_scope_) {
     scope_->DeleteScope(sub_scope_);
   }
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 7b686045a5..23507cf9ab 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -177,11 +177,9 @@ void TestOneThreadPrediction(
     warmup_timer.tic();
     predictor->Run(inputs[0], outputs, batch_size);
     PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1);
-#if !defined(_WIN32)
     if (FLAGS_profile) {
       paddle::platform::ResetProfiler();
     }
-#endif
   }
 
   LOG(INFO) << "Run " << num_times << " times...";
@@ -230,11 +228,9 @@ void TestMultiThreadPrediction(
         warmup_timer.tic();
         predictor->Run(inputs[0], outputs, batch_size);
         PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1);
-#if !defined(_WIN32)
         if (FLAGS_profile) {
           paddle::platform::ResetProfiler();
         }
-#endif
       }
 
       LOG(INFO) << "Thread " << tid << " run " << num_times << " times...";

From b2f8d4183df1e3129f6dbe822cb02ad418716cda Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Mon, 26 Nov 2018 11:55:19 +0800
Subject: [PATCH 102/113] Given the different fraction_of_gpu_memory_to_use
 depends on platform

---
 paddle/fluid/platform/gpu_info.cc | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index e0d0051ad0..9d8a11a114 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -19,10 +19,16 @@ limitations under the License. */
 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/enforce.h"
 
+#ifndef _WIN32
+const float fraction_of_gpu_memory_to_use = 0.92f;
+#else
 // fraction_of_gpu_memory_to_use cannot be too high on windows,
 // since the win32 graphic sub-system can occupy some GPU memory
 // which may lead to insufficient memory left for paddle
-DEFINE_double(fraction_of_gpu_memory_to_use, 0.5,
+const float fraction_of_gpu_memory_to_use = 0.5f;
+#endif
+
+DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
               "Allocate a trunk of gpu memory that is this fraction of the "
               "total gpu memory size. Future memory usage will be allocated "
               "from the trunk. If the trunk doesn't have enough gpu memory, "

From 39ec80def42ef48112ee0025e2a318138f1992b7 Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Mon, 26 Nov 2018 12:49:17 +0800
Subject: [PATCH 103/113] Remove the memory copy of feeding data in C++
 inference API (#14577)

* Remove the memory copy for feeding data in C++ inference API
* Fix compling dependence
* Fix compling in ONLY_CPU mode
---
 paddle/fluid/inference/api/CMakeLists.txt     |  4 +++-
 .../fluid/inference/api/analysis_predictor.cc | 23 ++++++++++++++----
 paddle/fluid/inference/api/api_impl.cc        | 24 +++++++++++++++----
 3 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index e9969b84f3..eda251c534 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -30,7 +30,9 @@ cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
 cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager)
 cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS scope lod_tensor enforce)
 cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc)
-cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder DEPS zero_copy_tensor)
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
+           lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
+           analysis_config paddle_pass_builder zero_copy_tensor reset_tensor_array)
 
 cc_test(test_paddle_inference_api
         SRCS api_tester.cc
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index d111c0cfd9..72ac534384 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -31,6 +31,7 @@
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #endif
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -214,17 +215,29 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
     framework::DDim ddim = framework::make_ddim(inputs[i].shape);
     void *input_ptr;
     if (inputs[i].dtype == PaddleDType::INT64) {
-      input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<int64_t>(ddim, place_);
     } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
-      input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<float>(ddim, place_);
     } else {
       LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
       return false;
     }
 
-    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
-    std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
-                inputs[i].data.length());
+    if (platform::is_cpu_place(place_)) {
+      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
+      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
+                  inputs[i].data.length());
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
+      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
+                   platform::CPUPlace(), inputs[i].data.data(),
+                   inputs[i].data.length(),
+                   0);  // stream 0 for sync copy
+#else
+      PADDLE_THROW("Not compile with CUDA, should not reach here.");
+#endif
+    }
     // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
     framework::LoD lod;
     for (auto &level : inputs[i].lod) {
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index ff375c73a7..0f88ad14b0 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -193,17 +194,30 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
     framework::DDim ddim = framework::make_ddim(inputs[i].shape);
     void *input_ptr;
     if (inputs[i].dtype == PaddleDType::INT64) {
-      input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<int64_t>(ddim, place_);
     } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
-      input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<float>(ddim, place_);
     } else {
       LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
       return false;
     }
 
-    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
-    std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
-                inputs[i].data.length());
+    if (platform::is_cpu_place(place_)) {
+      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
+      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
+                  inputs[i].data.length());
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
+      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
+                   platform::CPUPlace(), inputs[i].data.data(),
+                   inputs[i].data.length(),
+                   0);  // stream 0 for sync copy
+#else
+      PADDLE_THROW("Not compile with CUDA, should not reach here.");
+#endif
+    }
+
     // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
     framework::LoD lod;
     for (auto &level : inputs[i].lod) {

From e0d47cc9411006dcf6d17189349e8de7940535b2 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Mon, 26 Nov 2018 13:50:46 +0800
Subject: [PATCH 104/113] test=develop

---
 cmake/external/eigen.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 573ad5e5f0..6aef97f212 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -17,7 +17,7 @@ if(WITH_AMD_GPU)
         extern_eigen3
         ${EXTERNAL_PROJECT_LOG_ARGS}
         GIT_REPOSITORY  "https://github.com/sabreshao/hipeigen.git"
-        GIT_TAG         0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
+        GIT_TAG         7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e
         PREFIX          ${EIGEN_SOURCE_DIR}
         UPDATE_COMMAND  ""
         CONFIGURE_COMMAND ""

From 3639d99f9910e98f4cccb0e2de1f583d62ab0028 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Mon, 26 Nov 2018 14:30:21 +0800
Subject: [PATCH 105/113] Fix save and load lookup table/optimizer vars
 (#14301)

*  fix mkdir conflict

*  fix load/save lookup tables

 test=develop

* add lookup_table_utils

* fix load optimize vars on pserver

* delete lookup table utils

* fix save and load lookup tables

* fix load optimizer var

* fix load optimizer var, test=develop

* fix python 3 style, test=develop

* move lookup_table_utils to contrib utils
---
 .../fluid/operators/lookup_sparse_table_op.cc |   1 +
 paddle/fluid/operators/sum_op.h               |   3 +
 python/paddle/fluid/contrib/utils/__init__.py |   4 +-
 .../fluid/contrib/utils/lookup_table_utils.py | 256 ++++++++++++++++++
 python/paddle/fluid/framework.py              |  20 ++
 python/paddle/fluid/io.py                     |  51 +++-
 .../fluid/transpiler/distribute_transpiler.py |  29 +-
 7 files changed, 342 insertions(+), 22 deletions(-)
 create mode 100644 python/paddle/fluid/contrib/utils/lookup_table_utils.py

diff --git a/paddle/fluid/operators/lookup_sparse_table_op.cc b/paddle/fluid/operators/lookup_sparse_table_op.cc
index a6843f20a5..1b55527fd3 100644
--- a/paddle/fluid/operators/lookup_sparse_table_op.cc
+++ b/paddle/fluid/operators/lookup_sparse_table_op.cc
@@ -67,6 +67,7 @@ class LookupSparseTableOp : public framework::OperatorBase {
                       framework::proto::VarType::FP32,
                       "The sparse table only support FP32");
     w_t->Get(ids_t, out_t, true, is_test);
+    out_t->set_lod(ids_t.lod());
   }
 };
 
diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h
index 19b2c68c82..76cc796a9b 100644
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -127,6 +127,9 @@ class SumKernel : public framework::OpKernel<T> {
         math::scatter::MergeAdd<DeviceContext, T> merge_add;
         merge_add(context.template device_context<DeviceContext>(), inputs,
                   out);
+
+        out->SyncIndex();
+
       } else {
         // no data, just set a empty out tensor.
         out->mutable_value()->mutable_data<T>(framework::make_ddim({0}),
diff --git a/python/paddle/fluid/contrib/utils/__init__.py b/python/paddle/fluid/contrib/utils/__init__.py
index df6d367782..6e479bdc2b 100644
--- a/python/paddle/fluid/contrib/utils/__init__.py
+++ b/python/paddle/fluid/contrib/utils/__init__.py
@@ -13,8 +13,10 @@
 # limitations under the License.
 
 from __future__ import print_function
-
+from . import lookup_table_utils
+from .lookup_table_utils import *
 from . import hdfs_utils
 from .hdfs_utils import *
 
+__all__ = lookup_table_utils.__all__
 __all__ = hdfs_utils.__all__
diff --git a/python/paddle/fluid/contrib/utils/lookup_table_utils.py b/python/paddle/fluid/contrib/utils/lookup_table_utils.py
new file mode 100644
index 0000000000..cc2418238f
--- /dev/null
+++ b/python/paddle/fluid/contrib/utils/lookup_table_utils.py
@@ -0,0 +1,256 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import time
+import logging
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid import io
+from paddle.fluid import Program
+
+__all__ = [
+    "load_inference_model", "load_persistable_vars",
+    "convert_dist_to_sparse_program"
+]
+
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
+_logger = logging.getLogger("lookup_table_utils")
+_logger.setLevel(logging.INFO)
+
+model_filename = "__model__"
+lookup_table_dir = "__lookup_table__"
+
+
+def __insert_lookup_sparse_table_op(main_program, idx, ids, w, out):
+    main_program.global_block()._insert_op(
+        index=idx,
+        type="lookup_sparse_table",
+        inputs={"Ids": [ids],
+                "W": [w]},
+        outputs={"Out": [out]},
+        attrs={
+            "is_distributed": False,
+            "is_sparse": True,
+            "grad_inplace": False
+        })
+
+
+def __get_prefetch_op_tuples(main_program):
+    # current lookup tables op is split_ids->prefetch->merge_ids
+    prefetch_op_tuples = None
+    op_types = [op.type for op in main_program.global_block().ops]
+
+    for i in range(len(op_types)):
+        if op_types[i] == "prefetch":
+            if op_types[i - 1] == "split_ids" and op_types[i +
+                                                           1] == "merge_ids":
+                split_ids_op_id = i - 1
+                split_ids_inputs = main_program.global_block().ops[i - 1].input(
+                    "Ids")
+                prefetch_op_inputs = main_program.global_block().ops[i].input(
+                    "X")
+                prefetch_op_outputs = main_program.global_block().ops[i].output(
+                    "Out")
+                merge_ids_outputs = main_program.global_block().ops[
+                    i + 1].output("Out")
+
+                need_delete_vars = []
+                need_delete_vars.extend(prefetch_op_inputs)
+                need_delete_vars.extend(prefetch_op_outputs)
+
+                prefetch_op_tuples = (split_ids_op_id, split_ids_inputs,
+                                      merge_ids_outputs, need_delete_vars)
+                break
+    return prefetch_op_tuples
+
+
+def convert_dist_to_sparse_program(main_program):
+    if not main_program._distributed_lookup_table:
+        _logger.warn(
+            "There are no distributed lookup tables need to be converted")
+        return
+
+    # create table param and grad var in pserver program
+    origin_emb_var = "{}.origin".format(main_program._distributed_lookup_table)
+    emb_var = main_program._distributed_lookup_table
+    main_program.global_block()._rename_var(emb_var, origin_emb_var)
+    origin_param_var = main_program.global_block().vars[origin_emb_var]
+
+    param_var = main_program.global_block().create_var(
+        name=emb_var,
+        shape=origin_param_var.shape,
+        dtype=origin_param_var.dtype,
+        type=core.VarDesc.VarType.SELECTED_ROWS,
+        persistable=True)
+    # parameter must be selected rows
+    param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
+    main_program._sync_with_cpp()
+
+    prefetch_op_tuples = __get_prefetch_op_tuples(main_program)
+
+    split_ids_id = prefetch_op_tuples[0]
+
+    for idx in range(split_ids_id + 2, split_ids_id - 1, -1):
+        main_program.global_block()._remove_op(idx)
+    main_program.desc.flush()
+
+    in_out_pairs = zip(prefetch_op_tuples[1], prefetch_op_tuples[2])
+
+    for in_out_pair in in_out_pairs:
+        idx = split_ids_id
+        ids = main_program.global_block().vars[in_out_pair[0]]
+        out = main_program.global_block().vars[in_out_pair[1]]
+        __insert_lookup_sparse_table_op(main_program, idx, ids, param_var, out)
+        main_program.desc.flush()
+    return main_program
+
+
+def load_persistable_vars(executor, dirname, program, lookup_table_var):
+    def _is_checkpoint_var(exclude_fluid_vars=None):
+        """
+        the checkpoint will not save or load all the variables.
+        var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
+
+        : param var(Variable)
+        """
+
+        if exclude_fluid_vars is None:
+            exclude_fluid_vars = []
+
+        def is_valid(var):
+            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+                    var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+                    var.desc.type() == core.VarDesc.VarType.RAW:
+                return False
+            # @GRAD are named for gradient variables, checkpoint will not save it.
+            if "@GRAD" in var.name:
+                return False
+            # .trainer_ are named for distribute train variables, checkpoint will not save it.
+            if ".trainer_" in var.name:
+                return False
+
+            # .block is named for distribute train variables, checkpoint will not save it.
+            if ".block" in var.name:
+                return False
+
+            if "tmp_" in var.name:
+                return False
+
+            if var.name in exclude_fluid_vars:
+                return False
+
+            return var.persistable
+
+        return is_valid
+
+    def _load_lookup_table_vars(executor, dirname, main_program,
+                                lookup_table_vars):
+        if not os.path.isdir(dirname):
+            raise ValueError("There is no directory named '%s'", dirname)
+
+        lookup_table_dirname = os.path.join(dirname, lookup_table_dir)
+
+        emb_var_name = lookup_table_vars[0]
+        emb_var = main_program.global_block().var(emb_var_name)
+
+        emb_files = []
+        for emb_name in os.listdir(lookup_table_dirname):
+            if emb_var_name in emb_name:
+                emb_files.append(emb_name)
+
+        convert_program = Program()
+        global_block = convert_program.global_block()
+
+        emb_var = global_block.create_var(
+            name=emb_var.name,
+            shape=emb_var.shape,
+            dtype=emb_var.dtype,
+            type=core.VarDesc.VarType.SELECTED_ROWS,
+            persistable=True)
+        emb_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
+
+        sums = []
+
+        for i, emb_file in enumerate(emb_files):
+            var_name = "{}_{}".format(emb_var.name, i)
+            param_var = global_block.create_var(
+                name=var_name,
+                shape=emb_var.shape,
+                dtype=emb_var.dtype,
+                type=core.VarDesc.VarType.SELECTED_ROWS,
+                persistable=True)
+            param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
+            global_block.append_op(
+                type='load',
+                inputs={},
+                outputs={'Out': [param_var]},
+                attrs={
+                    'file_path': os.path.join(lookup_table_dirname, var_name)
+                })
+            sums.append(param_var)
+        global_block.append_op(
+            type='sum', inputs={"X": sums}, outputs={'Out': emb_var}, attrs={})
+        global_block.append_op(type='delete_var', inputs={'X': sums})
+        executor.run(convert_program)
+
+    _logger.info("Start Load Sparse Program With "
+                 "Distributed Lookup Table Vars from {}, time = {}".format(
+                     dirname, time.ctime()))
+
+    lookup_table_vars = [lookup_table_var]
+
+    io.load_vars(
+        executor,
+        dirname=dirname,
+        main_program=program,
+        predicate=_is_checkpoint_var(lookup_table_vars),
+        filename=None)
+
+    _load_lookup_table_vars(executor, dirname, program, lookup_table_vars)
+
+    _logger.info("Finish Load Sparse Program With "
+                 "Distributed Lookup Table Vars from {}, time = {}".format(
+                     dirname, time.ctime()))
+
+
+def load_inference_model(dirname, executor, lookup_table_var_name):
+    if not os.path.isdir(dirname):
+        raise ValueError("There is no directory named '%s'", dirname)
+
+    local_model = os.path.join(dirname, model_filename)
+
+    with open(local_model, "rb") as f:
+        program_desc_str = f.read()
+
+    program = Program.parse_from_string(program_desc_str)
+
+    if not core._is_program_version_supported(program._version()):
+        raise ValueError("Unsupported program version: %d\n" %
+                         program._version())
+
+    # Binary data also need version.
+    load_persistable_vars(executor, dirname, program, lookup_table_var_name)
+
+    feed_target_names = program.desc.get_feed_target_names()
+    fetch_target_names = program.desc.get_fetch_target_names()
+    fetch_targets = [
+        program.global_block().var(name) for name in fetch_target_names
+    ]
+
+    return [program, feed_target_names, fetch_targets]
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index fd03dff386..b991187d42 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1698,6 +1698,7 @@ class Program(object):
 
         p._copy_param_info_from(self)
         p._copy_data_info_from(self)
+        p._copy_dist_param_info_from(self)
         return p
 
     def _prune(self, targets):
@@ -1938,6 +1939,25 @@ class Program(object):
                              "program, with represent the same topology")
         self.global_block()._copy_param_info_from(other.global_block())
 
+    def _copy_dist_param_info_from(self, other):
+        """
+        Copy the information of distributed information from other program.
+
+        Args:
+            other(Program): Other program
+
+        Returns:
+            None
+        """
+        if not isinstance(other, Program):
+            raise TypeError("_copy_dist_param_info_from should be invoked with "
+                            "Program")
+        self._is_distributed = other._is_distributed
+        self._is_chief = other._is_chief
+        self._slice_vars_and_attrs = other._slice_vars_and_attrs
+        self._endpoints = other._endpoints
+        self._distributed_lookup_table = other._distributed_lookup_table
+
     def _copy_data_info_from(self, other):
         """
         Copy the information of data variables from other program.
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 8936d884dd..26d7af87b3 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -165,6 +165,7 @@ def save_vars(executor,
 
         save_vars(
             executor,
+            main_program=main_program,
             dirname=dirname,
             vars=list(filter(predicate, main_program.list_vars())),
             filename=filename)
@@ -172,11 +173,18 @@ def save_vars(executor,
         save_program = Program()
         save_block = save_program.global_block()
 
+        if main_program is None:
+            main_program = default_main_program()
+        if not isinstance(main_program, Program):
+            raise TypeError("program should be as Program type or None")
+
         save_var_map = {}
         for each_var in vars:
             # NOTE: don't save the variable which type is RAW
             if each_var.type == core.VarDesc.VarType.RAW:
                 continue
+            if each_var.name == main_program._distributed_lookup_table:
+                continue
             new_var = _clone_var_in_block_(save_block, each_var)
             if filename is None:
                 save_block.append_op(
@@ -198,6 +206,16 @@ def save_vars(executor,
                 outputs={},
                 attrs={'file_path': os.path.join(dirname, filename)})
 
+        # if there is lookup table, the trainer 0 will notify all pserver to save.
+        if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
+            lookup_table_filename = os.path.join(dirname, "__lookup_table__")
+            attrs = {}
+            attrs['epmap'] = main_program._endpoints
+            attrs['dir'] = lookup_table_filename
+            attrs['lookup_table'] = main_program._distributed_lookup_table
+            save_block.append_op(
+                type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
+
         executor.run(save_program)
 
 
@@ -379,11 +397,22 @@ def load_vars(executor,
         load_prog = Program()
         load_block = load_prog.global_block()
 
+        if main_program is None:
+            main_program = default_main_program()
+        if not isinstance(main_program, Program):
+            raise TypeError("program should be as Program type or None")
+
+        load_slice_vars = []
+        for each_var in main_program._slice_vars_and_attrs:
+            load_slice_vars.append(each_var[2].name)
+
         load_var_map = {}
         for each_var in vars:
             assert isinstance(each_var, Variable)
             if each_var.type == core.VarDesc.VarType.RAW:
                 continue
+            if each_var.name in load_slice_vars:
+                continue
             new_var = _clone_var_in_block_(load_block, each_var)
             if filename is None:
                 load_block.append_op(
@@ -406,9 +435,6 @@ def load_vars(executor,
                 attrs={'file_path': os.path.join(dirname, filename)})
         executor.run(load_prog)
 
-        if main_program is None:
-            main_program = default_main_program()
-
         # load slice vars on pserver, if have it.
         _load_slice_up_vars(executor, dirname,
                             main_program._slice_vars_and_attrs)
@@ -618,13 +644,6 @@ def save_inference_model(dirname,
     if main_program is None:
         main_program = default_main_program()
 
-    # if there is lookup table, the trainer 0 will notify all pserver to save.
-    if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
-        lookup_table_filename = os.path.join(dirname, "__lookup_table__")
-        _save_lookup_tables_by_notify(executor, lookup_table_filename,
-                                      main_program._distributed_lookup_table,
-                                      main_program._endpoints)
-
     # when a pserver and a trainer running on the same machine, mkdir may conflict
     try:
         os.makedirs(dirname)
@@ -642,6 +661,9 @@ def save_inference_model(dirname,
     # it can only be loaded for inference directly. If it's false, the whole
     # original program and related meta are saved so that future usage can be
     # more flexible.
+
+    origin_program = main_program.clone()
+
     if export_for_deployment:
         main_program = main_program.clone()
         global_block = main_program.global_block()
@@ -666,8 +688,11 @@ def save_inference_model(dirname,
         with open(model_basename + ".main_program", "wb") as f:
             f.write(main_program.desc.serialize_to_string())
 
+    main_program._copy_dist_param_info_from(origin_program)
+
     if params_filename is not None:
         params_filename = os.path.basename(params_filename)
+
     save_persistables(executor, dirname, main_program, params_filename)
 
 
@@ -897,6 +922,9 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
         slice_var = var_tuple[2]
         end = start + slice_var.shape[0]
 
+        orig_var_name = orig_var.name
+        orig_var.name = "{}.origin".format(orig_var_name)
+
         clone_orig_var = load_block.create_var(
             name=orig_var.name,
             type=orig_var.type,
@@ -915,7 +943,7 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
             type='load',
             inputs={},
             outputs={'Out': [clone_orig_var]},
-            attrs={'file_path': os.path.join(dirname, clone_orig_var.name)})
+            attrs={'file_path': os.path.join(dirname, orig_var_name)})
         load_block.append_op(
             type="slice",
             inputs={'Input': clone_orig_var},
@@ -924,6 +952,7 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
                    'starts': [start],
                    'ends': [end]})
         need_delete_vars.append(clone_orig_var)
+
     load_block.append_op(
         type='delete_var',
         inputs={'X': need_delete_vars}, )
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 89bc248027..ebd0d18d36 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -644,6 +644,9 @@ in a single call.")
             else:
                 recv_inputs.append(single_trainer_var)
 
+        self._slice_params_and_optimizes = self._get_slice_vars_and_attrs(
+            endpoint)
+
         # step 3
         # Create a union-find data structure from optimize ops,
         # If two ops are connected, we could add these two ops
@@ -766,7 +769,7 @@ in a single call.")
                                                grad_to_block_id, merged_var,
                                                lr_ops)
 
-# dedup grad to ids list
+        # dedup grad to ids list
         grad_to_block_id = list(set(grad_to_block_id))
         # append global ops
         if global_ops:
@@ -827,8 +830,8 @@ in a single call.")
             attrs=attrs)
 
         # add distributed attrs
-        pserver_program._slice_vars_and_attrs = self._get_slice_vars_and_attrs(
-            endpoint)
+        pserver_program._slice_vars_and_attrs = list(
+            self._slice_params_and_optimizes.values())
 
         pserver_program._sync_with_cpp()
         # save pserver program to generate pserver side startup relatively.
@@ -941,12 +944,12 @@ to transpile() call.")
                     outputs={"Out": startup_tmpvar})
 
         # add slice vars
-        s_prog._slice_vars_and_attrs = self._get_slice_vars_and_attrs(endpoint)
+        s_prog._slice_vars_and_attrs = pserver_program._slice_vars_and_attrs
 
         return s_prog
 
     def _get_slice_vars_and_attrs(self, endpoint):
-        slice_vars_and_attrs = []
+        slice_vars_and_attrs = {}
         block_suffix = "block"
         for param in self.param_grad_ep_mapping[endpoint]["params"]:
             orig_var_name, block_name, _ = self._get_varname_parts(param.name)
@@ -960,8 +963,7 @@ to transpile() call.")
             slice_vars = self.param_var_mapping[orig_var_name]
             for slice_var in slice_vars[:block_idx]:
                 skip_dim0 += slice_var.shape[0]
-            slice_vars_and_attrs.append([orig_var, skip_dim0, param])
-
+            slice_vars_and_attrs[param.name] = [orig_var, skip_dim0, param]
         return slice_vars_and_attrs
 
     # ====================== private transpiler functions =====================
@@ -1662,10 +1664,10 @@ to transpile() call.")
             if key in ["Param", "Grad", "LearningRate"]:
                 continue
             var = self.origin_program.global_block().vars[opt_op.input(key)[0]]
+            param_var = new_inputs["Param"]
             # update accumulator variable shape
-            param_shape = new_inputs["Param"].shape
-            new_shape = self._get_optimizer_input_shape(opt_op.type, key,
-                                                        var.shape, param_shape)
+            new_shape = self._get_optimizer_input_shape(
+                opt_op.type, key, var.shape, param_var.shape)
             tmpvar = pserver_block.create_var(
                 name=var.name,
                 persistable=var.persistable,
@@ -1673,6 +1675,13 @@ to transpile() call.")
                 shape=new_shape)
             new_inputs[key] = tmpvar
 
+            # var shape been changed
+            if new_shape != var.shape:
+                slice_var_args = self._slice_params_and_optimizes[
+                    param_var.name]
+                self._slice_params_and_optimizes[
+                    var.name] = [var, slice_var_args[1], tmpvar]
+
         # change output's ParamOut variable
         outputs = self._get_output_map_from_op(
             self.origin_program.global_block().vars, opt_op)

From 5c073a4db215f3c441ed76145398131c2b561708 Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Mon, 26 Nov 2018 15:53:37 +0800
Subject: [PATCH 106/113] fix transfer cache thread_local bug (#14581)

---
 .../fluid/framework/transfer_scope_cache.cc   | 38 +++++++------------
 1 file changed, 14 insertions(+), 24 deletions(-)

diff --git a/paddle/fluid/framework/transfer_scope_cache.cc b/paddle/fluid/framework/transfer_scope_cache.cc
index e52a8317e2..f6219a1417 100644
--- a/paddle/fluid/framework/transfer_scope_cache.cc
+++ b/paddle/fluid/framework/transfer_scope_cache.cc
@@ -17,16 +17,28 @@
 namespace paddle {
 namespace framework {
 
+// Holds all the transfer scope across the process.
 std::unordered_map<size_t, Scope*>& global_transfer_data_cache() {
-  thread_local auto* x = new std::unordered_map<size_t, Scope*>;
+  typedef std::unordered_map<size_t, Scope*> map_t;
+  thread_local std::unique_ptr<map_t> x(new map_t);
   return *x;
 }
 
+// Holds all the transfer scope for this thread.
 std::unordered_set<Scope*>& global_transfer_scope_cache() {
-  thread_local auto* x = new std::unordered_set<Scope*>;
+  typedef std::unordered_set<Scope*> set_t;
+  thread_local std::unique_ptr<set_t> x(new set_t);
   return *x;
 }
 
+// Try to create a transfer scope. If one cached scope has match the
+// requirement, just return that one.
+// Inputs:
+// @type0: the source kernel type.
+// @type1: the target kernel type.
+// @scope: the execution scope of this op.
+// Returns: A scope used to hold the transfer data across the different kernel
+// type.
 Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
                               const Scope* scope) {
   Scope* new_scope{nullptr};
@@ -46,27 +58,5 @@ Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
   return new_scope;
 }
 
-void RemoveKidsFromTransferScopeCache(Scope* scope) {
-  auto it = global_transfer_scope_cache().find(scope);
-  if (it != global_transfer_scope_cache().end()) {
-    global_transfer_scope_cache().erase(it);
-  }
-  for (auto* s : scope->kids()) {
-    auto it = global_transfer_scope_cache().find(s);
-    if (it != global_transfer_scope_cache().end()) {
-      global_transfer_scope_cache().erase(it);
-    }
-  }
-
-  // remove global transfer data cache
-  auto& cache = global_transfer_data_cache();
-  for (auto it = cache.begin(); it != cache.end();) {
-    if (it->second == scope)
-      it = cache.erase(it);
-    else
-      it++;
-  }
-}
-
 }  // namespace framework
 }  // namespace paddle

From 6224e61fd94e6ad87f18c2808a76256b516fa3f3 Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Mon, 26 Nov 2018 15:54:39 +0800
Subject: [PATCH 107/113]  Transpose-Flatten-Concat fusion operator.  (#14568)

* Transpose-Flatten-Concat fusion operator.
* Add unit testing and fix bug.
---
 cmake/operators.cmake                         |   3 +-
 paddle/fluid/operators/fused/CMakeLists.txt   |   6 +-
 .../fusion_transpose_flatten_concat_op.cc     | 114 +++++++++++++++++
 .../fusion_transpose_flatten_concat_op.cu.cc  | 115 ++++++++++++++++++
 .../fusion_transpose_flatten_concat_op.h      |  50 ++++++++
 ...test_fusion_transpose_flatten_concat_op.py | 105 ++++++++++++++++
 6 files changed, 391 insertions(+), 2 deletions(-)
 create mode 100644 paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc
 create mode 100644 paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
 create mode 100644 paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_fusion_transpose_flatten_concat_op.py

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 17107e0698..89726bf985 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -109,7 +109,8 @@ function(op_library TARGET)
 
     # Define operators that don't need pybind here.
     foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
-"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op")
+"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
+"fusion_transpose_flatten_concat_op")
         if ("${TARGET}" STREQUAL "${manual_pybind_op}")
             set(pybind_flag 1)
         endif()
diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt
index 5d468316e8..a0397acab1 100644
--- a/paddle/fluid/operators/fused/CMakeLists.txt
+++ b/paddle/fluid/operators/fused/CMakeLists.txt
@@ -1,2 +1,6 @@
 include(operators)
-register_operators()
+register_operators(EXCLUDES fusion_transpose_flatten_concat_op)
+if (WITH_GPU)
+  op_library(fusion_transpose_flatten_concat_op)
+  file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n")
+endif()
diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc
new file mode 100644
index 0000000000..39356c9afc
--- /dev/null
+++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc
@@ -0,0 +1,114 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class TransposeFlattenConcatFusionOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
+                      "Inputs(X) of ConcatOp should be empty.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ConcatOp should not be null.");
+
+    auto ins = ctx->GetInputsDim("X");
+    const size_t n = ins.size();
+    PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0.");
+
+    std::vector<int> trans_axis =
+        ctx->Attrs().Get<std::vector<int>>("trans_axis");
+    int flatten_axis = ctx->Attrs().Get<int>("flatten_axis");
+    int concat_axis = ctx->Attrs().Get<int>("concat_axis");
+
+    size_t x_rank = ins[0].size();
+    size_t trans_axis_size = trans_axis.size();
+    PADDLE_ENFORCE_EQ(x_rank, trans_axis_size,
+                      "The input tensor's rank(%d) "
+                      "should be equal to the permutation axis's size(%d)",
+                      x_rank, trans_axis_size);
+
+    auto dims0 =
+        GetFlattenShape(flatten_axis, GetPermuteShape(trans_axis, ins[0]));
+    std::vector<int> out_dims(dims0);
+    for (size_t i = 1; i < n; i++) {
+      auto dimsi =
+          GetFlattenShape(flatten_axis, GetPermuteShape(trans_axis, ins[i]));
+      for (int j = 0; j < static_cast<int>(dims0.size()); j++) {
+        if (j == concat_axis) {
+          out_dims[concat_axis] += dimsi[j];
+        } else {
+          PADDLE_ENFORCE_EQ(out_dims[j], dimsi[j],
+                            "After flatting, the %d-th dim should be save "
+                            "except the specify axis.",
+                            j);
+        }
+      }
+    }
+    if (out_dims[concat_axis] < 0) {
+      out_dims[concat_axis] = -1;
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
+  }
+};
+
+class TransposeFlattenConcatFusionOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor, tensors with rank up to 6 are supported.")
+        .AsDuplicable();
+    AddOutput("Out", "(Tensor)The output tensor.");
+    AddAttr<std::vector<int>>(
+        "trans_axis",
+        "(vector<int>) A list of values, and the size of the list should be "
+        "the same with the input tensor rank. This operator permutes the input "
+        "tensor's axes according to the values given.");
+    AddAttr<int>("flatten_axis",
+                 "(int)"
+                 "Indicate up to which input dimensions (exclusive) should be"
+                 "flattened to the outer dimension of the output. The value"
+                 "for axis must be in the range [0, R], where R is the rank of"
+                 "the input tensor. When axis = 0, the shape of the output"
+                 "tensor is (1, (d_0 X d_1 ... d_n), where the shape of the"
+                 "input tensor is (d_0, d_1, ... d_n).");
+    AddAttr<int>("concat_axis",
+                 "The axis along which the input tensors will be concatenated. "
+                 "It should be 0 or 1, since the tensor is 2D after flatting.");
+    AddComment(R"DOC(
+
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fusion_transpose_flatten_concat,
+                  ops::TransposeFlattenConcatFusionOp,
+                  ops::TransposeFlattenConcatFusionOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
new file mode 100644
index 0000000000..6ccb670d73
--- /dev/null
+++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h"
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+using CudnnDataType = platform::CudnnDataType<T>;
+
+template <typename T>
+class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+    auto odims = out->dims();
+
+    std::vector<int> trans_axis = ctx.Attr<std::vector<int>>("trans_axis");
+    int flatten_axis = ctx.Attr<int>("flatten_axis");
+    int concat_axis = ctx.Attr<int>("concat_axis");
+
+    int rank = ins[0]->dims().size();
+    // use at least 4D in cudnnTransformTensor
+    int max_dim = rank < 4 ? 4 : rank;
+    std::vector<int> stride_x(max_dim, 0);
+    std::vector<int> stride_y(max_dim, 0);
+    std::vector<int> dims_y(max_dim, 0);
+
+    cudnnTensorDescriptor_t in_desc;
+    cudnnTensorDescriptor_t out_desc;
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&in_desc));
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&out_desc));
+    cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;
+
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+
+    T* odata = out->data<T>();
+    for (size_t k = 0; k < ins.size(); ++k) {
+      auto perm_shape = GetPermuteShape(trans_axis, ins[k]->dims());
+      int osize = 1;
+      auto idims = ins[k]->dims();
+      for (int i = 0; i < rank; i++) {
+        stride_x[i] = 1;
+        for (int j = trans_axis[i] + 1; j < rank; j++) {
+          stride_x[i] *= idims[j];
+        }
+        dims_y[i] = perm_shape[i];
+        osize *= perm_shape[i];
+      }
+      stride_y[rank - 1] = 1;
+      for (int i = rank - 2; i >= 0; i--) {
+        if (((i + 1) == flatten_axis) && (concat_axis == 1)) {
+          stride_y[i] = odims[1];
+        } else {
+          stride_y[i] = stride_y[i + 1] * perm_shape[i + 1];
+        }
+      }
+
+      // Since concat is aftern flatten, the output is 2D tensor.
+      // If concat_axis is 0, each input's permutated tensor is continuous.
+      // If concat_axis is 1, the stride of 0-th dim of each input's
+      // permutated tensor is odims()[1].
+
+      for (int i = rank; i < max_dim; i++) {
+        stride_x[i] = 1;
+        stride_y[i] = 1;
+        dims_y[i] = 1;
+      }
+
+      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+          in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()));
+      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+          out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()));
+
+      CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor(
+          handle, CudnnDataType<T>::kOne(), in_desc,
+          static_cast<const void*>(ins[k]->data<T>()),
+          CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)));
+      if (concat_axis == 0) {
+        odata += osize;
+      } else {
+        auto flat_shape = GetFlattenShape(flatten_axis, perm_shape);
+        odata += flat_shape[1];
+      }
+    }
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(in_desc));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(out_desc));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(fusion_transpose_flatten_concat,
+                        ops::TransposeFlattenConcatFusionKernel<float>,
+                        ops::TransposeFlattenConcatFusionKernel<double>);
diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h
new file mode 100644
index 0000000000..66d5bea679
--- /dev/null
+++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/ddim.h"
+
+namespace paddle {
+namespace operators {
+
+inline std::vector<int32_t> GetPermuteShape(const std::vector<int>& axis,
+                                            const framework::DDim& in_dims) {
+  std::vector<int32_t> out_dims(in_dims.size());
+  for (size_t i = 0; i < axis.size(); i++) {
+    out_dims[i] = in_dims[axis[i]];
+  }
+  return out_dims;
+}
+
+inline std::vector<int32_t> GetFlattenShape(const int axis,
+                                            const std::vector<int>& in_dims) {
+  int64_t outer = 1, inner = 1;
+  for (int i = 0; i < static_cast<int>(in_dims.size()); ++i) {
+    if (i < axis) {
+      outer *= in_dims[i];
+    } else {
+      inner *= in_dims[i];
+    }
+  }
+  std::vector<int32_t> out_shape(2);
+  out_shape[0] = outer;
+  out_shape[1] = inner;
+  return out_shape;
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_transpose_flatten_concat_op.py b/python/paddle/fluid/tests/unittests/test_fusion_transpose_flatten_concat_op.py
new file mode 100644
index 0000000000..4aa7f76495
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fusion_transpose_flatten_concat_op.py
@@ -0,0 +1,105 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+
+
+class TestFusionTransposeFlattenConcationOp(OpTest):
+    def setUp(self):
+        self.init_test_case()
+        self.op_type = "fusion_transpose_flatten_concat"
+
+        ins = []
+        flats = []
+        for i in range(len(self.shapes)):
+            in_shape = self.shapes[i]
+            a = np.random.random(in_shape).astype("float32")
+            ins.append(("x%d" % i, a))
+
+            b = a.transpose(self.trans_axis)
+            flat_shape = (np.prod(b.shape[:self.flatten_axis]),
+                          np.prod(b.shape[self.flatten_axis:]))
+            c = b.reshape(flat_shape)
+            flats.append(c)
+        out = np.concatenate(flats, axis=self.concat_axis)
+
+        self.inputs = {'X': ins}
+        self.attrs = {
+            'trans_axis': list(self.trans_axis),
+            'flatten_axis': self.flatten_axis,
+            'concat_axis': self.concat_axis
+        }
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, 1e-6)
+        else:
+            pass
+
+    def init_test_case(self):
+        self.shapes = [(3, 4, 17, 17), (3, 8, 7, 7), (3, 12, 5, 5)]
+        self.trans_axis = (0, 2, 3, 1)
+        self.flatten_axis = 1
+        self.concat_axis = 1
+
+
+class TestCase1(TestFusionTransposeFlattenConcationOp):
+    def init_test_case(self):
+        self.shapes = [(3, 4, 18, 17), (3, 8, 18, 7), (6, 12, 9, 5)]
+        self.trans_axis = (0, 2, 3, 1)
+        self.flatten_axis = 2
+        self.concat_axis = 1
+
+
+class TestCase2(TestFusionTransposeFlattenConcationOp):
+    def init_test_case(self):
+        self.shapes = [(3, 8, 20, 17), (3, 8, 19, 17), (3, 8, 40, 17)]
+        self.trans_axis = (0, 2, 3, 1)
+        self.flatten_axis = 2
+        self.concat_axis = 0
+
+
+class TestCase3(TestFusionTransposeFlattenConcationOp):
+    def init_test_case(self):
+        self.shapes = [(3, 8, 20, 17), (3, 8, 19, 17), (3, 8, 40, 17)]
+        self.trans_axis = (0, 3, 2, 1)
+        self.flatten_axis = 1
+        self.concat_axis = 1
+
+
+class TestCase4(TestFusionTransposeFlattenConcationOp):
+    def init_test_case(self):
+        self.shapes = [(3, 8, 9, 17), (8, 3, 9, 17), (4, 6, 9, 17)]
+        self.trans_axis = (0, 2, 1, 3)
+        self.flatten_axis = 3
+        self.concat_axis = 1
+
+
+class TestCase5(TestFusionTransposeFlattenConcationOp):
+    def init_test_case(self):
+        self.shapes = [(3, 8, 9, 17, 2), (3, 8, 2, 17, 9), (3, 17, 9, 8, 2)]
+        self.trans_axis = (0, 2, 1, 4, 3)
+        self.flatten_axis = 1
+        self.concat_axis = 1
+
+
+if __name__ == '__main__':
+    unittest.main()

From 1f0291a51e52098e84e55dccff7300a2edbd1b32 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Mon, 26 Nov 2018 08:38:37 +0000
Subject: [PATCH 108/113] add comments and follow comments

test=develop
---
 .../fluid/operators/math/jit_kernel_refer.h   |  4 +++
 .../fluid/operators/math/jit_kernel_test.cc   | 25 ++++++++++---------
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h
index bcb6615df8..e0b2e3c7fa 100644
--- a/paddle/fluid/operators/math/jit_kernel_refer.h
+++ b/paddle/fluid/operators/math/jit_kernel_refer.h
@@ -116,6 +116,7 @@ void (*getActFunc(const std::string& type))(const T*, T*, int) {  // NOLINT
   return nullptr;
 }
 
+// compute ct and ht
 template <typename T>
 void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) {
   T* gates = reinterpret_cast<T*>(step->gates);
@@ -199,6 +200,7 @@ void GRUH1(gru_t* step, const gru_attr_t* attr) {
   VMul(gates, gates + d2, ht, d);
 }
 
+// compute the first part of GRU: ht = act_gate(r) * ht_1
 template <typename T>
 void GRUHtPart1(gru_t* step, const gru_attr_t* attr) {
   // W: {W_update, W_reset; W_state}
@@ -210,6 +212,8 @@ void GRUHtPart1(gru_t* step, const gru_attr_t* attr) {
   VMul(ht_1, gates + attr->d, ht, attr->d);
 }
 
+// compute the second part of GRU:
+// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1
 template <typename T>
 void GRUHtPart2(gru_t* step, const gru_attr_t* attr) {
   T* gates = reinterpret_cast<T*>(step->gates);
diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc
index cc8a5d4d86..6476e95dc0 100644
--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
@@ -86,7 +86,7 @@ TEST(JitKernel, vrelu) {
         vrelu_intri8(d, x_data, zref_data);
       }
       auto si1 = GetCurrentUS();
-      VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
+      VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat << " us";
     }
 #endif
     auto ttgts = GetCurrentUS();
@@ -96,7 +96,7 @@ TEST(JitKernel, vrelu) {
     auto ttgte = GetCurrentUS();
     VLOG(30) << "Vec size " << d
              << ": refer takes: " << (trefe - trefs) / repeat
-             << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+             << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us";
     for (int i = 0; i < d; ++i) {
       EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
     }
@@ -129,7 +129,7 @@ TEST(JitKernel, vaddbias) {
 
     VLOG(30) << "Vec size " << d
              << ": refer takes: " << (trefe - trefs) / repeat
-             << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+             << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us";
     for (int i = 0; i < d; ++i) {
       EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
     }
@@ -182,7 +182,7 @@ TEST(JitKernel, vexp) {
 #else
              << " us, "
 #endif
-             << "tgt takes: " << (ttgte - ttgts) / repeat;
+             << "tgt takes: " << (ttgte - ttgts) / repeat << " us";
     for (int i = 0; i < d; ++i) {
       EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
     }
@@ -238,7 +238,7 @@ TEST(JitKernel, vsigmoid) {
     VLOG(30) << "Vec size " << d
              << ": refer takes: " << (trefe - trefs) / repeat
              << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
-             << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+             << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us";
     for (int i = 0; i < d; ++i) {
       EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
     }
@@ -299,7 +299,7 @@ TEST(JitKernel, vtanh) {
     VLOG(30) << "Vec size " << d
              << ": refer takes: " << (trefe - trefs) / repeat
              << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
-             << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+             << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us";
     for (int i = 0; i < d; ++i) {
       EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
     }
@@ -400,7 +400,7 @@ TEST(JitKernel, lstm) {
     VLOG(30) << "Vec size " << d
              << ": refer takes: " << (trefe - trefs) / repeat
              << " us, better(jit) takes: " << (tmkle - tmkls) / repeat
-             << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+             << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us";
   }
 }
 
@@ -474,7 +474,7 @@ TEST(JitKernel, vscal) {
       }
       auto si3 = GetCurrentUS();
       VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat
-               << " us, inplace: " << (si3 - si2) / repeat;
+               << " us, inplace: " << (si3 - si2) / repeat << " us";
     }
 #endif
 
@@ -498,7 +498,8 @@ TEST(JitKernel, vscal) {
              << " us, "
 #endif
              << "tgt takes: " << (ttgte - ttgts) / repeat
-             << "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat;
+             << "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat
+             << " us";
     for (int i = 0; i < d; ++i) {
       EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
     }
@@ -573,7 +574,7 @@ TEST(JitKernel, vmul) {
 #else
              << " us, "
 #endif
-             << "tgt takes: " << (ttgte - ttgts) / repeat;
+             << "tgt takes: " << (ttgte - ttgts) / repeat << " us";
     for (int i = 0; i < d; ++i) {
       EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
     }
@@ -648,7 +649,7 @@ TEST(JitKernel, vadd) {
 #else
              << " us, "
 #endif
-             << "tgt takes: " << (ttgte - ttgts) / repeat;
+             << "tgt takes: " << (ttgte - ttgts) / repeat << " us";
     for (int i = 0; i < d; ++i) {
       EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
     }
@@ -701,7 +702,7 @@ TEST(JitKernel, vaddrelu) {
     VLOG(30) << "Vec size " << d
              << ": refer takes: " << (trefe - trefs) / repeat
              << " us, better takes: " << (tmkle - tmkls) / repeat << " us, "
-             << "tgt takes: " << (ttgte - ttgts) / repeat;
+             << "tgt takes: " << (ttgte - ttgts) / repeat << " us";
     for (int i = 0; i < d; ++i) {
       EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
     }

From 78f563917ca95d718ce3fe70fe147d67cf8d4f20 Mon Sep 17 00:00:00 2001
From: dengkaipeng <dengkaipeng@baidu.com>
Date: Mon, 26 Nov 2018 16:28:21 +0800
Subject: [PATCH 110/113] revert interpolate_op to bilinear_interp_op &
 nearest_interp_op. test=develop

---
 paddle/fluid/operators/interpolate_op.cc      |  16 +-
 paddle/fluid/operators/interpolate_op.cu      |  10 +-
 python/paddle/fluid/layers/nn.py              |  17 +-
 ...olate_op.py => test_bilinear_interp_op.py} | 165 ++-------------
 .../tests/unittests/test_nearest_interp_op.py | 197 ++++++++++++++++++
 5 files changed, 242 insertions(+), 163 deletions(-)
 rename python/paddle/fluid/tests/unittests/{test_interpolate_op.py => test_bilinear_interp_op.py} (53%)
 create mode 100644 python/paddle/fluid/tests/unittests/test_nearest_interp_op.py

diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc
index 8f979e05d3..203291ed61 100644
--- a/paddle/fluid/operators/interpolate_op.cc
+++ b/paddle/fluid/operators/interpolate_op.cc
@@ -132,11 +132,19 @@ class InterpolateOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(interpolate, ops::InterpolateOp, ops::InterpolateOpMaker,
+REGISTER_OPERATOR(bilinear_interp, ops::InterpolateOp, ops::InterpolateOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(interpolate_grad, ops::InterpolateOpGrad);
-REGISTER_OP_CPU_KERNEL(interpolate, ops::InterpolateKernel<float>,
+REGISTER_OPERATOR(bilinear_interp_grad, ops::InterpolateOpGrad);
+REGISTER_OPERATOR(nearest_interp, ops::InterpolateOp, ops::InterpolateOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(nearest_interp_grad, ops::InterpolateOpGrad);
+REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::InterpolateKernel<float>,
+                       ops::InterpolateKernel<double>,
+                       ops::InterpolateKernel<uint8_t>);
+REGISTER_OP_CPU_KERNEL(bilinear_interp_grad, ops::InterpolateGradKernel<float>,
+                       ops::InterpolateGradKernel<double>);
+REGISTER_OP_CPU_KERNEL(nearest_interp, ops::InterpolateKernel<float>,
                        ops::InterpolateKernel<double>,
                        ops::InterpolateKernel<uint8_t>);
-REGISTER_OP_CPU_KERNEL(interpolate_grad, ops::InterpolateGradKernel<float>,
+REGISTER_OP_CPU_KERNEL(nearest_interp_grad, ops::InterpolateGradKernel<float>,
                        ops::InterpolateGradKernel<double>);
diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu
index 190afbdac4..99ac725f73 100644
--- a/paddle/fluid/operators/interpolate_op.cu
+++ b/paddle/fluid/operators/interpolate_op.cu
@@ -284,9 +284,15 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(interpolate, ops::InterpolateOpCUDAKernel<float>,
+REGISTER_OP_CUDA_KERNEL(bilinear_interp, ops::InterpolateOpCUDAKernel<float>,
                         ops::InterpolateOpCUDAKernel<double>,
                         ops::InterpolateOpCUDAKernel<int>);
-REGISTER_OP_CUDA_KERNEL(interpolate_grad,
+REGISTER_OP_CUDA_KERNEL(bilinear_interp_grad,
+                        ops::InterpolateGradOpCUDAKernel<float>,
+                        ops::InterpolateGradOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(nearest_interp, ops::InterpolateOpCUDAKernel<float>,
+                        ops::InterpolateOpCUDAKernel<double>,
+                        ops::InterpolateOpCUDAKernel<int>);
+REGISTER_OP_CUDA_KERNEL(nearest_interp_grad,
                         ops::InterpolateGradOpCUDAKernel<float>,
                         ops::InterpolateGradOpCUDAKernel<double>);
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 7af1f380e7..f0c5e67cca 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -5870,9 +5870,10 @@ def image_resize(input,
         raise ValueError(
             "The 'resample' of image_resize can only be 'BILINEAR' or 'NEAREST' currently."
         )
+    resample_type = resample_methods[resample]
     if out_shape is None and scale is None:
         raise ValueError("One of out_shape and scale must not be None.")
-    helper = LayerHelper('interpolate', **locals())
+    helper = LayerHelper('{}_interp'.format(resample_type), **locals())
     dtype = helper.input_dtype()
 
     def _is_list_or_turple_(data):
@@ -5906,18 +5907,16 @@ def image_resize(input,
 
     out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
-        type='interpolate',
+        type='{}_interp'.format(resample_type),
         inputs=inputs,
         outputs={"Out": out},
-        attrs={
-            "out_h": out_h,
-            "out_w": out_w,
-            "interp_method": resample_methods[resample]
-        })
+        attrs={"out_h": out_h,
+               "out_w": out_w,
+               "interp_method": resample_type})
     return out
 
 
-@templatedoc(op_type="interpolate")
+@templatedoc(op_type="bilinear_interp")
 def resize_bilinear(input,
                     out_shape=None,
                     scale=None,
@@ -5973,7 +5972,7 @@ def resize_bilinear(input,
     return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape)
 
 
-@templatedoc(op_type="interpolate")
+@templatedoc(op_type="nearest_interp")
 def resize_nearest(input,
                    out_shape=None,
                    scale=None,
diff --git a/python/paddle/fluid/tests/unittests/test_interpolate_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
similarity index 53%
rename from python/paddle/fluid/tests/unittests/test_interpolate_op.py
rename to python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
index 9748d094cd..c8a7063dc1 100644
--- a/python/paddle/fluid/tests/unittests/test_interpolate_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
@@ -20,36 +20,6 @@ from op_test import OpTest
 import paddle.fluid.core as core
 
 
-def nearest_neighbor_interp_np(X,
-                               out_h,
-                               out_w,
-                               out_size=None,
-                               actual_shape=None):
-    """nearest neighbor interpolation implement in shape [N, C, H, W]"""
-    if out_size is not None:
-        out_h = out_size[0]
-        out_w = out_size[1]
-    if actual_shape is not None:
-        out_h = actual_shape[0]
-        out_w = actual_shape[1]
-    n, c, in_h, in_w = X.shape
-
-    ratio_h = ratio_w = 0.0
-    if out_h > 1:
-        ratio_h = (in_h - 1.0) / (out_h - 1.0)
-    if out_w > 1:
-        ratio_w = (in_w - 1.0) / (out_w - 1.0)
-
-    out = np.zeros((n, c, out_h, out_w))
-    for i in range(out_h):
-        in_i = int(ratio_h * i + 0.5)
-        for j in range(out_w):
-            in_j = int(ratio_w * j + 0.5)
-            out[:, :, i, j] = X[:, :, in_i, in_j]
-
-    return out.astype(X.dtype)
-
-
 def bilinear_interp_np(input, out_h, out_w, out_size=None, actual_shape=None):
     """bilinear interpolation implement in shape [N, C, H, W]"""
     if out_size is not None:
@@ -87,22 +57,16 @@ def bilinear_interp_np(input, out_h, out_w, out_size=None, actual_shape=None):
     return out.astype(input.dtype)
 
 
-INTERPOLATE_FUNCS = {
-    'bilinear': bilinear_interp_np,
-    'nearest': nearest_neighbor_interp_np,
-}
-
-
-class TestInterpolateOp(OpTest):
+class TestBilinearInterpOp(OpTest):
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
         self.init_test_case()
-        self.op_type = "interpolate"
+        self.op_type = "bilinear_interp"
         input_np = np.random.random(self.input_shape).astype("float32")
 
-        output_np = INTERPOLATE_FUNCS[self.interp_method](
-            input_np, self.out_h, self.out_w, self.out_size, self.actual_shape)
+        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
+                                       self.out_size, self.actual_shape)
         self.inputs = {'X': input_np}
         if self.out_size is not None:
             self.inputs['OutSize'] = self.out_size
@@ -129,7 +93,7 @@ class TestInterpolateOp(OpTest):
         self.out_size = np.array([3, 3]).astype("int32")
 
 
-class TestBilinearInterpCase1(TestInterpolateOp):
+class TestBilinearInterpCase1(TestBilinearInterpOp):
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [4, 1, 7, 8]
@@ -137,7 +101,7 @@ class TestBilinearInterpCase1(TestInterpolateOp):
         self.out_w = 1
 
 
-class TestBilinearInterpCase2(TestInterpolateOp):
+class TestBilinearInterpCase2(TestBilinearInterpOp):
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [3, 3, 9, 6]
@@ -145,7 +109,7 @@ class TestBilinearInterpCase2(TestInterpolateOp):
         self.out_w = 12
 
 
-class TestBilinearInterpCase3(TestInterpolateOp):
+class TestBilinearInterpCase3(TestBilinearInterpOp):
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [1, 1, 128, 64]
@@ -153,7 +117,7 @@ class TestBilinearInterpCase3(TestInterpolateOp):
         self.out_w = 128
 
 
-class TestBilinearInterpCase4(TestInterpolateOp):
+class TestBilinearInterpCase4(TestBilinearInterpOp):
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [4, 1, 7, 8]
@@ -162,7 +126,7 @@ class TestBilinearInterpCase4(TestInterpolateOp):
         self.out_size = np.array([2, 2]).astype("int32")
 
 
-class TestBilinearInterpCase5(TestInterpolateOp):
+class TestBilinearInterpCase5(TestBilinearInterpOp):
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [3, 3, 9, 6]
@@ -171,7 +135,7 @@ class TestBilinearInterpCase5(TestInterpolateOp):
         self.out_size = np.array([11, 11]).astype("int32")
 
 
-class TestBilinearInterpCase6(TestInterpolateOp):
+class TestBilinearInterpCase6(TestBilinearInterpOp):
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [1, 1, 128, 64]
@@ -180,7 +144,7 @@ class TestBilinearInterpCase6(TestInterpolateOp):
         self.out_size = np.array([65, 129]).astype("int32")
 
 
-class TestBilinearInterpActualShape(TestInterpolateOp):
+class TestBilinearInterpActualShape(TestBilinearInterpOp):
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [3, 2, 32, 16]
@@ -189,25 +153,16 @@ class TestBilinearInterpActualShape(TestInterpolateOp):
         self.out_size = np.array([66, 40]).astype("int32")
 
 
-class TestBilinearInterpBigScale(TestInterpolateOp):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [4, 4, 64, 32]
-        self.out_h = 100
-        self.out_w = 50
-        self.out_size = np.array([101, 51]).astype('int32')
-
-
-class TestInterpolateOpUint8(OpTest):
+class TestBilinearInterpOpUint8(OpTest):
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
         self.init_test_case()
-        self.op_type = "interpolate"
+        self.op_type = "bilinear_interp"
         input_np = np.random.randint(
             low=0, high=256, size=self.input_shape).astype("uint8")
-        output_np = INTERPOLATE_FUNCS[self.interp_method](
-            input_np, self.out_h, self.out_w, self.out_size, self.actual_shape)
+        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
+                                       self.out_size, self.actual_shape)
         self.inputs = {'X': input_np}
         if self.out_size is not None:
             self.inputs['OutSize'] = self.out_size
@@ -228,7 +183,7 @@ class TestInterpolateOpUint8(OpTest):
         self.out_w = 9
 
 
-class TestBilinearInterpCase1Uint8(TestInterpolateOpUint8):
+class TestBilinearInterpCase1Uint8(TestBilinearInterpOpUint8):
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [2, 3, 128, 64]
@@ -236,7 +191,7 @@ class TestBilinearInterpCase1Uint8(TestInterpolateOpUint8):
         self.out_w = 50
 
 
-class TestBilinearInterpCase2Uint8(TestInterpolateOpUint8):
+class TestBilinearInterpCase2Uint8(TestBilinearInterpOpUint8):
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [4, 1, 7, 8]
@@ -245,91 +200,5 @@ class TestBilinearInterpCase2Uint8(TestInterpolateOpUint8):
         self.out_size = np.array([6, 15]).astype("int32")
 
 
-class TestNearestNeighborInterpCase1(TestInterpolateOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [4, 1, 7, 8]
-        self.out_h = 1
-        self.out_w = 1
-
-
-class TestNearestNeighborInterpCase2(TestInterpolateOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 3, 9, 6]
-        self.out_h = 12
-        self.out_w = 12
-
-
-class TestNearestNeighborInterpCase3(TestInterpolateOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [1, 1, 128, 64]
-        self.out_h = 64
-        self.out_w = 128
-
-
-class TestNearestNeighborInterpCase4(TestInterpolateOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [4, 1, 7, 8]
-        self.out_h = 1
-        self.out_w = 1
-        self.out_size = np.array([2, 2]).astype("int32")
-
-
-class TestNearestNeighborInterpCase5(TestInterpolateOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 3, 9, 6]
-        self.out_h = 12
-        self.out_w = 12
-        self.out_size = np.array([11, 11]).astype("int32")
-
-
-class TestNearestNeighborInterpCase6(TestInterpolateOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [1, 1, 128, 64]
-        self.out_h = 64
-        self.out_w = 128
-        self.out_size = np.array([65, 129]).astype("int32")
-
-
-class TestNearestNeighborInterpActualShape(TestInterpolateOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 2, 32, 16]
-        self.out_h = 64
-        self.out_w = 32
-        self.out_size = np.array([66, 40]).astype("int32")
-
-
-class TestNearestNeighborInterpBigScale(TestInterpolateOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [4, 4, 64, 32]
-        self.out_h = 100
-        self.out_w = 50
-        self.out_size = np.array([101, 51]).astype('int32')
-
-
-class TestNearestNeighborInterpCase1Uint8(TestInterpolateOpUint8):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [2, 3, 128, 64]
-        self.out_h = 120
-        self.out_w = 50
-
-
-class TestNearestNeighborInterpCase2Uint8(TestInterpolateOpUint8):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [4, 1, 7, 8]
-        self.out_h = 5
-        self.out_w = 13
-        self.out_size = np.array([6, 15]).astype("int32")
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py b/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py
new file mode 100644
index 0000000000..242709425f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py
@@ -0,0 +1,197 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+
+
+def nearest_neighbor_interp_np(X,
+                               out_h,
+                               out_w,
+                               out_size=None,
+                               actual_shape=None):
+    """nearest neighbor interpolation implement in shape [N, C, H, W]"""
+    if out_size is not None:
+        out_h = out_size[0]
+        out_w = out_size[1]
+    if actual_shape is not None:
+        out_h = actual_shape[0]
+        out_w = actual_shape[1]
+    n, c, in_h, in_w = X.shape
+
+    ratio_h = ratio_w = 0.0
+    if out_h > 1:
+        ratio_h = (in_h - 1.0) / (out_h - 1.0)
+    if out_w > 1:
+        ratio_w = (in_w - 1.0) / (out_w - 1.0)
+
+    out = np.zeros((n, c, out_h, out_w))
+    for i in range(out_h):
+        in_i = int(ratio_h * i + 0.5)
+        for j in range(out_w):
+            in_j = int(ratio_w * j + 0.5)
+            out[:, :, i, j] = X[:, :, in_i, in_j]
+
+    return out.astype(X.dtype)
+
+
+class TestNearestInterpOp(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.init_test_case()
+        self.op_type = "nearest_interp"
+        input_np = np.random.random(self.input_shape).astype("float32")
+
+        output_np = nearest_neighbor_interp_np(input_np, self.out_h, self.out_w,
+                                               self.out_size, self.actual_shape)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        if self.actual_shape is not None:
+            self.inputs['OutSize'] = self.actual_shape
+        self.attrs = {
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method
+        }
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [2, 3, 4, 4]
+        self.out_h = 2
+        self.out_w = 2
+        self.out_size = np.array([3, 3]).astype("int32")
+
+
+class TestNearestNeighborInterpCase1(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+
+
+class TestNearestNeighborInterpCase2(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+
+
+class TestNearestNeighborInterpCase3(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [1, 1, 128, 64]
+        self.out_h = 64
+        self.out_w = 128
+
+
+class TestNearestNeighborInterpCase4(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.out_size = np.array([2, 2]).astype("int32")
+
+
+class TestNearestNeighborInterpCase5(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.out_size = np.array([11, 11]).astype("int32")
+
+
+class TestNearestNeighborInterpCase6(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [1, 1, 128, 64]
+        self.out_h = 64
+        self.out_w = 128
+        self.out_size = np.array([65, 129]).astype("int32")
+
+
+class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.out_size = np.array([66, 40]).astype("int32")
+
+
+class TestNearestInterpOpUint8(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.init_test_case()
+        self.op_type = "nearest_interp"
+        input_np = np.random.randint(
+            low=0, high=256, size=self.input_shape).astype("uint8")
+        output_np = nearest_neighbor_interp_np(input_np, self.out_h, self.out_w,
+                                               self.out_size, self.actual_shape)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        self.attrs = {
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method
+        }
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output_with_place(place=core.CPUPlace(), atol=1)
+
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [1, 3, 9, 6]
+        self.out_h = 10
+        self.out_w = 9
+
+
+class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [2, 3, 128, 64]
+        self.out_h = 120
+        self.out_w = 50
+
+
+class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 5
+        self.out_w = 13
+        self.out_size = np.array([6, 15]).astype("int32")
+
+
+if __name__ == "__main__":
+    unittest.main()

From ae84b9f0a436cfea790dbc8c0b8a0559c39c45aa Mon Sep 17 00:00:00 2001
From: dengkaipeng <dengkaipeng@baidu.com>
Date: Mon, 26 Nov 2018 16:39:32 +0800
Subject: [PATCH 111/113] set unittest serial. test=develop

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 4fa69191ad..d6d701c83f 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -81,12 +81,14 @@ list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
 list(REMOVE_ITEM TEST_OPS test_dist_transformer)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
 list(REMOVE_ITEM TEST_OPS test_image_classification_resnet)
-list(REMOVE_ITEM TEST_OPS test_interpolate_op)
+list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
+list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
 foreach(TEST_OP ${TEST_OPS})
     py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
-py_test_modules(test_interpolate_op MODULES test_interpolate_op SERIAL)
+py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL)
+py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL)
 if(WITH_DISTRIBUTE)
     py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
     set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)

From bb489d4cc94fc245811af28bff07671de246f61c Mon Sep 17 00:00:00 2001
From: dengkaipeng <dengkaipeng@baidu.com>
Date: Mon, 26 Nov 2018 17:36:48 +0800
Subject: [PATCH 112/113] add interp_method default bilinear. test=develop

---
 paddle/fluid/operators/interpolate_op.cc | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc
index 203291ed61..4d25822259 100644
--- a/paddle/fluid/operators/interpolate_op.cc
+++ b/paddle/fluid/operators/interpolate_op.cc
@@ -76,11 +76,12 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
 
     AddAttr<int>("out_h", "output height of interpolate op.");
     AddAttr<int>("out_w", "output width of interpolate op.");
-    AddAttr<std::string>(
-        "interp_method",
-        "(string), interpolation method, can be \"bilinear\" for "
-        "bilinear interpolation and \"nearest\" for nearest "
-        "neighbor interpolation.");
+    AddAttr<std::string>("interp_method",
+                         "(string, default \"bilinear\"), interpolation "
+                         "method, can be \"bilinear\" for "
+                         "bilinear interpolation and \"nearest\" for nearest "
+                         "neighbor interpolation.")
+        .SetDefault("bilinear");
     AddComment(R"DOC(
           This operator samples input X to given output shape by using specified
           interpolation method, the interpolation methods can be \"nearest\"

From 6861fb66a1c96e2162b310aa5e4cb2c31f59afad Mon Sep 17 00:00:00 2001
From: Wu Yi <typhoonzero1986@gmail.com>
Date: Tue, 27 Nov 2018 10:45:26 +0800
Subject: [PATCH 113/113] disable dist_se_resnext test for later fixes
 test=develop (#14605)

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 4fa69191ad..227a1832d5 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -93,13 +93,13 @@ if(WITH_DISTRIBUTE)
     if(NOT APPLE)
         set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
         set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
-        py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
-        set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
-        # FIXME(typhoonzero): add this back
-	#py_test_modules(test_dist_transformer MODULES test_dist_transformer)
-	#set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
+        # FIXME(typhoonzero): add these tests back
+	# py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
+	# set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
+	# py_test_modules(test_dist_transformer MODULES test_dist_transformer)
+	# set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
         # TODO(typhoonzero): make dist test parallel when fix port management issue
-        set_tests_properties(test_dist_mnist test_dist_word2vec test_dist_se_resnext test_dist_ctr test_dist_simnet_bow test_dist_save_load test_dist_text_classification test_dist_mnist_batch_merge PROPERTIES RUN_SERIAL TRUE)
+        set_tests_properties(test_dist_mnist test_dist_word2vec test_dist_ctr test_dist_simnet_bow test_dist_save_load test_dist_text_classification test_dist_mnist_batch_merge PROPERTIES RUN_SERIAL TRUE)
     endif(NOT APPLE)
     py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
 endif()