From 9d67c1fb69538faa2e74fbeca85ea685e5229a60 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 5 Nov 2018 15:13:53 +0800 Subject: [PATCH 01/50] cpu build support --- CMakeLists.txt | 6 + cmake/external/boost.cmake | 57 ++++--- cmake/external/eigen.cmake | 5 +- cmake/external/gflags.cmake | 14 +- cmake/external/glog.cmake | 9 +- cmake/external/gtest.cmake | 5 +- cmake/external/openblas.cmake | 143 ++++++++++-------- cmake/external/protobuf.cmake | 15 +- cmake/external/python.cmake | 42 +++++ cmake/external/xxhash.cmake | 61 ++++++-- cmake/external/zlib.cmake | 5 +- cmake/generic.cmake | 50 +++++- cmake/inference_lib.cmake | 28 +++- paddle/fluid/CMakeLists.txt | 9 +- .../framework/ir/attention_lstm_fuse_pass.cc | 18 +-- paddle/fluid/framework/ir/node.h | 2 +- paddle/fluid/framework/ir/pass.h | 4 +- paddle/fluid/framework/operator.cc | 5 +- paddle/fluid/inference/CMakeLists.txt | 4 + paddle/fluid/inference/analysis/helper.h | 4 + paddle/fluid/inference/api/api_impl.cc | 4 + paddle/fluid/inference/api/helper.h | 4 + paddle/fluid/operators/CMakeLists.txt | 5 +- .../fluid/operators/elementwise_op_function.h | 26 ++++ paddle/fluid/operators/math/CMakeLists.txt | 4 +- paddle/fluid/platform/init.cc | 2 + paddle/fluid/platform/nccl_helper.h | 2 + paddle/fluid/platform/variant.h | 8 + paddle/fluid/pybind/CMakeLists.txt | 8 +- paddle/fluid/pybind/pybind.cc | 30 +++- python/CMakeLists.txt | 39 +++-- python/setup.py.in | 51 ++++--- 32 files changed, 497 insertions(+), 172 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e5b2f32fba..9a895a19c4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,6 +26,11 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") if(WIN32) set(CMAKE_STATIC_LIBRARY_PREFIX lib) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") endif(WIN32) if(NOT CMAKE_CROSSCOMPILING) @@ -73,6 +78,7 @@ option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON) +option(WITH_PREBUILD_OPENBLAS "Make use of the pre-built openblas library" ON) # PY_VERSION if(NOT PY_VERSION) diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake index ada61de8eb..7c19183df4 100644 --- a/cmake/external/boost.cmake +++ b/cmake/external/boost.cmake @@ -28,34 +28,47 @@ if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL)) set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE) set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE) endif() -IF (WIN32) - MESSAGE(WARNING, "In windows, boost can not be downloaded automaticlly, please build it manually and put it at " ${THIRD_PARTY_PATH}install/boost) -else() - MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") -ENDIF(WIN32) + +MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") -set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." 
FORCE) +if (WIN32) + set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}" CACHE PATH "boost include directory." FORCE) +else(WIN32) + set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE) +endif (WIN32) set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1) include_directories(${BOOST_INCLUDE_DIR}) - -if (NOT WIN32) -ExternalProject_Add( - ${BOOST_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz - && tar zxf ${BOOST_TAR}.tar.gz - DOWNLOAD_NO_PROGRESS 1 - PREFIX ${BOOST_SOURCES_DIR} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - UPDATE_COMMAND "" -) -endif(NOT WIN32) +if (WIN32) + ExternalProject_Add( + ${BOOST_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR} + URL ${BOOST_URL} + DOWNLOAD_NO_PROGRESS 0 + PREFIX ${BOOST_SOURCES_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + UPDATE_COMMAND "" + ) +else() + ExternalProject_Add( + ${BOOST_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR} + DOWNLOAD_COMMAND "wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz + && tar zxf ${BOOST_TAR}.tar.gz" + DOWNLOAD_NO_PROGRESS 0 + PREFIX ${BOOST_SOURCES_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + UPDATE_COMMAND "" + ) +endif () if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32) set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 573ad5e5f0..2aa64a350a 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -29,10 +29,11 @@ else() ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" +# GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" + GIT_REPOSITORY "http://admin@localhost:8080/r/eigen3.git" # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen - GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c +# GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c PREFIX ${EIGEN_SOURCE_DIR} DOWNLOAD_NAME "eigen" UPDATE_COMMAND "" diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index cf58cc3976..9c6974b8f0 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -28,14 +28,20 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) ExternalProject_Add( extern_gflags ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/gflags/gflags.git" - GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a +# GIT_REPOSITORY "https://github.com/gflags/gflags.git" + GIT_REPOSITORY "http://admin@localhost:8080/r/gflags.git" +# GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE} + -DBUILD_STATIC_LIBS=ON -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DBUILD_TESTING=OFF @@ -48,8 +54,8 @@ ExternalProject_Add( IF(WIN32) IF(NOT EXISTS "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib") 
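A note on the CMAKE_ARGS lists used for gflags (and glog below): an ExternalProject is configured by a child CMake process that does not inherit the parent cache, so the /MTd and /MT runtime flags chosen in the top-level CMakeLists.txt must be forwarded per configuration, otherwise the dependency is built against a different MSVC runtime than Paddle and the final link fails. The idiom in isolation (a sketch; extern_foo and its source URL are hypothetical):

ExternalProject_Add(extern_foo
    GIT_REPOSITORY "https://example.com/foo.git"   # placeholder source
    CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
               -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
               -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG})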
add_custom_command(TARGET extern_gflags POST_BUILD - COMMAND cmake -E rename ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib - ) + COMMAND cmake -E copy ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib + ) ENDIF() ENDIF(WIN32) ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 25ef2970ac..84f8127760 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -34,19 +34,24 @@ ELSE() SET(GLOG_REPOSITORY "https://github.com/google/glog.git") SET(GLOG_TAG "v0.3.5") ENDIF() + SET(GLOG_REPOSITORY "http://admin@localhost:8080/r/glog.git") ExternalProject_Add( extern_glog ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS gflags GIT_REPOSITORY ${GLOG_REPOSITORY} - GIT_TAG ${GLOG_TAG} + # GIT_TAG ${GLOG_TAG} PREFIX ${GLOG_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE} -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib -DCMAKE_POSITION_INDEPENDENT_CODE=ON @@ -63,7 +68,7 @@ ExternalProject_Add( IF(WIN32) IF(NOT EXISTS "${GLOG_INSTALL_DIR}/lib/libglog.lib") add_custom_command(TARGET extern_glog POST_BUILD - COMMAND cmake -E rename ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib + COMMAND cmake -E copy ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib ) ENDIF() ENDIF(WIN32) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index d335298742..4f5acc92f0 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -43,8 +43,9 @@ IF(WITH_TESTING) extern_gtest ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${GTEST_DEPENDS} - GIT_REPOSITORY "https://github.com/google/googletest.git" - GIT_TAG "release-1.8.0" + # GIT_REPOSITORY "https://github.com/google/googletest.git" + GIT_REPOSITORY "http://admin@localhost:8080/r/gtest.git" +# GIT_TAG "release-1.8.0" PREFIX ${GTEST_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 755dbd610c..664422813d 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -17,12 +17,8 @@ IF(USE_EIGEN_FOR_BLAS) ENDIF(USE_EIGEN_FOR_BLAS) INCLUDE(cblas) -# IF(WIN32 AND NOT ${CBLAS_FOUND}) - - IF(NOT ${CBLAS_FOUND}) - INCLUDE(ExternalProject) SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas) @@ -34,66 +30,95 @@ IF(NOT ${CBLAS_FOUND}) CACHE FILEPATH "openblas library." 
FORCE) ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS) - IF (WIN32) - SET(CBLAS_FOUND true) - MESSAGE(WARNING, "In windows, openblas only support msvc build, please build it manually and put it at " ${CBLAS_INSTALL_DIR}) - ENDIF(WIN32) - IF (NOT WIN32) - SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") - SET(OPENBLAS_COMMIT "v0.2.20") - - IF(CMAKE_CROSSCOMPILING) - SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER}) - GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY) - SET(CROSS_SUFFIX ${CROSS_SUFFIX}/) - IF(ANDROID) - IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") - # use softfp - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0) - ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0) + IF (WITH_PREBUILD_OPENBLAS) + SET(CBLAS_FOUND true) + MESSAGE(STATUS, "Use prebuild openblas, please put it at " ${CBLAS_INSTALL_DIR}) + ELSE(WITH_PREBUILD_OPENBLAS) + SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") + SET(OPENBLAS_COMMIT "v0.2.20") + + IF(CMAKE_CROSSCOMPILING) + SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER}) + GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY) + SET(CROSS_SUFFIX ${CROSS_SUFFIX}/) + IF(ANDROID) + IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") + # use softfp + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0) + ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0) + ENDIF() + ELSEIF(IOS) + IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64") + SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") + SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64") + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX}) + ELSE() + MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. " + "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.") + ENDIF() + ELSEIF(RPI) + # use hardfp + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0) ENDIF() - ELSEIF(IOS) - IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64") - SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") - SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64") - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX}) - ELSE() - MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. 
" - "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.") + ELSE() + IF(APPLE) + SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}") + ENDIF() + SET(OPTIONAL_ARGS "") + IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$") + SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64) ENDIF() - ELSEIF(RPI) - # use hardfp - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0) - ENDIF() - ELSE() - IF(APPLE) - SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}") - ENDIF() - SET(OPTIONAL_ARGS "") - IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$") - SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64) ENDIF() - ENDIF() - SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs) - ExternalProject_Add( - extern_openblas - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git - GIT_TAG ${OPENBLAS_COMMIT} - PREFIX ${CBLAS_SOURCES_DIR} - INSTALL_DIR ${CBLAS_INSTALL_DIR} - BUILD_IN_SOURCE 1 - BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS} - INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX= - && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - ) - ELSE() - ENDIF(NOT WIN32) + IF(WIN32) + ExternalProject_Add( + extern_openblas + ${EXTERNAL_PROJECT_LOG_ARGS} + # GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git + GIT_REPOSITORY http://admin@localhost:8080/r/openblas.git + # GIT_TAG ${OPENBLAS_COMMIT} + PREFIX ${CBLAS_SOURCES_DIR} + INSTALL_DIR ${CBLAS_INSTALL_DIR} + BUILD_IN_SOURCE 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DNO_SHARED=ON + -DNO_STATIC=OFF + -DBUILD_WITHOUT_LAPACK=ON + -DUSE_THREAD=OFF + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + ) + ELSE(WIN32) + SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs) + ExternalProject_Add( + extern_openblas + ${EXTERNAL_PROJECT_LOG_ARGS} + # GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git + GIT_REPOSITORY http://admin@localhost:8080/r/openblas.git + # GIT_TAG ${OPENBLAS_COMMIT} + PREFIX ${CBLAS_SOURCES_DIR} + INSTALL_DIR ${CBLAS_INSTALL_DIR} + BUILD_IN_SOURCE 1 + BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS} + INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX= + && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + ) + ENDIF(WIN32) + ENDIF (WITH_PREBUILD_OPENBLAS) + SET(CBLAS_PROVIDER openblas) IF(WITH_C_API) INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 550b0dada8..d4c6ea7819 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -144,7 +144,6 @@ endmacro() set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf") IF (WIN32) SET(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf) - MESSAGE(WARNING, "In windows, protobuf only support msvc build, please build it manually and put 
it at " ${PROTOBUF_ROOT}) ENDIF(WIN32) if (NOT "${PROTOBUF_ROOT}" STREQUAL "") @@ -192,16 +191,24 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + "-DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG}" + "-DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" + "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" "-Dprotobuf_WITH_ZLIB=ON" "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}" ${EXTERNAL_OPTIONAL_ARGS}) SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") ENDIF() + IF(WIN32) + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64") + ENDIF() - SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") - SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") + # SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") + # SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") + SET(PROTOBUF_REPO http://admin@localhost:8080/r/protobuf.git) IF(MOBILE_INFERENCE) # The reason why the official version is not used is described in # https://github.com/PaddlePaddle/Paddle/issues/6114 diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index f17b8d46dc..a3599dd798 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -21,6 +21,48 @@ INCLUDE(python_module) FIND_PACKAGE(PythonInterp ${PY_VERSION}) FIND_PACKAGE(PythonLibs ${PY_VERSION}) +if(WIN32) + execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" +"from distutils import sysconfig as s;import sys;import struct; +print(sys.prefix); +print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION')); +" + RESULT_VARIABLE _PYTHON_SUCCESS + OUTPUT_VARIABLE _PYTHON_VALUES + ERROR_VARIABLE _PYTHON_ERROR_VALUE) + + if(NOT _PYTHON_SUCCESS MATCHES 0) + set(PYTHONLIBS_FOUND FALSE) + return() + endif() + + # Convert the process output into a list + string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES}) + string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES}) + list(GET _PYTHON_VALUES 0 PYTHON_PREFIX) + list(GET _PYTHON_VALUES 1 PYTHON_LIBRARY_SUFFIX) + + # Make sure all directory separators are '/' + string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX}) + + set(PYTHON_LIBRARY + "${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") + + # when run in a venv, PYTHON_PREFIX points to it. But the libraries remain in the + # original python installation. They may be found relative to PYTHON_INCLUDE_DIR. + if(NOT EXISTS "${PYTHON_LIBRARY}") + get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY) + set(PYTHON_LIBRARY + "${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") + endif() + + # raise an error if the python libs are still not found. + if(NOT EXISTS "${PYTHON_LIBRARY}") + message(FATAL_ERROR "Python libraries not found") + endif() + SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}") +endif(WIN32) + # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE. 
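The WIN32 block above asks the interpreter itself for sys.prefix and the LDVERSION/VERSION config variables and derives the import-library path from them, which tends to be more reliable than trusting FIND_PACKAGE(PythonLibs) alone, especially under a virtualenv (the venv case is noted in the comment above). The same interpreter-query technique in its smallest form (a sketch, not part of the patch); OUTPUT_STRIP_TRAILING_WHITESPACE spares the manual newline splitting done above:

execute_process(
    COMMAND "${PYTHON_EXECUTABLE}" -c "import sys; print(sys.prefix)"
    OUTPUT_VARIABLE PYTHON_PREFIX
    OUTPUT_STRIP_TRAILING_WHITESPACE)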
ADD_LIBRARY(python SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index c227e09719..4c2d64f627 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -14,23 +14,52 @@ ELSE() ENDIF(APPLE) ENDIF() -ExternalProject_Add( - extern_xxhash - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" - GIT_TAG "v0.6.5" - PREFIX ${XXHASH_SOURCE_DIR} - DOWNLOAD_NAME "xxhash" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_IN_SOURCE 1 - PATCH_COMMAND - BUILD_COMMAND ${BUILD_CMD} - INSTALL_COMMAND export PREFIX=${XXHASH_INSTALL_DIR}/ && make install - TEST_COMMAND "" -) +if(WIN32) + ExternalProject_Add( + extern_xxhash + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" + GIT_TAG "v0.6.5" + PREFIX ${XXHASH_SOURCE_DIR} + DOWNLOAD_NAME "xxhash" + UPDATE_COMMAND "" + BUILD_IN_SOURCE 1 + PATCH_COMMAND + CONFIGURE_COMMAND + ${CMAKE_COMMAND} ${XXHASH_SOURCE_DIR}/src/extern_xxhash/cmake_unofficial + -DCMAKE_INSTALL_PREFIX:PATH=${XXHASH_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} + -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DBUILD_XXHSUM=OFF + -DCMAKE_GENERATOR_PLATFORM=x64 + -DBUILD_SHARED_LIBS=OFF + ${OPTIONAL_CACHE_ARGS} + TEST_COMMAND "" + ) +else() + ExternalProject_Add( + extern_xxhash + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" + GIT_TAG "v0.6.5" + PREFIX ${XXHASH_SOURCE_DIR} + DOWNLOAD_NAME "xxhash" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + PATCH_COMMAND + BUILD_COMMAND ${BUILD_CMD} + INSTALL_COMMAND export PREFIX=${XXHASH_INSTALL_DIR}/ && make install + TEST_COMMAND "" + ) +endif() -set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") +if (WIN32) + set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib") +else() + set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") +endif () INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR}) add_library(xxhash STATIC IMPORTED GLOBAL) diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index c3d7323545..b65f2afbc2 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -31,8 +31,9 @@ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zl ExternalProject_Add( extern_zlib ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/madler/zlib.git" - GIT_TAG "v1.2.8" + # GIT_REPOSITORY "https://github.com/madler/zlib.git" + GIT_REPOSITORY "http://admin@localhost:8080/r/zlib.git" +# GIT_TAG "v1.2.8" PREFIX ${ZLIB_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 62227c6784..174e5b2d17 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -266,7 +266,11 @@ function(cc_library TARGET_NAME) if("${cc_library_DEPS};" MATCHES "python;") list(REMOVE_ITEM cc_library_DEPS python) add_dependencies(${TARGET_NAME} python) - target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup") + if(WIN32) + target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES}) + else() + target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup") + endif(WIN32) endif() target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) @@ -288,6 +292,50 @@ function(cc_library TARGET_NAME) endif(cc_library_SRCS) endfunction(cc_library) +# The link 
operation under windows may exceed the maximum command-line length; breaking the link command
+# into multiple link operations fixes that, e.g.
+# original:
+# lib /out:target.lib a.lib b.lib c.lib d.lib
+# after:
+# 1. lib /out:dummy_lib_1.lib a.lib b.lib
+# 2. lib /out:dummy_lib_2.lib c.lib d.lib
+# 3. lib /out:target.lib dummy_lib_1.lib dummy_lib_2.lib
+function(sep_library TARGET_NAME)
+  set(options STATIC static SHARED shared)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  set(${TARGET_NAME}_dummy_flag "")
+  if(${sep_library_STATIC})
+    set(${TARGET_NAME}_dummy_flag "STATIC")
+  elseif(${sep_library_SHARED})
+    set(${TARGET_NAME}_dummy_flag "SHARED")
+  endif()
+  cmake_parse_arguments(sep_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(dummy_index 1)
+  set(dummy_offset 1)
+  # each dummy target is composed of at most dummy_limit libraries
+  set(dummy_limit 50)
+  list(LENGTH sep_library_DEPS sep_all_len)
+  foreach(v ${sep_library_DEPS})
+    list(APPEND dummy_list ${v})
+    list(LENGTH dummy_list listlen)
+    if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${sep_all_len}))
+      message("create dummy library ${TARGET_NAME}_dummy_lib_${dummy_index} for ${TARGET_NAME}")
+      # set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy_${dummy_index}.c)
+      # file(WRITE ${dummyfile} "const char *dummy_${TARGET_NAME}_${dummy_index} = \"${dummyfile}\";")
+      # cc_library(${TARGET_NAME}_dummy_lib_${dummy_index} ${${TARGET_NAME}_dummy_flag} SRCS ${dummyfile} DEPS ${dummy_list})
+      cc_library(${TARGET_NAME}_dummy_lib_${dummy_index} ${${TARGET_NAME}_dummy_flag} DEPS ${dummy_list})
+      foreach(i ${dummy_list})
+        list(REMOVE_AT dummy_list 0)
+      endforeach()
+      list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_lib_${dummy_index})
+      MATH(EXPR dummy_index "${dummy_index}+1")
+    endif()
+    MATH(EXPR dummy_offset "${dummy_offset}+1")
+  endforeach()
+  cc_library(${TARGET_NAME} ${${TARGET_NAME}_dummy_flag} SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list})
+endfunction(sep_library)
+
 function(cc_binary TARGET_NAME)
   set(options "")
   set(oneValueArgs "")
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index efdb093a7b..8af88833db 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -31,10 +31,32 @@ function(copy TARGET)
     foreach(index RANGE ${len})
         list(GET copy_lib_SRCS ${index} src)
         list(GET copy_lib_DSTS ${index} dst)
-        add_custom_command(TARGET ${TARGET} PRE_BUILD
-            COMMAND mkdir -p "${dst}"
-            COMMAND cp -r "${src}" "${dst}"
+
+        if (WIN32)
+            # the windows cmd shell will not expand wildcards automatically;
+            # expand the header/lib file lists below and copy the files one by one.
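The ${CMAKE_COMMAND} -E launchers used below replace the mkdir -p / cp -r pair because cmd.exe provides neither, while -E subcommands behave identically on every platform. A minimal sketch of the portable per-file rule (my_target, src_file and dst are illustrative names); -E copy_if_different would additionally skip files that are already up to date:

add_custom_command(TARGET my_target PRE_BUILD
    COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
    COMMAND ${CMAKE_COMMAND} -E copy_if_different "${src_file}" "${dst}")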
+ file(GLOB header_files ${src} "*.h") + file(GLOB static_lib_files ${src} "*.lib") + file(GLOB dll_lib_files ${src} "*.dll") + set(src_files ${header_files} ${static_lib_files} ${dll_lib_files}) + + if (NOT "${src_files}" STREQUAL "") + list(REMOVE_DUPLICATES src_files) + endif() + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" + ) + foreach(src_file ${src_files}) + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}" + COMMENT "copying ${src_file} -> ${dst}") + endforeach() + else() # not windows + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" + COMMAND ${CMAKE_COMMAND} -E copy "${src_files}" "${dst}" COMMENT "copying ${src} -> ${dst}") + endif(WIN32) endforeach() endfunction() diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 7d48f00571..528d627728 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -4,11 +4,14 @@ add_subdirectory(framework) add_subdirectory(operators) add_subdirectory(string) -if (NOT WIN32) add_subdirectory(pybind) +if (NOT WIN32) add_subdirectory(recordio) endif(NOT WIN32) -# NOTE: please add subdirectory inference at last. -add_subdirectory(inference) +if(WITH_INFERENCE) + # NOTE: please add subdirectory inference at last. + add_subdirectory(inference) +endif() + add_subdirectory(train) diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index 6090f1fe76..66d81f0ec4 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -211,12 +211,12 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0, VLOG(3) << "LSTMWeight resized to " << out->dims(); float* out_data = out->mutable_data(platform::CPUPlace()); - std::array tensors( - {{W_forget_w0.data(), W_input_w0.data(), - W_output_w0.data(), W_cell_w0.data()}}); - std::array tensors1( - {{W_forget_w1.data(), W_input_w1.data(), - W_output_w1.data(), W_cell_w1.data()}}); + std::array tensors = + {W_forget_w0.data(), W_input_w0.data(), + W_output_w0.data(), W_cell_w0.data()}; + std::array tensors1 = + {W_forget_w1.data(), W_input_w1.data(), + W_output_w1.data(), W_cell_w1.data()}; for (int row = 0; row < D; row++) { for (int col = 0; col < 4; col++) { @@ -238,9 +238,9 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0, void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, const LoDTensor& B_output, const LoDTensor& B_cell, LoDTensor* out) { - std::array tensors( - {{B_forget.data(), B_input.data(), B_output.data(), - B_cell.data()}}); + std::array tensors = + {B_forget.data(), B_input.data(), B_output.data(), + B_cell.data()}; PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1); int D = B_forget.dims()[0]; diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index d6d42f5e92..2565fc2ab8 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -28,7 +28,7 @@ namespace ir { class Node { public: enum class Type { kOperation, kVariable }; - static constexpr char kControlDepVarName[] = "__control_var"; + static constexpr const char kControlDepVarName[] = "__control_var"; Type NodeType() const { return type_; } diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 9570c59cff..e1767337ab 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ 
-207,7 +207,7 @@ struct PassRegistrar : public Registrar { return 0; \ } \ static ::paddle::framework::ir::PassRegistrar \ - &__pass_tmp_registrar_##pass_type##__ __attribute__((unused)) = \ + &__pass_tmp_registrar_##pass_type##__ __UNUSED__() = \ __pass_registrar_##pass_type##__ #define USE_PASS(pass_type) \ @@ -215,7 +215,7 @@ struct PassRegistrar : public Registrar { __use_pass_itself_##pass_type, \ "USE_PASS must be called in global namespace"); \ extern int TouchPassRegistrar_##pass_type(); \ - static int use_pass_itself_##pass_type##_ __attribute__((unused)) = \ + static int use_pass_itself_##pass_type##_ __UNUSED__() = \ TouchPassRegistrar_##pass_type() } // namespace ir diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 45fc36c706..35f872ec00 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -153,11 +153,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { // The profile has a process-wide mutex, results in serious performance issue // in concurrency scenerio. Here use an `if` to fix this issue. // Please not remove the `if`, ask @Superjomn if there are any concern. +#ifndef _WIN32 if (platform::IsProfileEnabled()) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::RecordEvent record_event(Type(), pool.Get(place)); RunImpl(scope, place); - } else { + } else +#endif + { RunImpl(scope, place); } VLOG(3) << place << " " << DebugStringEx(&scope); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index dbbe8bcba6..39d3691471 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -30,7 +30,11 @@ if (WITH_GPU AND TENSORRT_FOUND) endif() # Create static library +if(WIN32) +sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) +else() cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) +endif() if(NOT APPLE) # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. 
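To make the WIN32 branch above concrete: sep_library() accepts the same SRCS/DEPS signature as cc_library(), but with the dummy_limit of 50 set in cmake/generic.cmake it folds a long dependency list into intermediate static libraries so that no single lib.exe invocation exceeds the Windows command-line length limit. A hypothetical call and the targets it would generate (all names illustrative):

# 120 dependencies -> three intermediate archives of roughly 50 deps each
sep_library(big_lib STATIC DEPS ${my_120_deps})
# produces: big_lib_dummy_lib_1, big_lib_dummy_lib_2, big_lib_dummy_lib_3,
#           then big_lib itself, linking only the three dummy libraries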
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 5151e2b69a..fe96d8604c 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -126,7 +126,11 @@ T &GetFromScope(const framework::Scope &scope, const std::string &name) { static void ExecShellCommand(const std::string &cmd, std::string *message) { char buffer[128]; +#if !defined(_WIN32) std::shared_ptr pipe(popen(cmd.c_str(), "r"), pclose); +#else + std::shared_ptr pipe(_popen(cmd.c_str(), "r"), _pclose); +#endif // _WIN32 if (!pipe) { LOG(ERROR) << "error running command: " << cmd; return; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index d06ab8f8c8..a576ab13df 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -75,6 +75,10 @@ bool NativePaddlePredictor::Init( } #endif + // windows has no support for openblas multi-thread +#ifdef _WIN32 + FLAGS_paddle_num_threads = 1; +#endif // no matter with or without MKLDNN paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index e46dc13269..83910585b7 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -15,8 +15,12 @@ #pragma once #include +#if !defined(_WIN32) #include +#else +#endif #include // NOLINT +#include #include #include #include diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 919ad96f7a..2ecbdbdbbe 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -86,7 +86,8 @@ function(op_library TARGET) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. 
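One CMake subtlety in the blacklist below: return() inside a function only leaves the current op_library() call, so a listed operator is skipped without aborting the configure, and anything that later links ${GLOB_OP_LIB} simply never sees it. The skip idiom in isolation (foo_op/bar_op are illustrative):

function(op_library TARGET)
    foreach(skipped_op "foo_op" "bar_op")
        if ("${TARGET}" STREQUAL "${skipped_op}")
            return()   # drop this operator on Windows, keep configuring
        endif()
    endforeach()
    # ... normal operator registration would continue here
endfunction()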
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op" "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op" - "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") + "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op" + "fusion_seqexpand_concat_fc_op" "attention_lstm_op" "fused_embedding_fc_lstm_op" "fc_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() @@ -301,8 +302,10 @@ op_library(flatten_op DEPS reshape_op) op_library(sequence_pad_op DEPS sequence_padding) op_library(unstack_op DEPS stack_op) op_library(fake_quantize_op DEPS memory) +if (NOT WIN32) op_library(crf_decoding_op DEPS jit_kernel) op_library(fusion_lstm_op DEPS jit_kernel) +endif(NOT WIN32) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(layer_norm_op DEPS cub) diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index 7c84a9d813..a6933a16df 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -111,6 +111,17 @@ class RowwiseTransformIterator return *this; } + RowwiseTransformIterator &operator+(int n) { + while(n-- > 0) { + ++i_; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + } + + return *this; + } + bool operator==(const RowwiseTransformIterator &rhs) const { return (ptr_ + i_) == &(*rhs); @@ -149,6 +160,21 @@ class MidWiseTransformIterator return *this; } + MidWiseTransformIterator &operator+(int n) { + while(n-- > 0) { + ++j_; + if (UNLIKELY(j_ == post_)) { + ++i_; + j_ = 0; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + } + } + + return *this; + } + bool operator==(const MidWiseTransformIterator &rhs) const { return (ptr_ + i_) == &(*rhs); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 17b675fba8..77802dd102 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -75,7 +75,9 @@ if(WITH_GPU) endif() cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) -cc_library(jit_kernel +if (NOT WIN32) +cc_library(jit_kernel SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc DEPS cpu_info cblas) cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) +endif() diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index ab91ca5345..1cc5a3d49f 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -137,7 +137,9 @@ void InitGLOG(const std::string &prog_name) { // glog will not hold the ARGV[0] inside. // Use strdup to alloc a new string. google::InitGoogleLogging(strdup(prog_name.c_str())); +#ifndef _WIN32 google::InstallFailureSignalHandler(); +#endif } } // namespace framework diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 115abb98d5..abab202c59 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#ifndef _WIN32 #pragma once #include @@ -149,3 +150,4 @@ struct NCCLContextMap { } // namespace platform } // namespace paddle +#endif \ No newline at end of file diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h index dc9fad29f2..148e1ae6eb 100644 --- a/paddle/fluid/platform/variant.h +++ b/paddle/fluid/platform/variant.h @@ -42,3 +42,11 @@ limitations under the License. */ #include #include #include + +// some platform-independent defintion +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) +#define __UNUSED__() +#define __builtin_expect(EXP, C) (EXP) +#else +#define __UNUSED__() __attribute__((unused)) +#endif \ No newline at end of file diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index e7f634c4a6..572b1a4f04 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -2,8 +2,8 @@ set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder) set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc) if(NOT WIN32) -list(APPEND PYBIND_DEPS parallel_executor profiler) -list(APPEND PYBIND_SRCS recordio.cc) + list(APPEND PYBIND_DEPS parallel_executor profiler) + list(APPEND PYBIND_SRCS recordio.cc) endif() if(WITH_PYTHON) if(WITH_AMD_GPU) @@ -21,5 +21,9 @@ if(WITH_PYTHON) endif(NOT APPLE AND NOT ANDROID AND NOT WIN32) endif(WITH_AMD_GPU) + if(WIN32) + target_link_libraries(paddle_pybind shlwapi) + endif(WIN32) + cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python) endif(WITH_PYTHON) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 5f15a29f4c..9dbb2928d3 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -21,6 +21,13 @@ limitations under the License. */ #include #include +#if defined(_WIN32) +#define NOMINMAX +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#define GOOGLE_GLOG_DLL_DECL +#include +#endif + #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/framework.pb.h" @@ -29,7 +36,9 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" +#ifndef _WIN32 #include "paddle/fluid/framework/parallel_executor.h" +#endif #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" @@ -50,7 +59,9 @@ limitations under the License. */ #include "paddle/fluid/string/to_string.h" #ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif #include "paddle/fluid/platform/cuda_profiler.h" #include "paddle/fluid/platform/gpu_info.h" #endif @@ -340,22 +351,25 @@ All parameter, weight, gradient are variables in Paddle. 
.def("get_lod_tensor_array", [](Variable &self) { return self.GetMutable(); }, py::return_value_policy::reference) -#ifdef PADDLE_WITH_CUDA - .def("get_communicator", +#if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32)) + .def("get_communicator", [](Variable &self) -> platform::Communicator * { return self.GetMutable(); }, py::return_value_policy::reference) -#endif .def("get_reader", [](Variable &self) -> framework::ReaderHolder * { PADDLE_ENFORCE(self.IsType()); return self.GetMutable(); }, - py::return_value_policy::reference); + py::return_value_policy::reference) +#endif +; +#if !defined(_WIN32) py::class_(m, "Reader", "") .def("reset", &framework::ReaderHolder::ResetAll); +#endif using LoDTensorBlockingQueue = ::paddle::operators::reader::LoDTensorBlockingQueue; @@ -480,7 +494,7 @@ All parameter, weight, gradient are variables in Paddle. #endif });; // clang-format on -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32)) py::class_(m, "Communicator").def(py::init<>()); #endif py::class_(m, "CUDAPlace") @@ -617,11 +631,14 @@ All parameter, weight, gradient are variables in Paddle. #ifdef PADDLE_WITH_CUDA m.def("get_cuda_device_count", platform::GetCUDADeviceCount); +#ifndef _WIN32 m.def("nvprof_init", platform::CudaProfilerInit); m.def("nvprof_start", platform::CudaProfilerStart); m.def("nvprof_stop", platform::CudaProfilerStop); #endif +#endif +#ifndef _WIN32 py::enum_(m, "ProfilerState", py::arithmetic()) .value("kDisabled", platform::ProfilerState::kDisabled) .value("kCPU", platform::ProfilerState::kCPU) @@ -642,6 +659,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("disable_profiler", platform::DisableProfiler); m.def("is_profiler_enabled", platform::IsProfileEnabled); m.def("reset_profiler", platform::ResetProfiler); +#endif py::class_> pass(m, "Pass"); pass.def(py::init()) @@ -670,6 +688,7 @@ All parameter, weight, gradient are variables in Paddle. .def("remove_pass", [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); +#ifndef _WIN32 // -- python binds for parallel executor. py::class_ pe(m, "ParallelExecutor"); py::class_ exec_strategy(pe, "ExecutionStrategy", R"DOC( @@ -864,6 +883,7 @@ All parameter, weight, gradient are variables in Paddle. 
}); BindRecordIOWriter(&m); +#endif return m.ptr(); } } // namespace pybind diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 0d29f2ad20..6994d47ff6 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -46,22 +46,39 @@ endif() configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py) -set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so) +IF(WIN32) + # Python would use the .pyd by default under Windows series platform + set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.pyd) +ELSE() + set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so) +ENDIF() add_custom_command(OUTPUT ${FLUID_CORE} COMMAND cmake -E copy $ ${FLUID_CORE} DEPENDS paddle_pybind) add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE}) - -add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp - COMMAND touch stub.cc - COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python - COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ - COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel - COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp - COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python - COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib.* ${PADDLE_PYTHON_BUILD_DIR}/lib-python - DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) +IF(WIN32) + add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp +# COMMAND ${CMAKE_COMMAND} -E touch stub.cc + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ + COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel + COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp + COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python +# COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python + DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) +ELSE(WIN32) + add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp + COMMAND touch stub.cc + COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python + COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ + COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel + COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp + COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python + DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) +ENDIF() set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS}) if(NOT WITH_FLUID_ONLY) diff --git a/python/setup.py.in b/python/setup.py.in index b376be0ea3..9dad434893 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -9,7 +9,7 @@ 
class BinaryDistribution(Distribution): RC = 0 - +ext_name = '.dll' if os.name == 'nt' else '.so' def git_commit(): try: @@ -136,10 +136,13 @@ if '${WITH_FLUID_ONLY}'== 'OFF': '${PADDLE_BINARY_DIR}/paddle/legacy/pserver/paddle_pserver_main', '${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] -package_data={'paddle.fluid': ['core.so']} +package_data={'paddle.fluid': ['core' + (ext_name if os.name != 'nt' else '.pyd')]} +if os.name == 'nt': + package_data['paddle.fluid'] += ['openblas' + ext_name] + if '${WITH_FLUID_ONLY}'== 'OFF': - package_data['paddle.v2.master']=['libpaddle_master.so'] - package_data['py_paddle']=['*.py','_swig_paddle.so'] + package_data['paddle.v2.master']=['libpaddle_master' + ext_name] + package_data['py_paddle']=['*.py','_swig_paddle' + + ext_name] package_dir={ '': '${PADDLE_BINARY_DIR}/python', @@ -153,13 +156,15 @@ if '${WITH_FLUID_ONLY}'== 'OFF': package_dir['py_paddle']='${PADDLE_BINARY_DIR}/python/py_paddle' # put all thirdparty libraries in paddle.libs -package_data['paddle.libs']=['libwarpctc.so'] libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' -shutil.copy('${WARPCTC_LIBRARIES}', libs_path) +if os.name != 'nt': + package_data['paddle.libs']= [] + package_data['paddle.libs']=['libwarpctc' + ext_name] + shutil.copy('${WARPCTC_LIBRARIES}', libs_path) if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_LIB}', libs_path) shutil.copy('${MKLML_IOMP_LIB}', libs_path) - package_data['paddle.libs']+=['libmklml_intel.so','libiomp5.so'] + package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name] if '${CMAKE_BUILD_TYPE}' == 'Release': # only change rpath in Release mode. if '${WITH_MKLDNN}' == 'ON': @@ -183,21 +188,29 @@ package_dir['paddle.libs']=libs_path # core.so is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries. # This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213 if '${CMAKE_BUILD_TYPE}' == 'Release': - # only change rpath in Release mode, since in Debug mode, core.so is too large to be changed. - if "@APPLE@" == "1": - command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" - else: - command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" - if os.system(command) != 0: - raise Exception("patch core.so failed, command: %s" % command) - if '${WITH_FLUID_ONLY}'== 'OFF': - # change rpath of _swig_paddle.so. + if os.name != 'nt': + # only change rpath in Release mode, since in Debug mode, core.so is too large to be changed. if "@APPLE@" == "1": - command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" + command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core" + ext_name else: - command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" + command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core" + ext_name if os.system(command) != 0: - raise Exception("patch _swig_paddle.so failed, command: %s" % command) + raise Exception("patch core.so failed, command: %s" % command) + if '${WITH_FLUID_ONLY}'== 'OFF': + # change rpath of _swig_paddle.so. 
+ if "@APPLE@" == "1": + command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle" + ext_name + else: + command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle" + ext_name + if os.system(command) != 0: + raise Exception("patch _swig_paddle.so failed, command: %s" % command) + +if os.name == 'nt': + # fix the path separator under windows + fix_package_dir = {} + for k, v in package_dir.items(): + fix_package_dir[k] = v.replace('/', '\\') + package_dir = fix_package_dir setup(name='${PACKAGE_NAME}', version='${PADDLE_VERSION}', From 71d7980f69ff09ab10ef55b8667ba26067d1c033 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 5 Nov 2018 21:06:57 +0800 Subject: [PATCH 02/50] fix build issue 1 --- paddle/fluid/CMakeLists.txt | 6 ++--- paddle/fluid/framework/garbage_collector.h | 2 +- paddle/fluid/inference/CMakeLists.txt | 25 +++++++++++++------ .../fluid/inference/analysis/CMakeLists.txt | 4 +++ .../detection/roi_perspective_transform_op.cu | 8 ++++-- .../fluid/operators/math/sequence_pooling.cu | 5 ++++ .../fluid/platform/stream_callback_manager.h | 2 +- paddle/fluid/pybind/CMakeLists.txt | 2 +- python/CMakeLists.txt | 8 +++--- python/requirements.txt | 2 +- python/setup.py.in | 4 ++- 11 files changed, 45 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 528d627728..abadda3adb 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -9,9 +9,7 @@ if (NOT WIN32) add_subdirectory(recordio) endif(NOT WIN32) -if(WITH_INFERENCE) - # NOTE: please add subdirectory inference at last. - add_subdirectory(inference) -endif() +# NOTE: please add subdirectory inference at last. 
+add_subdirectory(inference) add_subdirectory(train) diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index b403252c97..818b3334ea 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -29,7 +29,7 @@ template class GarbageCollector { public: GarbageCollector(const platform::Place &place, size_t max_memory_size) - : max_memory_size_(std::max(max_memory_size, static_cast(1))) { + : max_memory_size_((std::max)(max_memory_size, static_cast(1))) { garbages_.reset(new std::deque()); dev_ctx_ = platform::DeviceContextPool::Instance().Get(place); } diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 39d3691471..921bca77e9 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -15,7 +15,11 @@ cc_library(paddle_fluid_api get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) # paddle_fluid_origin exclude inference api interface -cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) +if(WIN32) + sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) +else(WIN32) + cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) +endif(WIN32) add_subdirectory(api) @@ -31,10 +35,10 @@ endif() # Create static library if(WIN32) -sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) -else() -cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) -endif() + sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) +else(WIN32) + cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) +endif(WIN32) if(NOT APPLE) # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. @@ -43,11 +47,16 @@ if(NOT APPLE) endif() # Create shared library -cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} - DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array) +if(WIN32) + sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} + DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array) +else(WIN32) + cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} + DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array) +endif() set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) -if(NOT APPLE) +if(NOT APPLE AND NOT WIN32) # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac. 
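Two MSVC quirks meet in the hunks above and below: windows.h defines min and max as macros, which is why garbage_collector.h now writes (std::max)(...), where the extra parentheses suppress macro expansion; and the MSVC linker understands no --version-script, so the flag below is now limited to builds that are neither APPLE nor WIN32. The macro clash can also be switched off project-wide; a sketch of that alternative (the patch instead defines NOMINMAX locally in pybind.cc):

if(WIN32)
    add_definitions(-DNOMINMAX)   # stop windows.h from defining min()/max()
endif()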
set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map") set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index d4d2fd4634..10b97e992e 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -20,6 +20,10 @@ cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid) +if(WIN32) + target_link_libraries(inference_analyzer shlwapi) +endif(WIN32) + function (inference_analysis_test TARGET) if(WITH_TESTING) set(options "") diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index c82930cc49..862d664d42 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -15,6 +15,10 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using paddle::platform::float16; namespace paddle { namespace operators { @@ -31,12 +35,12 @@ namespace operators { template __device__ bool GT_E(T a, T b) { - return (a > b) || fabs(a - b) < 1e-4; + return (a > b) || Eigen::numext::abs(a - b) < 1e-4; } template __device__ bool LT_E(T a, T b) { - return (a < b) || fabs(a - b) < 1e-4; + return (a < b) || Eigen::numext::abs(a - b) < 1e-4; } template diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index 0015fafbc8..e468cd23e8 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -21,7 +21,12 @@ namespace paddle { namespace operators { namespace math { +#if defined(__FLT_MAX__) #define FLT_MAX __FLT_MAX__ +#else +#include +#include +#endif template struct MaxPoolFunctor { diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 6c984065aa..5f10137dcf 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -18,8 +18,8 @@ #include #include #include -#include "ThreadPool.h" #include "paddle/fluid/platform/enforce.h" +#include "third_party/threadpool/src/extern_threadpool/ThreadPool.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 572b1a4f04..a4baa37c32 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -4,7 +4,7 @@ set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc) if(NOT WIN32) list(APPEND PYBIND_DEPS parallel_executor profiler) list(APPEND PYBIND_SRCS recordio.cc) -endif() +endif(NOT WIN32) if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 6994d47ff6..391094b5b2 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -60,13 +60,13 @@ add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE}) IF(WIN32) add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp # COMMAND ${CMAKE_COMMAND} -E touch stub.cc - COMMAND ${CMAKE_COMMAND} -E 
make_directory ${PADDLE_BINARY_DIR}/python/paddle - COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle - COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python -# COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_PYTHON_BUILD_DIR}/libs ${PADDLE_PYTHON_BUILD_DIR}/lib-python DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) ELSE(WIN32) add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp diff --git a/python/requirements.txt b/python/requirements.txt index 84cf440397..7a24dd519a 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,7 +1,7 @@ requests==2.9.2 numpy>=1.12,<=1.14 #TODO:change to ">=1.12" when numpy fix bug in 1.15 and higher version protobuf==3.1 -recordio>=0.1.0 +recordio>=0.1.0; sys_platform != 'win32' matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib rarfile scipy>=0.19.0 diff --git a/python/setup.py.in b/python/setup.py.in index 9dad434893..c442055208 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -205,19 +205,21 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': if os.system(command) != 0: raise Exception("patch _swig_paddle.so failed, command: %s" % command) +ext_modules = [Extension('_foo', ['stub.cc'])] if os.name == 'nt': # fix the path separator under windows fix_package_dir = {} for k, v in package_dir.items(): fix_package_dir[k] = v.replace('/', '\\') package_dir = fix_package_dir + ext_modules = [] setup(name='${PACKAGE_NAME}', version='${PADDLE_VERSION}', description='Parallel Distributed Deep Learning', install_requires=setup_requires, packages=packages, - ext_modules=[Extension('_foo', ['stub.cc'])], + ext_modules=ext_modules, package_data=package_data, package_dir=package_dir, scripts=paddle_bins From 1f12ba61927c292993af066dd5930e613734ba52 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 6 Nov 2018 14:38:54 +0800 Subject: [PATCH 03/50] gpu support, fix build issue: 1. Non utf-8 characters within comments of OPs may lead to protobuf fail to parse_from_string 2. comment out some ops which not supported on windows 3. 
cuda libs may not be correctly linked to target on windows --- cmake/cuda.cmake | 3 + paddle/fluid/inference/CMakeLists.txt | 10 + .../fluid/operators/pad_constant_like_op.cc | 2 +- paddle/fluid/operators/roi_pool_op.cc | 2 +- paddle/fluid/operators/unpool_op.cc | 4 +- paddle/fluid/pybind/CMakeLists.txt | 4 + python/CMakeLists.txt | 5 +- python/paddle/fluid/__init__.py | 21 +- python/paddle/fluid/contrib/inferencer.py | 4 +- python/paddle/fluid/contrib/trainer.py | 3 +- python/paddle/fluid/framework.py | 2 +- python/paddle/fluid/layers/io.py | 124 +++--- python/paddle/fluid/layers/nn.py | 363 +++++++++--------- python/paddle/fluid/layers/ops.py | 34 +- python/setup.py.in | 3 +- 15 files changed, 311 insertions(+), 273 deletions(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index f507bb41a1..1cc882cce7 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -157,6 +157,9 @@ list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) if(NOT WITH_DSO) # TODO(panyx0718): CUPTI only allows DSO? list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY}) + if(WIN32) + set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY}) + endif(WIN32) endif(NOT WITH_DSO) # setting nvcc arch flags diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 921bca77e9..c8a950fce0 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -13,10 +13,14 @@ cc_library(paddle_fluid_api DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) +get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) # paddle_fluid_origin exclude inference api interface if(WIN32) sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) + if(WITH_GPU AND NOT WITH_DSO) + target_link_libraries(paddle_fluid_origin ${cuda_modules}) + endif(WITH_GPU AND NOT WITH_DSO) else(WIN32) cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) endif(WIN32) @@ -36,6 +40,9 @@ endif() # Create static library if(WIN32) sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) + if(WITH_GPU AND NOT WITH_DSO) + target_link_libraries(paddle_fluid ${cuda_modules}) + endif(WITH_GPU AND NOT WITH_DSO) else(WIN32) cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) endif(WIN32) @@ -50,6 +57,9 @@ endif() if(WIN32) sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array) + if(WITH_GPU AND NOT WITH_DSO) + target_link_libraries(paddle_fluid_shared ${cuda_modules}) + endif(WITH_GPU AND NOT WITH_DSO) else(WIN32) cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array) diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index 37646c7b4c..685ebc3937 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -74,7 +74,7 @@ PadConstantLikeOp Operator. Pad input(Y) with a pad_value, the number of values padded to the edges of each axis is specified by the difference of the shape of X and Y. -((0, shape_x_0 - shape_y_0), … (0, shape_x_n - shape_y_n)) unique pad widths for +((0, shape_x_0 - shape_y_0), ...
(0, shape_x_n - shape_y_n)) unique pad widths for each axis. The input should be a k-D tensor(k > 0 and k < 7). As an example: diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index 8e29761ec2..043ea680d1 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -122,7 +122,7 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor), " "Argmaxes corresponding to indices in X used " "for gradient computation. Only output " - "if arg “is_test” is false.") + "if arg \"is_test\" is false.") .AsIntermediate(); AddAttr("spatial_scale", "(float, default 1.0), " diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc index 1d441b43b1..6d2ccb38f6 100644 --- a/paddle/fluid/operators/unpool_op.cc +++ b/paddle/fluid/operators/unpool_op.cc @@ -57,8 +57,8 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { Input shape is: $(N, C_{in}, H_{in}, W_{in})$, Output shape is: $(N, C_{out}, H_{out}, W_{out})$, where $$ -H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\ -W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1] +H_{out} = (H_{in}-1) * strides[0] - 2 * paddings[0] + ksize[0] \\ +W_{out} = (W_{in}-1) * strides[1] - 2 * paddings[1] + ksize[1] $$ Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf )DOC"); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index a4baa37c32..6afa53cd36 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -22,6 +22,10 @@ if(WITH_PYTHON) endif(WITH_AMD_GPU) if(WIN32) + if(WITH_GPU AND NOT WITH_DSO) + get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) + target_link_libraries(paddle_pybind ${cuda_modules}) + endif(WITH_GPU AND NOT WITH_DSO) target_link_libraries(paddle_pybind shlwapi) endif(WIN32) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 391094b5b2..879d4d6bf9 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -61,12 +61,13 @@ IF(WIN32) add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp # COMMAND ${CMAKE_COMMAND} -E touch stub.cc COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/libs COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp - COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python - COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_PYTHON_BUILD_DIR}/libs ${PADDLE_PYTHON_BUILD_DIR}/lib-python +# COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/libs +# COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_PYTHON_BUILD_DIR}/libs ${PADDLE_PYTHON_BUILD_DIR}/libs DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) ELSE(WIN32) add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 737c8be814..70c1f95899 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -13,6 +13,7 @@ # 
limitations under the License. from __future__ import print_function +import os # import all class inside framework into fluid module from . import framework from .framework import * @@ -43,16 +44,17 @@ from .lod_tensor import create_lod_tensor, create_random_int_lodtensor from . import clip from . import profiler from . import unique_name -from . import recordio_writer -from . import parallel_executor -from .parallel_executor import * +if os.name != 'nt': + from . import recordio_writer + from . import parallel_executor + from .parallel_executor import * from paddle.fluid.layers.math_op_patch import monkey_patch_variable Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + \ trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ - parallel_executor.__all__ + lod_tensor.__all__ + [ + lod_tensor.__all__ + [ 'io', 'initializer', 'layers', @@ -78,7 +80,8 @@ __all__ = framework.__all__ + executor.__all__ + \ 'recordio_writer', 'Scope', ] - +if os.name != 'nt': + __all__ += parallel_executor.__all__ def __bootstrap__(): """ @@ -110,12 +113,16 @@ def __bootstrap__(): os.environ['OMP_NUM_THREADS'] = str(num_threads) read_env_flags = [ - 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', + 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', - 'dist_threadpool_size', 'cpu_deterministic', 'eager_delete_tensor_gb', + 'dist_threadpool_size', 'eager_delete_tensor_gb', 'reader_queue_speed_test_mode' ] + if os.name != 'nt': + read_env_flags.append('warpctc_dir') + read_env_flags.append('cpu_deterministic') + if core.is_compiled_with_dist(): read_env_flags.append('rpc_deadline') read_env_flags.append('rpc_server_profile_period') diff --git a/python/paddle/fluid/contrib/inferencer.py b/python/paddle/fluid/contrib/inferencer.py index b8d5f4ffea..b966ae01d0 100644 --- a/python/paddle/fluid/contrib/inferencer.py +++ b/python/paddle/fluid/contrib/inferencer.py @@ -15,13 +15,15 @@ from __future__ import print_function import contextlib +import os from .. import core from .. import executor from .. import framework from .. import io -from .. import parallel_executor +if os.name != 'nt': + from .. import parallel_executor from .. import unique_name from .trainer import check_and_get_place diff --git a/python/paddle/fluid/contrib/trainer.py b/python/paddle/fluid/contrib/trainer.py index 8569e486f9..096821a5ba 100644 --- a/python/paddle/fluid/contrib/trainer.py +++ b/python/paddle/fluid/contrib/trainer.py @@ -28,7 +28,8 @@ from .. import framework from .. import io # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module from .. import optimizer as opt_module -from .. import parallel_executor +if os.name != 'nt': + from .. 
import parallel_executor from ..transpiler import distribute_transpiler __all__ = [ diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index fd03dff386..0282ffec16 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -536,7 +536,7 @@ class Operator(object): OP_WITHOUT_KERNEL_SET = { 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', - 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', + 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', 'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id' } diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 95e13669ad..e2d304dc86 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -15,6 +15,7 @@ from __future__ import print_function import contextlib import multiprocessing +import os import six import threading @@ -344,70 +345,71 @@ def _copy_reader_create_op_(block, op): return new_op -@templatedoc(op_type='create_recordio_file_reader') -def open_recordio_file(filename, - shapes, - lod_levels, - dtypes, - pass_num=1, - for_parallel=True): - """ - ${comment} - - Args: - filename(${filename_type}): ${filename_comment}. - shapes(list): List of tuples which declaring data shapes. - lod_levels(${lod_levels_type}): ${lod_levels_comment}. - dtypes(list): List of strs which declaring data type. - pass_num(int): Number of passes to run. - for_parallel(Bool): Set it as True if you are going to run - subsequent operators in parallel. - - Returns: - ${out_comment}. - - Examples: - - >>> import paddle.fluid as fluid - >>> reader = fluid.layers.io.open_recordio_file( - >>> filename='./data.recordio', - >>> shapes=[(3,224,224), (1)], - >>> lod_levels=[0, 0], - >>> dtypes=['float32', 'int64']) - >>> # Via the reader, we can use 'read_file' layer to get data: - >>> image, label = fluid.layers.io.read_file(reader) - """ - dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] - shape_concat = [] - ranks = [] - - for shape in shapes: - shape_concat.extend(shape) - ranks.append(len(shape)) - - var_name = unique_name('open_recordio_file') - - startup_blk = default_startup_program().current_block() - startup_var = startup_blk.create_var(name=var_name) - startup_blk.append_op( - type='create_recordio_file_reader', - outputs={'Out': [startup_var]}, - attrs={ - 'shape_concat': shape_concat, - 'lod_levels': lod_levels, - 'filename': filename, - 'ranks': ranks - }) +if os.name != 'nt': + @templatedoc(op_type='create_recordio_file_reader') + def open_recordio_file(filename, + shapes, + lod_levels, + dtypes, + pass_num=1, + for_parallel=True): + """ + ${comment} + + Args: + filename(${filename_type}): ${filename_comment}. + shapes(list): List of tuples which declare data shapes. + lod_levels(${lod_levels_type}): ${lod_levels_comment}. + dtypes(list): List of strs which declare data type. + pass_num(int): Number of passes to run. + for_parallel(Bool): Set it as True if you are going to run + subsequent operators in parallel. + + Returns: + ${out_comment}.
+ + Examples: + + >>> import paddle.fluid as fluid + >>> reader = fluid.layers.io.open_recordio_file( + >>> filename='./data.recordio', + >>> shapes=[(3,224,224), (1)], + >>> lod_levels=[0, 0], + >>> dtypes=['float32', 'int64']) + >>> # Via the reader, we can use 'read_file' layer to get data: + >>> image, label = fluid.layers.io.read_file(reader) + """ + dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] + shape_concat = [] + ranks = [] + + for shape in shapes: + shape_concat.extend(shape) + ranks.append(len(shape)) + + var_name = unique_name('open_recordio_file') + + startup_blk = default_startup_program().current_block() + startup_var = startup_blk.create_var(name=var_name) + startup_blk.append_op( + type='create_recordio_file_reader', + outputs={'Out': [startup_var]}, + attrs={ + 'shape_concat': shape_concat, + 'lod_levels': lod_levels, + 'filename': filename, + 'ranks': ranks + }) - startup_var.desc.set_dtypes(dtypes) - startup_var.persistable = True - main_prog_var = _copy_reader_var_(default_main_program().current_block(), - startup_var) + startup_var.desc.set_dtypes(dtypes) + startup_var.persistable = True + main_prog_var = _copy_reader_var_(default_main_program().current_block(), + startup_var) - if pass_num > 1: - main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) + if pass_num > 1: + main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) - return monkey_patch_reader_methods(main_prog_var) + return monkey_patch_reader_methods(main_prog_var) def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 110e6d5ab2..d201357e6f 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -18,6 +18,7 @@ All layers just related to the neural network. from __future__ import print_function import numpy as np +import os from ..layer_helper import LayerHelper from ..initializer import Normal, Constant from ..framework import Variable, OpProtoHolder @@ -31,12 +32,10 @@ from functools import reduce __all__ = [ 'fc', 'embedding', - 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru', 'gru_unit', 'linear_chain_crf', - 'crf_decoding', 'cos_sim', 'cross_entropy', 'square_error_cost', @@ -95,7 +94,6 @@ __all__ = [ 'pad', 'pad_constant_like', 'label_smooth', - 'roi_pool', 'roi_align', 'dice_loss', 'image_resize', @@ -160,6 +158,10 @@ __all__ = [ 'log_loss', 'add_position_encoding', ] +if os.name != 'nt': + __all__.append('dynamic_lstm') + __all__.append('crf_decoding') + __all__.append('roi_pool') def fc(input, @@ -334,126 +336,127 @@ def embedding(input, return tmp -@templatedoc(op_type="lstm") -def dynamic_lstm(input, - size, - h_0=None, - c_0=None, - param_attr=None, - bias_attr=None, - use_peepholes=True, - is_reverse=False, - gate_activation='sigmoid', - cell_activation='tanh', - candidate_activation='tanh', - dtype='float32', - name=None): - """ - ${comment} - - Args: - input (Variable): ${input_comment} - size (int): 4 * hidden size. - h_0(Variable): The initial hidden state is an optional input, default is zero. - This is a tensor with shape (N x D), where N is the - batch size and D is the hidden size. - c_0(Variable): The initial cell state is an optional input, default is zero. - This is a tensor with shape (N x D), where N is the - batch size. `h_0` and `c_0` can be NULL but only at the same time. - param_attr(ParamAttr|None): The parameter attribute for the learnable - hidden-hidden weights. 
- - - Weights = {:math:`W_{ch}, W_{ih}, \ - W_{fh}, W_{oh}`} - - The shape is (D x 4D), where D is the hidden - size. - - If it is set to None or one attribute of ParamAttr, - dynamic_lstm will create ParamAttr as param_attr. - If the Initializer of the param_attr is not set, the - parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|None): The bias attribute for the learnable bias - weights, which contains two parts, input-hidden - bias weights and peephole connections weights if - setting `use_peepholes` to `True`. - - 1. `use_peepholes = False` - - Biases = {:math:`b_c, b_i, b_f, b_o`}. - - The shape is (1 x 4D). - 2. `use_peepholes = True` - - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ - W_{fc}, W_{oc}`}. - - The shape is (1 x 7D). - - If it is set to None or one attribute of ParamAttr, - dynamic_lstm will create ParamAttr as bias_attr. - If the Initializer of the bias_attr is not set, - the bias is initialized zero. Default: None. - use_peepholes (bool): ${use_peepholes_comment} - is_reverse (bool): ${is_reverse_comment} - gate_activation (str): ${gate_activation_comment} - cell_activation (str): ${cell_activation_comment} - candidate_activation (str): ${candidate_activation_comment} - dtype (str): Data type. Choices = ["float32", "float64"], default "float32". - name (str|None): A name for this layer(optional). If set None, the layer - will be named automatically. - - Returns: - tuple: The hidden state, and cell state of LSTM. The shape of both \ - is (T x D), and lod is the same with the `input`. - - Examples: - .. code-block:: python - - hidden_dim = 512 - forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, - bias_attr=False) - forward, _ = fluid.layers.dynamic_lstm( - input=forward_proj, size=hidden_dim * 4, use_peepholes=False) - """ - assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp." - helper = LayerHelper('lstm', **locals()) - size = size // 4 - weight = helper.create_parameter( - attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype) - bias_size = [1, 7 * size] - if not use_peepholes: - bias_size[1] = 4 * size - bias = helper.create_parameter( - attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) +if os.name != 'nt': + @templatedoc(op_type="lstm") + def dynamic_lstm(input, + size, + h_0=None, + c_0=None, + param_attr=None, + bias_attr=None, + use_peepholes=True, + is_reverse=False, + gate_activation='sigmoid', + cell_activation='tanh', + candidate_activation='tanh', + dtype='float32', + name=None): + """ + ${comment} + + Args: + input (Variable): ${input_comment} + size (int): 4 * hidden size. + h_0(Variable): The initial hidden state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size and D is the hidden size. + c_0(Variable): The initial cell state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size. `h_0` and `c_0` can be NULL but only at the same time. + param_attr(ParamAttr|None): The parameter attribute for the learnable + hidden-hidden weights. + + - Weights = {:math:`W_{ch}, W_{ih}, \ + W_{fh}, W_{oh}`} + - The shape is (D x 4D), where D is the hidden + size. + + If it is set to None or one attribute of ParamAttr, + dynamic_lstm will create ParamAttr as param_attr. + If the Initializer of the param_attr is not set, the + parameter is initialized with Xavier. Default: None. 
+ bias_attr (ParamAttr|None): The bias attribute for the learnable bias + weights, which contains two parts, input-hidden + bias weights and peephole connections weights if + setting `use_peepholes` to `True`. + + 1. `use_peepholes = False` + - Biases = {:math:`b_c, b_i, b_f, b_o`}. + - The shape is (1 x 4D). + 2. `use_peepholes = True` + - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ + W_{fc}, W_{oc}`}. + - The shape is (1 x 7D). + + If it is set to None or one attribute of ParamAttr, + dynamic_lstm will create ParamAttr as bias_attr. + If the Initializer of the bias_attr is not set, + the bias is initialized zero. Default: None. + use_peepholes (bool): ${use_peepholes_comment} + is_reverse (bool): ${is_reverse_comment} + gate_activation (str): ${gate_activation_comment} + cell_activation (str): ${cell_activation_comment} + candidate_activation (str): ${candidate_activation_comment} + dtype (str): Data type. Choices = ["float32", "float64"], default "float32". + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + tuple: The hidden state, and cell state of LSTM. The shape of both \ + is (T x D), and lod is the same with the `input`. + + Examples: + .. code-block:: python + + hidden_dim = 512 + forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, + bias_attr=False) + forward, _ = fluid.layers.dynamic_lstm( + input=forward_proj, size=hidden_dim * 4, use_peepholes=False) + """ + assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp." + helper = LayerHelper('lstm', **locals()) + size = size // 4 + weight = helper.create_parameter( + attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype) + bias_size = [1, 7 * size] + if not use_peepholes: + bias_size[1] = 4 * size + bias = helper.create_parameter( + attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) - hidden = helper.create_variable_for_type_inference(dtype) - cell = helper.create_variable_for_type_inference(dtype) - batch_gate = helper.create_variable_for_type_inference(dtype) - batch_cell_pre_act = helper.create_variable_for_type_inference(dtype) - inputs = {'Input': input, 'Weight': weight, 'Bias': bias} - batch_size = input.shape[0] - if h_0: - assert h_0.shape == (batch_size, size), \ - 'The shape of h0 should be (batch_size, %d)' % size - inputs['H0'] = h_0 - if c_0: - assert c_0.shape == (batch_size, size), \ - 'The shape of c0 should be (batch_size, %d)' % size - inputs['C0'] = c_0 + hidden = helper.create_variable_for_type_inference(dtype) + cell = helper.create_variable_for_type_inference(dtype) + batch_gate = helper.create_variable_for_type_inference(dtype) + batch_cell_pre_act = helper.create_variable_for_type_inference(dtype) + inputs = {'Input': input, 'Weight': weight, 'Bias': bias} + batch_size = input.shape[0] + if h_0: + assert h_0.shape == (batch_size, size), \ + 'The shape of h0 should be (batch_size, %d)' % size + inputs['H0'] = h_0 + if c_0: + assert c_0.shape == (batch_size, size), \ + 'The shape of c0 should be (batch_size, %d)' % size + inputs['C0'] = c_0 - helper.append_op( - type='lstm', - inputs=inputs, - outputs={ - 'Hidden': hidden, - 'Cell': cell, - 'BatchGate': batch_gate, - 'BatchCellPreAct': batch_cell_pre_act - }, - attrs={ - 'use_peepholes': use_peepholes, - 'is_reverse': is_reverse, - 'gate_activation': gate_activation, - 'cell_activation': cell_activation, - 'candidate_activation': candidate_activation - }) - return hidden, cell + helper.append_op( + type='lstm', + 
inputs=inputs, + outputs={ + 'Hidden': hidden, + 'Cell': cell, + 'BatchGate': batch_gate, + 'BatchCellPreAct': batch_cell_pre_act + }, + attrs={ + 'use_peepholes': use_peepholes, + 'is_reverse': is_reverse, + 'gate_activation': gate_activation, + 'cell_activation': cell_activation, + 'candidate_activation': candidate_activation + }) + return hidden, cell def dynamic_lstmp(input, @@ -923,39 +926,40 @@ def linear_chain_crf(input, label, param_attr=None): return log_likelihood -@templatedoc() -def crf_decoding(input, param_attr, label=None): - """ - ${comment} +if os.name != 'nt': + @templatedoc() + def crf_decoding(input, param_attr, label=None): + """ + ${comment} - Args: - input(${emission_type}): ${emission_comment} + Args: + input(${emission_type}): ${emission_comment} - param_attr(ParamAttr): The parameter attribute for training. + param_attr(ParamAttr): The parameter attribute for training. - label(${label_type}): ${label_comment} + label(${label_type}): ${label_comment} - Returns: - Variable: ${viterbi_path_comment} + Returns: + Variable: ${viterbi_path_comment} - Examples: - .. code-block:: python + Examples: + .. code-block:: python - crf_decode = layers.crf_decoding( - input=hidden, param_attr=ParamAttr(name="crfw")) - """ - helper = LayerHelper('crf_decoding', **locals()) - transition = helper.get_parameter(param_attr.name) - viterbi_path = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) - helper.append_op( - type='crf_decoding', - inputs={"Emission": [input], - "Transition": transition, - "Label": label}, - outputs={"ViterbiPath": [viterbi_path]}) + crf_decode = layers.crf_decoding( + input=hidden, param_attr=ParamAttr(name="crfw")) + """ + helper = LayerHelper('crf_decoding', **locals()) + transition = helper.get_parameter(param_attr.name) + viterbi_path = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + helper.append_op( + type='crf_decoding', + inputs={"Emission": [input], + "Transition": transition, + "Label": label}, + outputs={"ViterbiPath": [viterbi_path]}) - return viterbi_path + return viterbi_path @templatedoc() @@ -5443,42 +5447,43 @@ def label_smooth(label, return smooth_label -@templatedoc() -def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): - """ - ${comment} +if os.name != 'nt': + @templatedoc() + def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): + """ + ${comment} - Args: - input (Variable): ${x_comment} - rois (Variable): ROIs (Regions of Interest) to pool over. - pooled_height (integer): ${pooled_height_comment} Default: 1 - pooled_width (integer): ${pooled_width_comment} Default: 1 - spatial_scale (float): ${spatial_scale_comment} Default: 1.0 + Args: + input (Variable): ${x_comment} + rois (Variable): ROIs (Regions of Interest) to pool over. + pooled_height (integer): ${pooled_height_comment} Default: 1 + pooled_width (integer): ${pooled_width_comment} Default: 1 + spatial_scale (float): ${spatial_scale_comment} Default: 1.0 - Returns: - Variable: ${out_comment}. + Returns: + Variable: ${out_comment}. - Examples: - .. code-block:: python + Examples: + .. 
code-block:: python - pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0) - """ - helper = LayerHelper('roi_pool', **locals()) - dtype = helper.input_dtype() - pool_out = helper.create_variable_for_type_inference(dtype) - argmaxes = helper.create_variable_for_type_inference(dtype='int32') - helper.append_op( - type="roi_pool", - inputs={"X": input, - "ROIs": rois}, - outputs={"Out": pool_out, - "Argmax": argmaxes}, - attrs={ - "pooled_height": pooled_height, - "pooled_width": pooled_width, - "spatial_scale": spatial_scale - }) - return pool_out + pool_out = fluid.layers.roi_pool(input=x, rois=rois, pooled_height=7, pooled_width=7, spatial_scale=1.0) + """ + helper = LayerHelper('roi_pool', **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + argmaxes = helper.create_variable_for_type_inference(dtype='int32') + helper.append_op( + type="roi_pool", + inputs={"X": input, + "ROIs": rois}, + outputs={"Out": pool_out, + "Argmax": argmaxes}, + attrs={ + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "spatial_scale": spatial_scale + }) + return pool_out @templatedoc() diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 1ff40a26f2..df52b7042f 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import print_function +import os from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr from .. import core from ..framework import convert_np_dtype_to_dtype_ @@ -99,27 +100,28 @@ Examples: >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3) """ -__all__ += ['cumsum'] +if os.name != 'nt': + __all__ += ['cumsum'] -_cum_sum_ = generate_layer_fn('cumsum') + _cum_sum_ = generate_layer_fn('cumsum') -def cumsum(x, axis=None, exclusive=None, reverse=None): - locals_var = locals().keys() - kwargs = dict() - for name in locals_var: - val = locals()[name] - if val is not None: - kwargs[name] = val - return _cum_sum_(**kwargs) - + def cumsum(x, axis=None, exclusive=None, reverse=None): + locals_var = locals().keys() + kwargs = dict() + for name in locals_var: + val = locals()[name] + if val is not None: + kwargs[name] = val + return _cum_sum_(**kwargs) -cumsum.__doc__ = _cum_sum_.__doc__ + """ -Examples: >>> data = fluid.layers.data(name="input", shape=[32, 784]) >>> result = fluid.layers.cumsum(data, axis=0) -""" + cumsum.__doc__ = _cum_sum_.__doc__ + """ + Examples: + >>> data = fluid.layers.data(name="input", shape=[32, 784]) + >>> result = fluid.layers.cumsum(data, axis=0) + """ __all__ += ['thresholded_relu'] diff --git a/python/setup.py.in b/python/setup.py.in index c442055208..ce65d0003f 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -180,7 +180,8 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': package_data['paddle.libs']+=['libmkldnn.so.0'] shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) # remove unused paddle/libs/__init__.py -os.remove(libs_path+'/__init__.py') +if os.path.isfile(libs_path+'/__init__.py'): + os.remove(libs_path+'/__init__.py') package_dir['paddle.libs']=libs_path # change rpath of core.so, add $ORIGIN/../libs/ to it.
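A note on the CUDA_MODULES handoff used in the patch above: cmake/cuda.cmake records the statically linked CUDA libraries in a global property, and the inference and pybind targets later read it back with get_property before calling target_link_libraries. A minimal self-contained sketch of that pattern follows (illustrative only; the DEMO_* names and the module list are placeholders, not code from this series):

    # Producer: store a list in a GLOBAL property. Unlike an ordinary CMake
    # variable, a global property is visible from every directory of the build.
    set_property(GLOBAL PROPERTY DEMO_CUDA_MODULES "cudnn;cublas;curand")

    # Consumer: read the list back and link it only on the guarded path,
    # mirroring the if(WITH_GPU AND NOT WITH_DSO) branches in the patch.
    get_property(demo_cuda_modules GLOBAL PROPERTY DEMO_CUDA_MODULES)
    message(STATUS "modules to link: ${demo_cuda_modules}")

Because the handoff does not depend on variable scope, the list set in cmake/cuda.cmake stays readable from paddle/fluid/inference and paddle/fluid/pybind without introducing a cache variable.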
From a37918c31f740b5b6a886bb472ce52f8d4e65659 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 6 Nov 2018 17:28:12 +0800 Subject: [PATCH 04/50] fix python package issue --- paddle/fluid/framework/CMakeLists.txt | 19 ++++++++++----- python/CMakeLists.txt | 33 ++++++++++++++------------- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 8442911406..2bab3a15b1 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -140,16 +140,23 @@ cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) -if (NOT WIN32) py_proto_compile(framework_py_proto SRCS framework.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) -add_custom_command(TARGET framework_py_proto POST_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto - COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ - COMMENT "Copy generated python proto into directory paddle/fluid/proto." - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +if (NOT WIN32) + add_custom_command(TARGET framework_py_proto POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto + COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ + COMMENT "Copy generated python proto into directory paddle/fluid/proto." + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +else(NOT WIN32) + string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/") + add_custom_command(TARGET framework_py_proto POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto + COMMAND copy /Y *.py ${proto_dstpath} + COMMENT "Copy generated python proto into directory paddle/fluid/proto." 
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif(NOT WIN32) cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 879d4d6bf9..139176b0d6 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -45,30 +45,31 @@ endif() configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py) - IF(WIN32) # Python would use the .pyd by default under Windows series platform - set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.pyd) + set(FLUID_DST_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/) + get_filename_component(openblas_refpath ${CBLAS_LIBRARIES} DIRECTORY) + set(FLUID_CORE ${FLUID_DST_DIR}/core.pyd) + add_custom_command(OUTPUT ${FLUID_CORE} + COMMAND cmake -E copy $ ${FLUID_CORE} + COMMAND cmake -E copy ${openblas_refpath}/openblas.dll ${FLUID_DST_DIR} + DEPENDS paddle_pybind) ELSE() set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so) + add_custom_command(OUTPUT ${FLUID_CORE} + COMMAND cmake -E copy $ ${FLUID_CORE} + DEPENDS paddle_pybind) ENDIF() -add_custom_command(OUTPUT ${FLUID_CORE} - COMMAND cmake -E copy $ ${FLUID_CORE} - DEPENDS paddle_pybind) add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE}) IF(WIN32) - add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp -# COMMAND ${CMAKE_COMMAND} -E touch stub.cc - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/libs - COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle - COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ - COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel - COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp -# COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/libs -# COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_PYTHON_BUILD_DIR}/libs ${PADDLE_PYTHON_BUILD_DIR}/libs - DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) + add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle/ + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ + COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel + COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp + COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python + DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) ELSE(WIN32) add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND touch stub.cc From 77892124fb7babd0b1651092958878555764bdbf Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 7 Nov 2018 14:22:07 +0800 Subject: [PATCH 05/50] online configuration --- cmake/external/eigen.cmake | 5 ++--- cmake/external/gflags.cmake | 5 ++--- cmake/external/glog.cmake | 1 - cmake/external/gtest.cmake | 5 ++--- cmake/external/openblas.cmake | 10 ++++------ cmake/external/protobuf.cmake | 5 ++--- cmake/external/zlib.cmake | 5 ++--- 7 files changed, 14 insertions(+), 22 deletions(-) 
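The hunks below point each ExternalProject back at its public upstream instead of the hand-edited local mirror. If switching between a mirror and GitHub ever needs to stay a configure-time choice rather than a source edit, one possible arrangement is a cache variable per dependency; a sketch under that assumption follows (GFLAGS_REPOSITORY is hypothetical, not a variable this series defines):

    # Sketch: override at configure time with, for example,
    #   -DGFLAGS_REPOSITORY=http://admin@localhost:8080/r/gflags.git
    set(GFLAGS_REPOSITORY "https://github.com/gflags/gflags.git"
        CACHE STRING "gflags git repository; point this at a mirror if needed")

    include(ExternalProject)
    ExternalProject_Add(extern_gflags
        GIT_REPOSITORY ${GFLAGS_REPOSITORY}
        GIT_TAG        77592648e3f3be87d6c7123eb81cbad75f9aef5a
        PREFIX         ${GFLAGS_SOURCES_DIR})

The same shape would apply to the eigen3, glog, gtest, openblas, protobuf, and zlib entries touched by this patch.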
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 2aa64a350a..573ad5e5f0 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -29,11 +29,10 @@ else() ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} -# GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" - GIT_REPOSITORY "http://admin@localhost:8080/r/eigen3.git" + GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen -# GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c + GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c PREFIX ${EIGEN_SOURCE_DIR} DOWNLOAD_NAME "eigen" UPDATE_COMMAND "" diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 9c6974b8f0..7a0369b9df 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -28,9 +28,8 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) ExternalProject_Add( extern_gflags ${EXTERNAL_PROJECT_LOG_ARGS} -# GIT_REPOSITORY "https://github.com/gflags/gflags.git" - GIT_REPOSITORY "http://admin@localhost:8080/r/gflags.git" -# GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a + GIT_REPOSITORY "https://github.com/gflags/gflags.git" + GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 84f8127760..ac2f2be83b 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -34,7 +34,6 @@ ELSE() SET(GLOG_REPOSITORY "https://github.com/google/glog.git") SET(GLOG_TAG "v0.3.5") ENDIF() - SET(GLOG_REPOSITORY "http://admin@localhost:8080/r/glog.git") ExternalProject_Add( extern_glog diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 4f5acc92f0..d335298742 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -43,9 +43,8 @@ IF(WITH_TESTING) extern_gtest ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${GTEST_DEPENDS} - # GIT_REPOSITORY "https://github.com/google/googletest.git" - GIT_REPOSITORY "http://admin@localhost:8080/r/gtest.git" -# GIT_TAG "release-1.8.0" + GIT_REPOSITORY "https://github.com/google/googletest.git" + GIT_TAG "release-1.8.0" PREFIX ${GTEST_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 664422813d..2b46936c18 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -76,9 +76,8 @@ IF(NOT ${CBLAS_FOUND}) ExternalProject_Add( extern_openblas ${EXTERNAL_PROJECT_LOG_ARGS} - # GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git - GIT_REPOSITORY http://admin@localhost:8080/r/openblas.git - # GIT_TAG ${OPENBLAS_COMMIT} + GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git + GIT_TAG ${OPENBLAS_COMMIT} PREFIX ${CBLAS_SOURCES_DIR} INSTALL_DIR ${CBLAS_INSTALL_DIR} BUILD_IN_SOURCE 1 @@ -104,9 +103,8 @@ IF(NOT ${CBLAS_FOUND}) ExternalProject_Add( extern_openblas ${EXTERNAL_PROJECT_LOG_ARGS} - # GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git - GIT_REPOSITORY http://admin@localhost:8080/r/openblas.git - # GIT_TAG ${OPENBLAS_COMMIT} + GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git + GIT_TAG ${OPENBLAS_COMMIT} PREFIX ${CBLAS_SOURCES_DIR} INSTALL_DIR ${CBLAS_INSTALL_DIR} BUILD_IN_SOURCE 1 diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 
d4c6ea7819..bb1fcf356f 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -206,9 +206,8 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64") ENDIF() - # SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") - # SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") - SET(PROTOBUF_REPO http://admin@localhost:8080/r/protobuf.git) + SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") + SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") IF(MOBILE_INFERENCE) # The reason why the official version is not used is described in # https://github.com/PaddlePaddle/Paddle/issues/6114 diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index b65f2afbc2..c3d7323545 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -31,9 +31,8 @@ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zl ExternalProject_Add( extern_zlib ${EXTERNAL_PROJECT_LOG_ARGS} - # GIT_REPOSITORY "https://github.com/madler/zlib.git" - GIT_REPOSITORY "http://admin@localhost:8080/r/zlib.git" -# GIT_TAG "v1.2.8" + GIT_REPOSITORY "https://github.com/madler/zlib.git" + GIT_TAG "v1.2.8" PREFIX ${ZLIB_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} From 130cdda65b1b148c5f11a4dac1ee8848658a8587 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 7 Nov 2018 19:14:25 +0800 Subject: [PATCH 06/50] add gpu debug mode --- cmake/cuda.cmake | 8 ++++++-- cmake/cudnn.cmake | 7 ++++++- cmake/external/eigen.cmake | 7 ++++--- cmake/external/gflags.cmake | 2 +- cmake/external/glog.cmake | 2 +- cmake/external/gtest.cmake | 2 +- cmake/external/openblas.cmake | 4 ++-- cmake/external/protobuf.cmake | 2 +- cmake/external/zlib.cmake | 2 +- 9 files changed, 23 insertions(+), 13 deletions(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 1cc882cce7..45a4b13288 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -167,8 +167,12 @@ select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}") -# Set C++11 support -set(CUDA_PROPAGATE_HOST_FLAGS OFF) +if (WIN32) + set(CUDA_PROPAGATE_HOST_FLAGS ON) +else (WIN32) + # Set C++11 support + set(CUDA_PROPAGATE_HOST_FLAGS OFF) +endif (WIN32) # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. 
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index cd51533926..09bec347db 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -2,7 +2,12 @@ if(NOT WITH_GPU) return() endif() -set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT") +if(WIN32) + set(CUDNN_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) +else(WIN32) + set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT") +endif(WIN32) + find_path(CUDNN_INCLUDE_DIR cudnn.h PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE} diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 2aa64a350a..98079678ae 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -16,8 +16,9 @@ if(WITH_AMD_GPU) ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" - GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9 +# GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" +# GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9 + GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/eigen3.git" PREFIX ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" @@ -30,7 +31,7 @@ else() extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} # GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" - GIT_REPOSITORY "http://admin@localhost:8080/r/eigen3.git" + GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/eigen3.git" # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen # GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 9c6974b8f0..73ea80ea45 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -29,7 +29,7 @@ ExternalProject_Add( extern_gflags ${EXTERNAL_PROJECT_LOG_ARGS} # GIT_REPOSITORY "https://github.com/gflags/gflags.git" - GIT_REPOSITORY "http://admin@localhost:8080/r/gflags.git" + GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/gflags.git" # GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 84f8127760..5184a83bdd 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -34,7 +34,7 @@ ELSE() SET(GLOG_REPOSITORY "https://github.com/google/glog.git") SET(GLOG_TAG "v0.3.5") ENDIF() - SET(GLOG_REPOSITORY "http://admin@localhost:8080/r/glog.git") + SET(GLOG_REPOSITORY "http://admin@172.20.90.14:8080/r/glog.git") ExternalProject_Add( extern_glog diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 4f5acc92f0..da539d52bd 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -44,7 +44,7 @@ IF(WITH_TESTING) ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${GTEST_DEPENDS} # GIT_REPOSITORY "https://github.com/google/googletest.git" - GIT_REPOSITORY "http://admin@localhost:8080/r/gtest.git" + GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/gtest.git" # GIT_TAG "release-1.8.0" PREFIX ${GTEST_SOURCES_DIR} UPDATE_COMMAND "" diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 664422813d..c6dace512e 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -77,7 +77,7 @@ IF(NOT ${CBLAS_FOUND}) extern_openblas ${EXTERNAL_PROJECT_LOG_ARGS} # GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git - GIT_REPOSITORY http://admin@localhost:8080/r/openblas.git + GIT_REPOSITORY http://admin@172.20.90.14:8080/r/openblas.git # GIT_TAG 
${OPENBLAS_COMMIT} PREFIX ${CBLAS_SOURCES_DIR} INSTALL_DIR ${CBLAS_INSTALL_DIR} @@ -105,7 +105,7 @@ IF(NOT ${CBLAS_FOUND}) extern_openblas ${EXTERNAL_PROJECT_LOG_ARGS} # GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git - GIT_REPOSITORY http://admin@localhost:8080/r/openblas.git + GIT_REPOSITORY http://admin@172.20.90.14:8080/r/openblas.git # GIT_TAG ${OPENBLAS_COMMIT} PREFIX ${CBLAS_SOURCES_DIR} INSTALL_DIR ${CBLAS_INSTALL_DIR} diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index d4c6ea7819..43b69e72dd 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -208,7 +208,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) # SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") # SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") - SET(PROTOBUF_REPO http://admin@localhost:8080/r/protobuf.git) + SET(PROTOBUF_REPO http://admin@172.20.90.14:8080/r/protobuf.git) IF(MOBILE_INFERENCE) # The reason why the official version is not used is described in # https://github.com/PaddlePaddle/Paddle/issues/6114 diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index b65f2afbc2..456f26385c 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -32,7 +32,7 @@ ExternalProject_Add( extern_zlib ${EXTERNAL_PROJECT_LOG_ARGS} # GIT_REPOSITORY "https://github.com/madler/zlib.git" - GIT_REPOSITORY "http://admin@localhost:8080/r/zlib.git" + GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/zlib.git" # GIT_TAG "v1.2.8" PREFIX ${ZLIB_SOURCES_DIR} UPDATE_COMMAND "" From e3f7be959d69486263f25d82ab56aec771629610 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 7 Nov 2018 20:47:35 +0800 Subject: [PATCH 07/50] fix the debug flag for nvcc --- cmake/cuda.cmake | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 45a4b13288..cdcbb79792 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -167,12 +167,8 @@ select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}") -if (WIN32) - set(CUDA_PROPAGATE_HOST_FLAGS ON) -else (WIN32) - # Set C++11 support - set(CUDA_PROPAGATE_HOST_FLAGS OFF) -endif (WIN32) +# Set C++11 support +set(CUDA_PROPAGATE_HOST_FLAGS OFF) # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. @@ -203,10 +199,12 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) endif() else(NOT WIN32) -if(CMAKE_BUILD_TYPE STREQUAL "Release") +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND CUDA_NVCC_FLAGS "-g -lineinfo -G") +elseif(CMAKE_BUILD_TYPE STREQUAL "Release") list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") else() - message(FATAL "Windows only support Release build now. Please set visual studio build type to Release, x64 build.") + message(FATAL_ERROR "Windows only supports Release or Debug build now.
Please set visual studio build type to Release/Debug, x64 build.") endif() endif(NOT WIN32) From 3c439feadc1bfae9f1daa203bd19b22be1fb37fe Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 7 Nov 2018 21:13:19 +0800 Subject: [PATCH 08/50] remove the duplicate flag --- cmake/cuda.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index cdcbb79792..964d5fd45b 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -200,7 +200,7 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") endif() else(NOT WIN32) if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS "-g -lineinfo -G") + list(APPEND CUDA_NVCC_FLAGS "-g -G") elseif(CMAKE_BUILD_TYPE STREQUAL "Release") list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") else() From 52d3cd964e330662b5e63542b544a5dd20b9b193 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 8 Nov 2018 15:19:22 +0800 Subject: [PATCH 09/50] fix --- cmake/external/glog.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index ac2f2be83b..2a34c96ab9 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -40,7 +40,7 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS gflags GIT_REPOSITORY ${GLOG_REPOSITORY} - # GIT_TAG ${GLOG_TAG} + GIT_TAG ${GLOG_TAG} PREFIX ${GLOG_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} From 45125ba538f7f647cff99aed34d9e18b4d7584f5 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 8 Nov 2018 18:02:44 +0800 Subject: [PATCH 10/50] fix share library issue --- CMakeLists.txt | 2 +- cmake/generic.cmake | 17 ++++++----------- paddle/fluid/inference/CMakeLists.txt | 1 + paddle/fluid/inference/api/api_impl.cc | 4 ---- paddle/fluid/platform/device_context.h | 2 +- paddle/fluid/platform/init.cc | 8 ++++++++ 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index eabacbf7cc..cd8c54e24e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,7 +77,7 @@ option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON) -option(WITH_PREBUILD_OPENBLAS "Make use of the pre-built openblas library" ON) +option(WITH_PREBUILD_OPENBLAS "Make use of the pre-built openblas library" ${WIN32}) # PY_VERSION if(NOT PY_VERSION) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 174e5b2d17..e21f89c7c5 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -304,12 +304,6 @@ function(sep_library TARGET_NAME) set(options STATIC static SHARED shared) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) - set(${TARGET_NAME}_dummy_flag "") - if(${sep_library_STATIC}) - set(${TARGET_NAME}_dummy_flag "STATIC") - elseif(${sep_library_SHARED}) - set(${TARGET_NAME}_dummy_flag "SHARED") - endif() cmake_parse_arguments(sep_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(dummy_index 1) set(dummy_offset 1) @@ -321,10 +315,7 @@ function(sep_library TARGET_NAME) list(LENGTH dummy_list listlen ) if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${sep_all_len})) message("create dummy library ${TARGET_NAME}_dummy_lib_${dummy_index} for ${TARGET_NAME}") - # set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy_${dummy_index}.c) - # file(WRITE 
${dummyfile} "const char *dummy_${TARGET_NAME}_${dummy_index} = \"${dummyfile}\";") - # cc_library(${TARGET_NAME}_dummy_lib_${dummy_index} ${${TARGET_NAME}_dummy_flag} SRCS ${dummyfile} DEPS ${dummy_list}) + cc_library(${TARGET_NAME}_dummy_lib_${dummy_index} STATIC DEPS ${dummy_list}) foreach(i ${dummy_list}) list(REMOVE_AT dummy_list 0) endforeach() @@ -333,7 +324,11 @@ function(sep_library TARGET_NAME) endif() MATH(EXPR dummy_offset "${dummy_offset}+1") endforeach() - cc_library(${TARGET_NAME} ${${TARGET_NAME}_dummy_flag} SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list}) + if(${sep_library_SHARED}) + cc_library(${TARGET_NAME} SHARED SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list}) + else(${sep_library_SHARED}) + cc_library(${TARGET_NAME} STATIC SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list}) + endif(${sep_library_SHARED}) endfunction(sep_library) function(cc_binary TARGET_NAME) diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index da1711fc18..f09a434950 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -60,6 +60,7 @@ endif() if(WIN32) sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array) + target_link_libraries(paddle_fluid_shared shlwapi) if(WITH_GPU AND NOT WITH_DSO) target_link_libraries(paddle_fluid_shared ${cuda_modules}) endif(WITH_GPU AND NOT WITH_DSO) diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index a576ab13df..d06ab8f8c8 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -75,10 +75,6 @@ bool NativePaddlePredictor::Init( } #endif - // windows has no support for openblas multi-thread -#ifdef _WIN32 - FLAGS_paddle_num_threads = 1; -#endif // no matter with or without MKLDNN paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index df248f9bb1..892984dc3e 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -32,7 +32,7 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/stream_callback_manager.h" #endif #include "unsupported/Eigen/CXX11/Tensor" diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index f61abfc43d..092585ed2a 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -112,6 +112,14 @@ void InitDevices(bool init_p2p, const std::vector devices) { } places.emplace_back(platform::CPUPlace()); platform::DeviceContextPool::Init(places); + +// windows has no support for openblas multi-thread +#ifdef _WIN32 + if (FLAGS_paddle_num_threads > 1) { + FLAGS_paddle_num_threads = 1; + } +#endif + #ifndef PADDLE_WITH_MKLDNN platform::SetNumThreads(FLAGS_paddle_num_threads); #endif From dcfab11193444ec08525c06a92d77038dd276d7a Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 8 Nov 2018 21:26:38 +0800 Subject: [PATCH 11/50] merge from develop --- paddle/fluid/framework/details/multi_devices_graph_pass.cc | 6 ++++++ paddle/fluid/operators/math/CMakeLists.txt | 5 +++++ paddle/fluid/operators/math/selected_rows_functor.h | 2 ++ paddle/scripts/paddle_build.sh | 7 ++++++- 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 67d29a42d7..3dc177a8cb 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -648,6 +648,12 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID( const ir::Graph &graph, const std::string &varname, const std::unordered_map &sharded_var_device) const { auto got = sharded_var_device.find(varname); + if (got == sharded_var_device.end()) { + auto pos = varname.find(framework::kNewGradSuffix); + if (pos != std::string::npos) { + got = sharded_var_device.find(varname.substr(0, pos)); + } + } return got == sharded_var_device.end() ? -1 : got->second; } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 7f9a55acf8..c87d4241d0 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -57,6 +57,9 @@ math_library(sequence_padding) math_library(sequence_pooling DEPS math_function) math_library(sequence_scale) math_library(softmax DEPS math_function) +if (NOT WIN32) + math_library(matrix_bit_code) +endif (NOT WIN32) math_library(unpooling) math_library(vol2col) @@ -80,4 +83,6 @@ if (NOT WIN32) list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) list(APPEND JIT_KERNEL_DEPS xbyak) endif() + cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) + cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) endif (NOT WIN32) diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index b24ffb57ac..6d146d39d6 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -64,6 +64,8 @@ struct SelectedRowsSumTo { framework::SelectedRows* input2); }; +// FIXME: The result of SelectedRowsAddToTensor may be non-deterministic, +// because it uses CudaAtomicAdd.
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index d7676f89ab..2f5fef36c4 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -367,7 +367,12 @@ function run_test() {
     Running unit tests ...
     ========================================
 EOF
-    ctest --output-on-failure
+    if [ ${TESTING_DEBUG_MODE:-OFF} == "ON" ] ; then
+        ctest -V
+    else
+        ctest --output-on-failure
+    fi
+
+    # make install should also be tested when running the unit tests
     make install -j `nproc`
     pip install ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl

From 41b423d41be8cb6893df2549ce473f8542a40c15 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Thu, 8 Nov 2018 21:29:12 +0800
Subject: [PATCH 12/50] remove duplicate

---
 paddle/fluid/operators/math/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index c87d4241d0..cc3cc9787a 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -76,7 +76,6 @@ endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
 if (NOT WIN32)
-    math_library(matrix_bit_code)
     set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc)
     set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
     if(WITH_XBYAK)

From 9fa96147c2fd704541d66be8d6c0c35f4f575f94 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Thu, 8 Nov 2018 21:47:33 +0800
Subject: [PATCH 13/50] fix the typo

---
 cmake/external/gflags.cmake   | 4 ++--
 cmake/external/glog.cmake     | 4 ++--
 cmake/external/openblas.cmake | 4 ++--
 cmake/external/protobuf.cmake | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index 5ed78bcf75..9f4c5d29b2 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -38,8 +38,8 @@ ExternalProject_Add(
     -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
     -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
     -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-    -DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG}
-    -DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE}
+    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
     -DBUILD_STATIC_LIBS=ON
     -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index 2a34c96ab9..8cd0455c16 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -49,8 +49,8 @@ ExternalProject_Add(
     -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
     -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
     -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-    -DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG}
-    -DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE}
+    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
     -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
     -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
     -DCMAKE_POSITION_INDEPENDENT_CODE=ON
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 844863a425..38e23d8ccf 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -84,8 +84,8 @@ IF(NOT ${CBLAS_FOUND})
       CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                  -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                  -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-
-DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index bb1fcf356f..e1c6df87c1 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -192,8 +192,8 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG}" - "-DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE}" + "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" + "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}" "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" From ded93a354a467322bb59156954629e55ae2b7504 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 8 Nov 2018 21:49:15 +0800 Subject: [PATCH 14/50] fix the typo --- cmake/external/gflags.cmake | 4 ++-- cmake/external/glog.cmake | 4 ++-- cmake/external/openblas.cmake | 4 ++-- cmake/external/protobuf.cmake | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index dcbff05d0d..dbd6c3b75e 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -39,8 +39,8 @@ ExternalProject_Add( -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} -DBUILD_STATIC_LIBS=ON -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 5184a83bdd..a3f3c6adf3 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -50,8 +50,8 @@ ExternalProject_Add( -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib -DCMAKE_POSITION_INDEPENDENT_CODE=ON diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 8c172437d4..829641fb97 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -85,8 +85,8 @@ IF(NOT ${CBLAS_FOUND}) CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 43b69e72dd..75ffabca7c 100644 --- 
a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -192,8 +192,8 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
         "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
         "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
         "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
-        "-DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG}"
-        "-DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE}"
+        "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}"
+        "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}"
         "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
         "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}"
         "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}"

From e8519a6e89a8dac1e0e7a9bc8a8c180042648fac Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Fri, 9 Nov 2018 10:40:45 +0800
Subject: [PATCH 15/50] use the ext_name instead of specific extension name

---
 python/setup.py.in | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/python/setup.py.in b/python/setup.py.in
index 2a311d319b..48db2420b4 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -184,27 +184,27 @@ if os.path.isfile(libs_path+'/__init__.py'):
     os.remove(libs_path+'/__init__.py')
 package_dir['paddle.libs']=libs_path

-# change rpath of core.so, add $ORIGIN/../libs/ to it.
-# The reason is that libwarpctc.so, libiomp5.so etc are in paddle.libs, and
-# core.so is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries.
+# change rpath of core.{ext_name}, add $ORIGIN/../libs/ to it.
+# The reason is that libwarpctc.{ext_name}, libiomp5.{ext_name} etc are in paddle.libs, and
+# core.{ext_name} is in paddle.fluid, thus paddle/fluid/../libs will point to the above libraries.
 # This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213
 if '${CMAKE_BUILD_TYPE}' == 'Release':
     if os.name != 'nt':
-        # only change rpath in Release mode, since in Debug mode, core.so is too large to be changed.
+        # only change rpath in Release mode, since in Debug mode, core.{ext_name} is too large to be changed.
         if "@APPLE@" == "1":
             command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core" + ext_name
         else:
             command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core" + ext_name
         if os.system(command) != 0:
-            raise Exception("patch core.so failed, command: %s" % command)
+            raise Exception("patch core.%s failed, command: %s" % (ext_name, command))
     if '${WITH_FLUID_ONLY}'== 'OFF':
-        # change rpath of _swig_paddle.so.
+        # change rpath of _swig_paddle.{ext_name}.
if "@APPLE@" == "1": command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle" + ext_name else: command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle" + ext_name if os.system(command) != 0: - raise Exception("patch _swig_paddle.so failed, command: %s" % command) + raise Exception("patch _swig_paddle.%s failed, command: %s" % (ext_name, command)) ext_modules = [Extension('_foo', ['stub.cc'])] if os.name == 'nt': From 8ae010b72b3cb72d4803fdaa1d953d7d03e1b63c Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 9 Nov 2018 12:26:18 +0800 Subject: [PATCH 16/50] fix the typo --- python/paddle/fluid/framework.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 0282ffec16..fd03dff386 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -536,7 +536,7 @@ class Operator(object): OP_WITHOUT_KERNEL_SET = { 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', - 'listen_and_serv', 'parallel_do', 'save_combine', 'loadload_combine', + 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', 'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id' } From d08334011a155f00bc1160adf2e400a00f7c66c3 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 9 Nov 2018 14:09:27 +0800 Subject: [PATCH 17/50] fix merge issue --- paddle/fluid/framework/ir/pass.h | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index a9199414ba..e1767337ab 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -217,28 +217,6 @@ struct PassRegistrar : public Registrar { extern int TouchPassRegistrar_##pass_type(); \ static int use_pass_itself_##pass_type##_ __UNUSED__() = \ TouchPassRegistrar_##pass_type() -#else -#define REGISTER_PASS(pass_type, pass_class) \ - STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ - __reg_pass__##pass_type, \ - "REGISTER_PASS must be called in global namespace"); \ - static ::paddle::framework::ir::PassRegistrar \ - __pass_registrar_##pass_type##__(#pass_type); \ - int TouchPassRegistrar_##pass_type() { \ - __pass_registrar_##pass_type##__.Touch(); \ - return 0; \ - } \ - static ::paddle::framework::ir::PassRegistrar UNUSED( \ - &__pass_tmp_registrar_##pass_type##__) = \ - __pass_registrar_##pass_type##__ - -#define USE_PASS(pass_type) \ - STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ - __use_pass_itself_##pass_type, \ - "USE_PASS must be called in global namespace"); \ - extern int TouchPassRegistrar_##pass_type(); \ - static int UNUSED(use_pass_itself_##pass_type##_) = \ - TouchPassRegistrar_##pass_type() } // namespace ir } // namespace framework From 4b1f1a878732b920f94f3e42d0cb328c308d4bca Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 9 Nov 2018 14:21:34 +0800 Subject: [PATCH 18/50] fix merge issue --- paddle/fluid/inference/analysis/helper.h | 1 + paddle/fluid/platform/init.cc | 2 ++ paddle/fluid/platform/port.h | 3 +-- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 5151e2b69a..ea568a581d 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -26,6 +26,7 @@ limitations under the License. 
*/
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/port.h"

 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 4910baec6a..092585ed2a 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -175,7 +175,9 @@ void InitGLOG(const std::string &prog_name) {
   // glog will not hold the ARGV[0] inside.
   // Use strdup to alloc a new string.
   google::InitGoogleLogging(strdup(prog_name.c_str()));
+#ifndef _WIN32
   google::InstallFailureSignalHandler();
+#endif
 }

 }  // namespace framework
diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
index cf9f4aa95b..4ff07edc19 100644
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -30,11 +30,10 @@
 #include <sys/stat.h>
 #include <numeric>  // std::accumulate
 #else
+#include <io.h>
 #include <stdio.h>  // _popen, _pclose
 #include <windows.h>
-#if defined(_WIN32)
 #include <numeric>  // std::accumulate in msvc
-#endif

 // windows version of __attribute__((unused))
 #define UNUSED __pragma(warning(suppress : 4100))

From 350f1f397178ac7d6a73f0c9b5cb00c2d65e5e47 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Fri, 9 Nov 2018 14:29:58 +0800
Subject: [PATCH 19/50] remove duplicate function definition

---
 paddle/fluid/inference/analysis/helper.h | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index ea568a581d..2517f5a373 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -125,20 +125,6 @@ T &GetFromScope(const framework::Scope &scope, const std::string &name) {
   return *var->GetMutable<T>();
 }

-static void ExecShellCommand(const std::string &cmd, std::string *message) {
-  char buffer[128];
-  std::shared_ptr<FILE> pipe(popen(cmd.c_str(), "r"), pclose);
-  if (!pipe) {
-    LOG(ERROR) << "error running command: " << cmd;
-    return;
-  }
-  while (!feof(pipe.get())) {
-    if (fgets(buffer, 128, pipe.get()) != nullptr) {
-      *message += buffer;
-    }
-  }
-}
-
 static framework::proto::ProgramDesc LoadProgramDesc(
     const std::string &model_path) {
   std::ifstream fin(model_path, std::ios::in | std::ios::binary);
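The ExecShellCommand helper removed above is a duplicate; the surviving copy apparently lives behind platform/port.h, which helper.h now includes. Its core trick, binding pclose as the smart-pointer deleter so the pipe is closed on every return path, deserves a standalone sketch (editorial example, not code from these patches):

    #include <cstdio>
    #include <iostream>
    #include <memory>
    #include <string>

    int main() {
      FILE* raw = popen("echo hello", "r");
      if (!raw) return 1;
      // pclose runs automatically when the last reference goes away.
      std::shared_ptr<FILE> pipe(raw, pclose);
      char buffer[128];
      std::string message;
      while (fgets(buffer, sizeof(buffer), pipe.get()) != nullptr) {
        message += buffer;
      }
      std::cout << message;
      return 0;
    }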
From 4bd0c4c5ee47378e0eabaa7cbc88a5d1c6c30a17 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Fri, 9 Nov 2018 15:14:33 +0800
Subject: [PATCH 20/50] test=develop

---
 paddle/fluid/platform/port.h | 68 ++++++++++++++++++------------------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
index 4ff07edc19..347622f212 100644
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -24,42 +24,42 @@
 #include "glog/logging.h"

 #if !defined(_WIN32)
-#define UNUSED __attribute__((unused))
-#include <dlfcn.h>     // dladdr
-#include <execinfo.h>  // backtrace
-#include <sys/stat.h>
-#include <numeric>     // std::accumulate
+  #define UNUSED __attribute__((unused))
+  #include <dlfcn.h>     // dladdr
+  #include <execinfo.h>  // backtrace
+  #include <sys/stat.h>
+  #include <numeric>     // std::accumulate
 #else
-#include <io.h>
-#include <stdio.h>  // _popen, _pclose
-#include <windows.h>
-#include <numeric>  // std::accumulate in msvc
-// windows version of __attribute__((unused))
-#define UNUSED __pragma(warning(suppress : 4100))
-
-#ifndef S_ISDIR  // windows port for sys/stat.h
-#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
-#endif  // S_ISDIR
-
-static void *dlsym(void *handle, const char *symbol_name) {
-  FARPROC found_symbol;
-  found_symbol = GetProcAddress((HMODULE)handle, symbol_name);
-
-  if (found_symbol == NULL) {
-    throw std::runtime_error(std::string(symbol_name) + " not found.");
-  }
-  return reinterpret_cast<void *>(found_symbol);
-}
+  #include <io.h>
+  #include <stdio.h>  // _popen, _pclose
+  #include <windows.h>
+  #include <numeric>  // std::accumulate in msvc
+  // windows version of __attribute__((unused))
+  #define UNUSED __pragma(warning(suppress : 4100))
+
+  #ifndef S_ISDIR  // windows port for sys/stat.h
+  #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
+  #endif  // S_ISDIR
+
+  static void *dlsym(void *handle, const char *symbol_name) {
+    FARPROC found_symbol;
+    found_symbol = GetProcAddress((HMODULE)handle, symbol_name);
+
+    if (found_symbol == NULL) {
+      throw std::runtime_error(std::string(symbol_name) + " not found.");
+    }
+    return reinterpret_cast<void *>(found_symbol);
+  }

-static void *dlopen(const char *filename, int flag) {
-  std::string file_name(filename);
-  file_name.replace(0, file_name.size() - 1, '/', '\\');
-  HMODULE hModule = LoadLibrary(file_name.c_str());
-  if (!hModule) {
-    throw std::runtime_error(file_name + " not found.");
-  }
-  return reinterpret_cast<void *>(hModule);
-}
+  static void *dlopen(const char *filename, int flag) {
+    std::string file_name(filename);
+    // std::string::replace does not substitute characters, it replaces a
+    // range with n copies of one character; convert the separators one by one
+    for (auto &c : file_name) {
+      if (c == '/') c = '\\';
+    }
+    HMODULE hModule = LoadLibrary(file_name.c_str());
+    if (!hModule) {
+      throw std::runtime_error(file_name + " not found.");
+    }
+    return reinterpret_cast<void *>(hModule);
+  }

 #endif  // !_WIN32

From 1b0ce151dfb5e34197b3ed1f5e08e14faa625810 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Fri, 9 Nov 2018 17:29:07 +0800
Subject: [PATCH 21/50] fix API check issue

---
 python/paddle/fluid/layers/nn.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index ad4c3773d5..b379c52350 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -163,10 +163,6 @@ __all__ = [
     'log_loss',
     'add_position_encoding',
 ]
-if os.name != 'nt':
-    __all__.append('dynamic_lstm')
-    __all__.append('crf_decoding')
-    __all__.append('roi_pool')


 def fc(input,

From e768c370e8cd303534495191f11bc7d288b357f9 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Fri, 9 Nov 2018 18:10:03 +0800
Subject: [PATCH 22/50] fix api check

---
 python/paddle/fluid/layers/nn.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index b379c52350..c757b080f8 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -33,10 +33,12 @@ from .. import core
 __all__ = [
     'fc',
     'embedding',
+    'dynamic_lstm',
     'dynamic_lstmp',
     'dynamic_gru',
     'gru_unit',
     'linear_chain_crf',
+    'crf_decoding',
     'cos_sim',
     'cross_entropy',
     'square_error_cost',
@@ -95,6 +97,7 @@ __all__ = [
     'pad',
     'pad_constant_like',
     'label_smooth',
+    'roi_pool',
     'roi_align',
     'dice_loss',
     'image_resize',

From 81476ff3cfa2c6bf342728a25ea91533a44c2d97 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Fri, 9 Nov 2018 18:27:18 +0800
Subject: [PATCH 23/50] fix api check

---
 python/paddle/fluid/__init__.py  | 12 +++++-------
 python/paddle/fluid/framework.py |  2 +-
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 6c45e15168..c4a5421cdb 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -44,17 +44,16 @@ from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
 from . import clip
 from . import profiler
 from . import unique_name
-if os.name != 'nt':
-    from . import recordio_writer
-    from . 
import parallel_executor - from .parallel_executor import * +from . import recordio_writer +from . import parallel_executor +from .parallel_executor import * from paddle.fluid.layers.math_op_patch import monkey_patch_variable Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + \ trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ - lod_tensor.__all__ + [ + parallel_executor.__all__ + lod_tensor.__all__ + [ 'io', 'initializer', 'layers', @@ -80,8 +79,7 @@ __all__ = framework.__all__ + executor.__all__ + \ 'recordio_writer', 'Scope', ] -if os.name != 'nt': - __all__ += parallel_executor.__all__ + def __bootstrap__(): """ diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 0282ffec16..fd03dff386 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -536,7 +536,7 @@ class Operator(object): OP_WITHOUT_KERNEL_SET = { 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', - 'listen_and_serv', 'parallel_do', 'save_combine', 'loadload_combine', + 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', 'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id' } From 7638f0afb30b849f6a237438c97c8d5680572cf4 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 9 Nov 2018 21:07:42 +0800 Subject: [PATCH 24/50] simplify the logic --- cmake/external/openblas.cmake | 61 +++++++-------------------- paddle/fluid/operators/CMakeLists.txt | 2 +- paddle/fluid/platform/variant.h | 2 +- 3 files changed, 18 insertions(+), 47 deletions(-) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index ac31423a6d..25431f0aee 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -72,51 +72,22 @@ IF(NOT ${CBLAS_FOUND}) ENDIF() ENDIF() - IF(WIN32) - ExternalProject_Add( - extern_openblas - ${EXTERNAL_PROJECT_LOG_ARGS} - # GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git - GIT_REPOSITORY http://admin@172.20.90.14:8080/r/openblas.git - # GIT_TAG ${OPENBLAS_COMMIT} - PREFIX ${CBLAS_SOURCES_DIR} - INSTALL_DIR ${CBLAS_INSTALL_DIR} - BUILD_IN_SOURCE 1 - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DNO_SHARED=ON - -DNO_STATIC=OFF - -DBUILD_WITHOUT_LAPACK=ON - -DUSE_THREAD=OFF - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - ) - ELSE(WIN32) - SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs) - ExternalProject_Add( - extern_openblas - ${EXTERNAL_PROJECT_LOG_ARGS} - # GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git - GIT_REPOSITORY http://admin@172.20.90.14:8080/r/openblas.git - # GIT_TAG ${OPENBLAS_COMMIT} - PREFIX ${CBLAS_SOURCES_DIR} - INSTALL_DIR ${CBLAS_INSTALL_DIR} - BUILD_IN_SOURCE 1 - BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS} - INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX= - && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - ) - ENDIF(WIN32) + SET(COMMON_ARGS CC=${OPENBLAS_CC} 
NO_SHARED=1 NO_LAPACK=1 libs)
+  ExternalProject_Add(
+      extern_openblas
+      ${EXTERNAL_PROJECT_LOG_ARGS}
+      # GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git
+      GIT_REPOSITORY http://admin@172.20.90.14:8080/r/openblas.git
+      # GIT_TAG ${OPENBLAS_COMMIT}
+      PREFIX ${CBLAS_SOURCES_DIR}
+      INSTALL_DIR ${CBLAS_INSTALL_DIR}
+      BUILD_IN_SOURCE 1
+      BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS}
+      INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=
+      && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig
+      UPDATE_COMMAND ""
+      CONFIGURE_COMMAND ""
+  )
 ENDIF (WITH_PREBUILD_OPENBLAS)

 SET(CBLAS_PROVIDER openblas)
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index df8d8e557c..3bc3b3c5e3 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -320,8 +320,8 @@ op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
 op_library(save_combine_op DEPS lod_tensor)
 op_library(load_combine_op DEPS lod_tensor)
-op_library(tensor_array_to_tensor_op DEPS concat_op)
 op_library(concat_op DEPS concat_and_split)
+op_library(tensor_array_to_tensor_op DEPS concat_op)

 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h
index 148e1ae6eb..fb6a8bb96f 100644
--- a/paddle/fluid/platform/variant.h
+++ b/paddle/fluid/platform/variant.h
@@ -44,7 +44,7 @@ limitations under the License. */
 #include <boost/variant.hpp>

 // some platform-independent definition
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__)
+#if defined(_WIN32)
 #define __UNUSED__()
 #define __builtin_expect(EXP, C) (EXP)
 #else

From dc339b78d72a69de2b2fb07ff2f3c3f4cf1c017e Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Mon, 12 Nov 2018 11:33:46 +0800
Subject: [PATCH 25/50] fix code style

---
 CMakeLists.txt                                 |   1 -
 cmake/external/openblas.cmake                  | 107 +++++++++---------
 .../framework/ir/attention_lstm_fuse_pass.cc   |  12 +-
 paddle/fluid/framework/ir/pass.h               |   4 +-
 paddle/fluid/platform/port.h                   |   4 -
 .../fluid/platform/stream_callback_manager.h   |   2 +-
 paddle/fluid/platform/variant.h                |   4 +-
 7 files changed, 65 insertions(+), 69 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cd8c54e24e..32b369bec5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,7 +77,6 @@ option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface"
 option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
 option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
 option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON)
-option(WITH_PREBUILD_OPENBLAS "Make use of the pre-built openblas library" ${WIN32})

 # PY_VERSION
 if(NOT PY_VERSION)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 25431f0aee..aeb976b840 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -31,65 +31,66 @@ IF(NOT ${CBLAS_FOUND})

     ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)

-    IF (WITH_PREBUILD_OPENBLAS)
+    IF (WIN32)
         SET(CBLAS_FOUND true)
-        MESSAGE(STATUS, "Use prebuild openblas, please put it at " ${CBLAS_INSTALL_DIR})
-    ELSE(WITH_PREBUILD_OPENBLAS)
-        SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
-        SET(OPENBLAS_COMMIT "v0.2.20")
+        MESSAGE(WARNING "In windows, openblas only supports the msvc build, please build it manually and put it at " ${CBLAS_INSTALL_DIR})
+    ENDIF(WIN32)

+    IF (NOT WIN32)
+      SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
+      SET(OPENBLAS_COMMIT "v0.2.20")
+
+      IF(CMAKE_CROSSCOMPILING)
+        SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
+        GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY)
+        SET(CROSS_SUFFIX ${CROSS_SUFFIX}/)
+        IF(ANDROID)
+          IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+            # use softfp
+            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
+          ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
+            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
+          ENDIF()
+        ELSEIF(IOS)
+          IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+            SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
+            SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
+            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
+          ELSE()
+            MESSAGE(FATAL_ERROR "OpenBLAS only supports arm64 architectures on iOS. "
+                    "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.")
+          ENDIF()
+        ELSEIF(RPI)
+          # use hardfp
+          SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0)
+        ENDIF()
+      ELSE()
+        IF(APPLE)
+          SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
+        ENDIF()
+        SET(OPTIONAL_ARGS "")
+        IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
+          SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
+        ENDIF()
+      ENDIF()

+      SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
+      ExternalProject_Add(
+        extern_openblas
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git
+        GIT_TAG ${OPENBLAS_COMMIT}
+        PREFIX ${CBLAS_SOURCES_DIR}
+        INSTALL_DIR ${CBLAS_INSTALL_DIR}
+        BUILD_IN_SOURCE 1
+        BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS}
+        INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=
+        && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig
+        UPDATE_COMMAND ""
+        CONFIGURE_COMMAND ""
+      )
+    ELSE()
+    ENDIF(NOT WIN32)
     SET(CBLAS_PROVIDER openblas)
     IF(WITH_C_API)
         INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas)
diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
index dfef9f381b..ecefab32bb 100644
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -211,12 +211,12 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
   VLOG(30) << "LSTMWeight resized to " << out->dims();

   float* out_data = out->mutable_data<float>(platform::CPUPlace());
-  std::array<const float*, 4> tensors =
+  std::array<const float*, 4> tensors{
       {W_forget_w0.data<float>(), W_input_w0.data<float>(),
-       W_output_w0.data<float>(), W_cell_w0.data<float>()};
-  std::array<const float*, 4> tensors1 =
+       W_output_w0.data<float>(), W_cell_w0.data<float>()}};
+  std::array<const float*, 4> tensors1{
       {W_forget_w1.data<float>(), W_input_w1.data<float>(),
-       W_output_w1.data<float>(), W_cell_w1.data<float>()};
+       W_output_w1.data<float>(), W_cell_w1.data<float>()}};

   for (int row = 0; row < D; row++) {
     for (int col = 0; col < 4; col++) {
@@ -238,9 +238,9 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
 void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
                      const LoDTensor& B_output, const LoDTensor& B_cell,
                      LoDTensor* out) {
-  std::array<const float*, 4> tensors =
+  std::array<const float*, 4> tensors{
       {B_forget.data<float>(), B_input.data<float>(), B_output.data<float>(),
-       B_cell.data<float>()};
+       B_cell.data<float>()}};

   PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1);
   int D = B_forget.dims()[0];
diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h
index 8d699146bd..5f7cea65d9 100644
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -207,7 +207,7 @@ struct PassRegistrar : public Registrar {
     return 0;                                                   \
   }                                                             \
   static ::paddle::framework::ir::PassRegistrar                 \
-      &__pass_tmp_registrar_##pass_type##__ __UNUSED__() =      \
+      &__pass_tmp_registrar_##pass_type##__ UNUSED =            \
          __pass_registrar_##pass_type##__

 #define USE_PASS(pass_type)                                     \
@@ -215,7 +215,7 @@ struct PassRegistrar : public Registrar {
       __use_pass_itself_##pass_type,                            \
       "USE_PASS must be called in global namespace");           \
   extern int TouchPassRegistrar_##pass_type();                  \
-  static int use_pass_itself_##pass_type##_ __UNUSED__() =      \
+  static int use_pass_itself_##pass_type##_ UNUSED =            \
       TouchPassRegistrar_##pass_type()

 }  // namespace ir
diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
index 4ff07edc19..55e1dd87c2 100644
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -24,7 +24,6 @@
 #include "glog/logging.h"

 #if !defined(_WIN32)
-#define UNUSED __attribute__((unused))
 #include <dlfcn.h>     // dladdr
 #include <execinfo.h>  // backtrace
 #include <sys/stat.h>
@@ -34,9 +33,6 @@
 #include <stdio.h>  // _popen, _pclose
 #include <windows.h>
 #include <numeric>  // std::accumulate in msvc
-// windows version of __attribute__((unused))
-#define UNUSED __pragma(warning(suppress : 4100))
-
 #ifndef S_ISDIR  // windows port for sys/stat.h
 #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
 #endif  // S_ISDIR
diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h
index 3cd1628a0b..0e88a439cf 100644
--- a/paddle/fluid/platform/stream_callback_manager.h
+++ b/paddle/fluid/platform/stream_callback_manager.h
@@ -18,8 +18,8 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <functional>
+#include "ThreadPool.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "third_party/threadpool/src/extern_threadpool/ThreadPool.h"

 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h
index fb6a8bb96f..e9d90ac1ec 100644
--- a/paddle/fluid/platform/variant.h
+++ b/paddle/fluid/platform/variant.h
@@ -45,8 +45,8 @@ limitations under the License. */

 // some platform-independent definition
 #if defined(_WIN32)
-#define __UNUSED__()
+#define UNUSED
 #define __builtin_expect(EXP, C) (EXP)
 #else
-#define __UNUSED__() __attribute__((unused))
+#define UNUSED __attribute__((unused))
 #endif
\ No newline at end of file

From 7840d181c9b84b25fad80a69c49cc09a29e158f2 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Mon, 12 Nov 2018 11:49:13 +0800
Subject: [PATCH 26/50] fix style issue

---
 doc/v2/dev/contribute_to_paddle_en.md          |  2 +-
 paddle/fluid/framework/data_type_transform.cu  | 14 ++++++++++++++
 paddle/fluid/framework/tensor_util.cu          | 14 ++++++++++++++
 paddle/fluid/platform/nccl_helper.h            |  2 +-
 paddle/fluid/platform/variant.h                |  2 +-
 python/paddle/fluid/__init__.py                |  9 ++++-----
 python/paddle/fluid/layers/io.py               |  5 +++--
 python/paddle/fluid/layers/nn.py               | 17 +++++++++++++----
 python/paddle/fluid/layers/ops.py              |  2 --
 .../paddle/trainer_config_helpers/networks.py  |  4 ++--
 10 files changed, 53 insertions(+), 18 deletions(-)

diff --git a/doc/v2/dev/contribute_to_paddle_en.md b/doc/v2/dev/contribute_to_paddle_en.md
index c97564d93a..7272339644 120000
--- a/doc/v2/dev/contribute_to_paddle_en.md
+++ b/doc/v2/dev/contribute_to_paddle_en.md
@@ -1 +1 @@
-../../../CONTRIBUTING.md
\ No newline at end of file
+../../../CONTRIBUTING.md
diff --git a/paddle/fluid/framework/data_type_transform.cu b/paddle/fluid/framework/data_type_transform.cu
index f46491293e..7dd9cb5cfd 120000
--- a/paddle/fluid/framework/data_type_transform.cu
+++ b/paddle/fluid/framework/data_type_transform.cu
@@ -1 +1,15 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + data_type_transform.cc \ No newline at end of file diff --git a/paddle/fluid/framework/tensor_util.cu b/paddle/fluid/framework/tensor_util.cu index edd88c4e54..251c3a5e40 120000 --- a/paddle/fluid/framework/tensor_util.cu +++ b/paddle/fluid/framework/tensor_util.cu @@ -1 +1,15 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + tensor_util.cc \ No newline at end of file diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index cbe03c163f..a6360a884d 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -150,4 +150,4 @@ struct NCCLContextMap { } // namespace platform } // namespace paddle -#endif \ No newline at end of file +#endif diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h index e9d90ac1ec..1b10db8669 100644 --- a/paddle/fluid/platform/variant.h +++ b/paddle/fluid/platform/variant.h @@ -49,4 +49,4 @@ limitations under the License. 
*/ #define __builtin_expect(EXP, C) (EXP) #else #define UNUSED __attribute__((unused)) -#endif \ No newline at end of file +#endif diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index d3ad2149bc..2e1b4b2ead 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -112,11 +112,10 @@ def __bootstrap__(): os.environ['OMP_NUM_THREADS'] = str(num_threads) read_env_flags = [ - 'use_pinned_memory', 'check_nan_inf', 'benchmark', - 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', - 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', - 'dist_threadpool_size', 'eager_delete_tensor_gb', - 'reader_queue_speed_test_mode' + 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope', + 'use_mkldnn', 'initial_cpu_memory_in_mb', 'init_allocated_mem', + 'free_idle_memory', 'paddle_num_threads', 'dist_threadpool_size', + 'eager_delete_tensor_gb', 'reader_queue_speed_test_mode' ] if os.name != 'nt': read_env_flags.append('warpctc_dir') diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index d50f6744df..a9075045a2 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -348,6 +348,7 @@ def _copy_reader_create_op_(block, op): if os.name != 'nt': + @templatedoc(op_type='create_recordio_file_reader') def open_recordio_file(filename, shapes, @@ -405,8 +406,8 @@ if os.name != 'nt': startup_var.desc.set_dtypes(dtypes) startup_var.persistable = True - main_prog_var = _copy_reader_var_(default_main_program().current_block(), - startup_var) + main_prog_var = _copy_reader_var_( + default_main_program().current_block(), startup_var) if pass_num > 1: main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index c0278efb60..4b9264bfb6 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -342,6 +342,7 @@ def embedding(input, if os.name != 'nt': + @templatedoc(op_type="lstm") def dynamic_lstm(input, size, @@ -961,6 +962,7 @@ def linear_chain_crf(input, label, param_attr=None): if os.name != 'nt': + @templatedoc() def crf_decoding(input, param_attr, label=None): """ @@ -988,9 +990,11 @@ if os.name != 'nt': dtype=helper.input_dtype()) helper.append_op( type='crf_decoding', - inputs={"Emission": [input], - "Transition": transition, - "Label": label}, + inputs={ + "Emission": [input], + "Transition": transition, + "Label": label + }, outputs={"ViterbiPath": [viterbi_path]}) return viterbi_path @@ -5530,8 +5534,13 @@ def label_smooth(label, if os.name != 'nt': + @templatedoc() - def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): + def roi_pool(input, + rois, + pooled_height=1, + pooled_width=1, + spatial_scale=1.0): """ ${comment} diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index df52b7042f..66eb1229aa 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -105,7 +105,6 @@ if os.name != 'nt': _cum_sum_ = generate_layer_fn('cumsum') - def cumsum(x, axis=None, exclusive=None, reverse=None): locals_var = locals().keys() kwargs = dict() @@ -115,7 +114,6 @@ if os.name != 'nt': kwargs[name] = val return _cum_sum_(**kwargs) - cumsum.__doc__ = _cum_sum_.__doc__ + """ Examples: diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index b5cde7bac7..1e961b936f 100644 --- 
a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -1719,7 +1719,7 @@ def inputs(layers, *args): if len(args) != 0: layers.extend(args) - Inputs(*[l.name for l in layers]) + Inputs(* [l.name for l in layers]) def outputs(layers, *args): @@ -1769,7 +1769,7 @@ def outputs(layers, *args): assert len(layers) > 0 if HasInputsSet(): # input already set - Outputs(*[l.name for l in layers]) + Outputs(* [l.name for l in layers]) return # just return outputs. if len(layers) != 1: From 1b75fd2236dfa3563226306269fc04f63395e8f6 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 12 Nov 2018 13:23:41 +0800 Subject: [PATCH 27/50] revert --- paddle/fluid/framework/data_type_transform.cu | 14 -------------- paddle/fluid/framework/tensor_util.cu | 14 -------------- 2 files changed, 28 deletions(-) diff --git a/paddle/fluid/framework/data_type_transform.cu b/paddle/fluid/framework/data_type_transform.cu index 7dd9cb5cfd..f46491293e 120000 --- a/paddle/fluid/framework/data_type_transform.cu +++ b/paddle/fluid/framework/data_type_transform.cu @@ -1,15 +1 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - data_type_transform.cc \ No newline at end of file diff --git a/paddle/fluid/framework/tensor_util.cu b/paddle/fluid/framework/tensor_util.cu index 251c3a5e40..edd88c4e54 120000 --- a/paddle/fluid/framework/tensor_util.cu +++ b/paddle/fluid/framework/tensor_util.cu @@ -1,15 +1 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- tensor_util.cc \ No newline at end of file From 2d7134bc37fc0a9fa4b02a83fc1a20bf48c47674 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Tue, 13 Nov 2018 02:33:42 +0000 Subject: [PATCH 28/50] add initial code for plugin --- .../fluid/inference/tensorrt/CMakeLists.txt | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 2 +- .../inference/tensorrt/convert/concat_op.cc | 2 +- .../tensorrt/plugin/.trt_plugin_utils.h.swp | Bin 0 -> 12288 bytes .../inference/tensorrt/plugin/CMakeLists.txt | 2 + .../tensorrt/plugin/plugin_factory.cc | 64 ++++++++++ .../tensorrt/plugin/plugin_factory.h | 91 ++++++++++++++ .../inference/tensorrt/plugin/plugin_utils.cc | 37 ++++++ .../inference/tensorrt/plugin/plugin_utils.h | 34 ++++++ .../inference/tensorrt/plugin/serialize.hpp | 111 +++++++++++++++++ .../tensorrt/plugin/split_op_plugin.cu | 114 ++++++++++++++++++ .../tensorrt/plugin/split_op_plugin.h | 62 ++++++++++ .../inference/tensorrt/plugin/trt_plugin.cc | 63 ++++++++++ .../inference/tensorrt/plugin/trt_plugin.h | 72 +++++++++++ 14 files changed, 653 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/plugin/.trt_plugin_utils.h.swp create mode 100644 paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt create mode 100644 paddle/fluid/inference/tensorrt/plugin/plugin_factory.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/plugin_factory.h create mode 100644 paddle/fluid/inference/tensorrt/plugin/plugin_utils.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/plugin_utils.h create mode 100644 paddle/fluid/inference/tensorrt/plugin/serialize.hpp create mode 100644 paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h create mode 100644 paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/trt_plugin.h diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index a610687a5b..e09705e3c6 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,5 @@ nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto device_context) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) +add_subdirectory(plugin) add_subdirectory(convert) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 0a35e10f69..e34d5db6b8 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -2,7 +2,7 @@ nv_library(tensorrt_converter SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc pad_op.cc - DEPS tensorrt_engine operator scope framework_proto op_registry) + DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter) diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc index 60c16e35ed..cd1bb892bd 100644 --- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc @@ -19,7 +19,7 @@ namespace inference { namespace tensorrt 
{

 /*
- * MulOp, IMatrixMultiplyLayer in TRT. This Layer doesn't has weights.
+ * ConcatOp
  */
 class ConcatOpConverter : public OpConverter {
  public:
diff --git a/paddle/fluid/inference/tensorrt/plugin/.trt_plugin_utils.h.swp b/paddle/fluid/inference/tensorrt/plugin/.trt_plugin_utils.h.swp
new file mode 100644
index 0000000000000000000000000000000000000000..08d1434089f792131d0e6a545ad8675b3ba4892c
GIT binary patch
literal 12288
[base85-encoded vim swap file omitted]

diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
new file mode 100644
index 0000000000..1b91c864c9
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
@@ -0,0 +1,2 @@
+nv_library(tensorrt_plugin SRCS plugin_factory.cc plugin_utils.cc
+trt_plugin.cc split_op_plugin.cu DEPS enforce)
diff --git a/paddle/fluid/inference/tensorrt/plugin/plugin_factory.cc b/paddle/fluid/inference/tensorrt/plugin/plugin_factory.cc
new file mode 100644
index 0000000000..5ebcd44611
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/plugin_factory.cc
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/plugin/plugin_factory.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
+                                                    const void* serial_data,
+                                                    size_t serial_length) {
+  size_t parsed_byte = 0;
+  std::string encoded_op_name =
+      ExtractOpName(serial_data, serial_length, &parsed_byte);
+
+  if (!IsPlugin(encoded_op_name)) {
+    return nullptr;
+  }
+
+  auto plugin_ptr =
+      plugin_registry_[encoded_op_name].first(serial_data, serial_length);
+  owned_plugins_.emplace_back(plugin_ptr);
+
+  return plugin_ptr;
+}
+
+PluginTensorRT* PluginFactoryTensorRT::CreatePlugin(
+    const std::string& op_name) {
+  if (!IsPlugin(op_name)) return nullptr;
+
+  auto plugin_ptr = plugin_registry_[op_name].second();
+  owned_plugins_.emplace_back(plugin_ptr);
+
+  return plugin_ptr;
+}
+
+bool PluginFactoryTensorRT::RegisterPlugin(
+    const std::string& op_name, PluginDeserializeFunc deserialize_func,
+    PluginConstructFunc construct_func) {
+  if (IsPlugin(op_name)) return false;
+
+  auto ret = plugin_registry_.emplace(
+      op_name, std::make_pair(deserialize_func, construct_func));
+
+  return ret.second;
+}
+
+void PluginFactoryTensorRT::DestroyPlugins() { owned_plugins_.clear(); }
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
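Stripped of the TensorRT details, the factory above is a name-to-constructor map plus an ownership list. A self-contained miniature of the same pattern (editorial sketch; the Plugin type here is a stand-in, not the PluginTensorRT from this patch):

    #include <functional>
    #include <iostream>
    #include <string>
    #include <unordered_map>

    struct Plugin { std::string name; };

    int main() {
      // Registration: map an op name to a construction callback.
      std::unordered_map<std::string, std::function<Plugin*()>> registry;
      registry.emplace("split", [] { return new Plugin{"split"}; });

      // CreatePlugin(): look the op up and construct it, or return null.
      auto it = registry.find("split");
      Plugin* p = (it == registry.end()) ? nullptr : it->second();
      std::cout << (p ? p->name : "not registered") << "\n";
      delete p;
      return 0;
    }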
diff --git a/paddle/fluid/inference/tensorrt/plugin/plugin_factory.h b/paddle/fluid/inference/tensorrt/plugin/plugin_factory.h
new file mode 100644
index 0000000000..00435766f7
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/plugin_factory.h
@@ -0,0 +1,91 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+#include "NvInfer.h"
+#include "paddle/fluid/inference/tensorrt/plugin/plugin_utils.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class PluginFactoryTensorRT : public nvinfer1::IPluginFactory {
+ public:
+  static PluginFactoryTensorRT* GetInstance() {
+    static PluginFactoryTensorRT* factory_instance =
+        new PluginFactoryTensorRT();
+    return factory_instance;
+  }
+
+  // Deserialization method
+  PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data,
+                               size_t serial_length) override;
+
+  // Plugin construction, PluginFactoryTensorRT owns the plugin.
+  PluginTensorRT* CreatePlugin(const std::string& op_name);
+
+  bool RegisterPlugin(const std::string& op_name,
+                      PluginDeserializeFunc deserialize_func,
+                      PluginConstructFunc construct_func);
+
+  bool IsPlugin(const std::string& op_name) {
+    return plugin_registry_.find(op_name) != plugin_registry_.end();
+  }
+
+  size_t CountOwnedPlugins() { return owned_plugins_.size(); }
+
+  void DestroyPlugins();
+
+ protected:
+  std::unordered_map<std::string,
+                     std::pair<PluginDeserializeFunc, PluginConstructFunc>>
+      plugin_registry_;
+  std::vector<std::unique_ptr<PluginTensorRT>> owned_plugins_;
+};
+
+class TrtPluginRegistrar {
+ public:
+  TrtPluginRegistrar(const std::string& name,
+                     PluginDeserializeFunc deserialize_func,
+                     PluginConstructFunc construct_func) {
+    auto factory = PluginFactoryTensorRT::GetInstance();
+    // platform::PADDLE_ENFORCE(factory->RegisterPlugin(name, deserialize_func,
+    // construct_func), "Failed to register plugin [%s]", name);
+    // platform::PADDLE_ENFORCE(factory->RegisterPlugin(name, deserialize_func,
+    // construct_func));
+    factory->RegisterPlugin(name, deserialize_func, construct_func);
+  }
+};
+
+#define REGISTER_TRT_PLUGIN(name, deserialize_func, construct_func)    \
+  REGISTER_TRT_PLUGIN_UNIQ_HELPER(__COUNTER__, name, deserialize_func, \
+                                  construct_func)
+#define REGISTER_TRT_PLUGIN_UNIQ_HELPER(ctr, name, deserialize_func, \
+                                        construct_func)              \
+  REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func, construct_func)
+#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func, construct_func) \
+  static ::paddle::inference::tensorrt::TrtPluginRegistrar                    \
+      trt_plugin_registrar##ctr __attribute__((unused)) =                     \
+          ::paddle::inference::tensorrt::TrtPluginRegistrar(                  \
+              name, deserialize_func, construct_func)
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/plugin_utils.cc b/paddle/fluid/inference/tensorrt/plugin/plugin_utils.cc
new file mode 100644
index 0000000000..2cc4162aa7
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/plugin_utils.cc
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/plugin/plugin_utils.h"
+#include <cassert>
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+std::string ExtractOpName(const void* serial_data, size_t serial_length,
+                          size_t* incremental) {
+  size_t op_name_char_count = *static_cast<const size_t*>(serial_data);
+  *incremental = sizeof(size_t) + op_name_char_count;
+
+  assert(serial_length >= *incremental);
+
+  const char* buffer = static_cast<const char*>(serial_data) + sizeof(size_t);
+  std::string op_name(buffer, op_name_char_count);
+
+  return op_name;
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
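ExtractOpName above fixes the wire format's header: a size_t character count followed by the op name's bytes, with the plugin payload after them. The writer side would look like this (editorial sketch; EncodeOpName is a hypothetical name, the real serialization lives in the plugin classes that the rest of this patch introduces):

    #include <cstring>
    #include <string>
    #include <vector>

    std::vector<char> EncodeOpName(const std::string& op_name) {
      std::vector<char> buf(sizeof(size_t) + op_name.size());
      size_t n = op_name.size();
      std::memcpy(buf.data(), &n, sizeof(size_t));                   // [length]
      std::memcpy(buf.data() + sizeof(size_t), op_name.data(), n);   // [chars]
      return buf;  // ExtractOpName() reads this prefix back.
    }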
diff --git a/paddle/fluid/inference/tensorrt/plugin/plugin_utils.h b/paddle/fluid/inference/tensorrt/plugin/plugin_utils.h
new file mode 100644
index 0000000000..fb6608c12a
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/plugin_utils.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include
+
+#include "NvInfer.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+typedef std::function<PluginTensorRT*(const void*, size_t)>
+    PluginDeserializeFunc;
+typedef std::function<PluginTensorRT*(void)> PluginConstructFunc;
+
+std::string ExtractOpName(const void* serial_data, size_t serial_length,
+                          size_t* incremental);
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/serialize.hpp b/paddle/fluid/inference/tensorrt/plugin/serialize.hpp
new file mode 100644
index 0000000000..96df352feb
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/serialize.hpp
@@ -0,0 +1,111 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+template <typename T>
+inline void serialize_value(void** buffer, T const& value);
+
+template <typename T>
+inline void deserialize_value(void const** buffer, size_t* buffer_size,
+                              T* value);
+
+namespace {
+
+template <typename T, class Enable = void>
+struct Serializer {};
+
+template <typename T>
+struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
+                                             std::is_enum<T>::value ||
+                                             std::is_pod<T>::value>::type> {
+  static size_t serialized_size(T const& value) { return sizeof(T); }
+  static void serialize(void** buffer, T const& value) {
+    ::memcpy(*buffer, &value, sizeof(T));
+    reinterpret_cast<char*&>(*buffer) += sizeof(T);
+  }
+  static void deserialize(void const** buffer, size_t* buffer_size, T* value) {
+    assert(*buffer_size >= sizeof(T));
+    ::memcpy(value, *buffer, sizeof(T));
+    reinterpret_cast<char const*&>(*buffer) += sizeof(T);
+    *buffer_size -= sizeof(T);
+  }
+};
+
+template <>
+struct Serializer<const char*> {
+  static size_t serialized_size(const char* value) { return strlen(value) + 1; }
+  static void serialize(void** buffer, const char* value) {
+    ::strcpy(static_cast<char*>(*buffer), value);
+    reinterpret_cast<char*&>(*buffer) += strlen(value) + 1;
+  }
+  static void deserialize(void const** buffer, size_t* buffer_size,
+                          const char** value) {
+    *value = static_cast<char const*>(*buffer);
+    size_t data_size = strnlen(*value, *buffer_size) + 1;
+    assert(*buffer_size >= data_size);
+    reinterpret_cast<char const*&>(*buffer) += data_size;
+    *buffer_size -= data_size;
+  }
+};
+
+template <typename T>
+struct Serializer<std::vector<T>,
+                  typename std::enable_if<std::is_arithmetic<T>::value ||
+                                          std::is_enum<T>::value ||
+                                          std::is_pod<T>::value>::type> {
+  static size_t serialized_size(std::vector<T> const& value) {
+    return sizeof(value.size()) + value.size() * sizeof(T);
+  }
+  static void serialize(void** buffer, std::vector<T> const& value) {
+    serialize_value(buffer, value.size());
+    size_t nbyte = value.size() * sizeof(T);
+    ::memcpy(*buffer, value.data(), nbyte);
+    reinterpret_cast<char*&>(*buffer) += nbyte;
+  }
+  static void deserialize(void const** buffer, size_t* buffer_size,
+                          std::vector<T>* value) {
+    size_t size;
+    deserialize_value(buffer, buffer_size, &size);
+    value->resize(size);
+    size_t nbyte = value->size() * sizeof(T);
+    assert(*buffer_size >= nbyte);
+    ::memcpy(value->data(), *buffer, nbyte);
+    reinterpret_cast<char const*&>(*buffer) += nbyte;
+    *buffer_size -= nbyte;
+  }
+};
+
+}  // namespace
+
+template <typename T>
+inline size_t serialized_size(T const& value) {
+  return Serializer<T>::serialized_size(value);
+}
+
+template <typename T>
+inline void serialize_value(void** buffer, T const& value) {
+  return Serializer<T>::serialize(buffer, value);
+}
+
+template <typename T>
+inline void deserialize_value(void const** buffer, size_t* buffer_size,
+                              T* value) {
+  return Serializer<T>::deserialize(buffer, buffer_size, value);
+}
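[Editor's note — illustration, not part of the patch] serialize.hpp gives the
plugins a tiny length-prefixed binary format; buffer sizing is the caller's
job via serialized_size. A round trip, as a minimal sketch:

    // Sketch only: assumes serialize.hpp is included.
    std::vector<int> lengths = {2, 1};
    size_t size = serialized_size(lengths);   // sizeof(size_t) + 2*sizeof(int)
    std::vector<char> storage(size);
    void* wptr = storage.data();
    serialize_value(&wptr, lengths);          // advances wptr past the data
    void const* rptr = storage.data();
    std::vector<int> restored;
    deserialize_value(&rptr, &size, &restored);  // size drops back to 0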
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
new file mode 100644
index 0000000000..044c229b55
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
@@ -0,0 +1,114 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+SplitPlugin* CreateSplitPlugin() { return new SplitPlugin(); }
+
+nvinfer1::Dims SplitPlugin::getOutputDimensions(int index,
+                                                const nvinfer1::Dims* inputDims,
+                                                int nbInputs) {
+  assert(nbInputs == 1);
+  assert(index < this->getNbOutputs());
+  nvinfer1::Dims const& input_dims = inputDims[0];
+  nvinfer1::Dims output_dims = input_dims;
+  output_dims.d[axis_] = output_lenght_.at(index);
+  return output_dims;
+}
+
+int SplitPlugin::initialize() {
+  std::vector<int> segment_offsets(1, 0);
+  for (int i = 0; i < this->getNbOutputs(); ++i) {
+    segment_offsets.push_back(segment_offsets.back() + output_lenght_[i]);
+  }
+  d_segment_offsets_ = segment_offsets;
+  nvinfer1::Dims dims = this->getInputDims(0);
+  nx_ = 1;
+  for (int i = dims.nbDims - 1; i > axis_; --i) {
+    nx_ *= dims.d[i];
+  }
+  ny_ = dims.d[axis_];
+  nz_ = 1;
+  for (int i = axis_ - 1; i >= 0; --i) {
+    nz_ *= dims.d[i];
+  }
+  return 0;
+}
+
+template <typename T>
+__device__ int upper_bound(T const* vals, int n, T const& key) {
+  int i = 0;
+  while (n > 0) {
+    int m = n / 2;
+    int j = i + m;
+    if (!(key < vals[j])) {
+      i = j + 1;
+      n -= m + 1;
+    } else {
+      n = m;
+    }
+  }
+  return i;
+}
+
+template <typename T>
+__global__ void split_kernel(int nsegment,
+                             int const* __restrict__ segment_offsets,
+                             T const* __restrict__ idata, T* const* odatas,
+                             int nx, int srcny_, int nz) {
+  int x0 = threadIdx.x + blockIdx.x * blockDim.x;
+  int src_y0 = threadIdx.y + blockIdx.y * blockDim.y;
+  int z0 = threadIdx.z + blockIdx.z * blockDim.z;
+  for (int z = z0; z < nz; z += blockDim.z * gridDim.z) {
+    for (int src_y = src_y0; src_y < srcny_; src_y += blockDim.y * gridDim.y) {
+      for (int x = x0; x < nx; x += blockDim.x * gridDim.x) {
+        int segment = upper_bound(segment_offsets, nsegment, src_y) - 1;
+        int dst_y = src_y - segment_offsets[segment];
+        int dstny_ = segment_offsets[segment + 1] - segment_offsets[segment];
+        odatas[segment][x + nx * (dst_y + dstny_ * z)] =
+            idata[x + nx * (src_y + srcny_ * z)];
+      }
+    }
+  }
+}
+
+int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
+                         void** outputs, void* workspace, cudaStream_t stream) {
+  auto const& input_dims = this->getInputDims(0);
+  int const* d_segment_offsets_ptr =
+      thrust::raw_pointer_cast(&d_segment_offsets_[0]);
+  float const* idata = reinterpret_cast<float const*>(inputs[0]);
+  float** odatas = reinterpret_cast<float**>(outputs);
+
+  int nz = nz_ * batchSize;
+  dim3 block(32, 16);
+  dim3 grid(std::min((nx_ - 1) / block.x + 1, 65535u),
+            std::min((ny_ - 1) / block.y + 1, 65535u),
+            std::min((nz_ - 1) / block.z + 1, 65535u));
+
+  split_kernel<<<grid, block, 0, stream>>>(d_segment_offsets_.size(),
+                                           d_segment_offsets_ptr, idata, odatas,
+                                           nx_, ny_, nz);
+
+  return cudaGetLastError() != cudaSuccess;
+}
+
+}  // tensorrt
+}  // inference
+}  // paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
new file mode 100644
index 0000000000..406c822bb5
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
@@ -0,0 +1,62 @@
+
+#pragma once
+
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+#include
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class SplitPlugin : public PluginTensorRT {
+  int axis_;
+  std::vector<int> output_lenght_;
+  int nx_, ny_, nz_;
+  thrust::device_vector<int> d_segment_offsets_;
+
+ protected:
+  virtual size_t getSerializationSize() override {
+    return serialized_size(axis_) + serialized_size(output_lenght_) +
+           getBaseSerializationSize();
+  }
+
+  virtual void serialize(void *buffer) override {
+    serializeBase(buffer);
+    serialize_value(&buffer, axis_);
+    serialize_value(&buffer, output_lenght_);
+  }
+
+ public:
+  SplitPlugin() {}
+  SplitPlugin(int axis, std::vector<int> const &output_lengths)
+      : axis_(axis), output_lenght_(output_lengths) {}
+  SplitPlugin(void const* serialData, size_t serialLength) {
+    deserializeBase(serialData, serialLength);
+    deserialize_value(&serialData, &serialLength, &axis_);
+    deserialize_value(&serialData, &serialLength, &output_lenght_);
+  }
+
+  SplitPlugin* clone() const override {
+    return new SplitPlugin(axis_, output_lenght_);
+  }
+
+  virtual const char* getPluginType() const override { return "split"; }
+  virtual int getNbOutputs() const override { return output_lenght_.size(); }
+  virtual nvinfer1::Dims getOutputDimensions(int index,
+                                             const nvinfer1::Dims *inputs,
+                                             int nbInputDims) override;
+  virtual int initialize() override;
+  virtual int enqueue(int batchSize,
+                      const void *const *inputs, void **outputs,
+                      void *workspace, cudaStream_t stream) override;
+
+  void setAxis(int axis) {
+    axis_ = axis;
+  }
+
+  void setOutputLengths(const std::vector<int> & output_lengths) {
+    output_lenght_ = output_lengths;
+  }
+
+};
+
+}  // tensorrt
+}  // inference
+}  // paddle
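[Editor's note — illustration, not part of the patch] Plugin (de)serialization
is order-sensitive: serializeBase writes input_dims_, max_batch_size_,
data_type_ and data_format_ first, and a subclass must read them back in the
same order before its own fields, exactly as SplitPlugin does above. A minimal
sketch of a new plugin following that contract (EchoPlugin and its field are
invented names):

    class EchoPlugin : public PluginTensorRT {
      int repeat_{1};  // example subclass payload

     protected:
      size_t getSerializationSize() override {
        return getBaseSerializationSize() + serialized_size(repeat_);
      }
      void serialize(void* buffer) override {
        serializeBase(buffer);              // base fields first
        serialize_value(&buffer, repeat_);  // then subclass fields
      }

     public:
      EchoPlugin(void const* data, size_t length) {
        deserializeBase(data, length);      // must mirror serialize() order
        deserialize_value(&data, &length, &repeat_);
      }
      // remaining IPluginExt overrides omitted in this sketch
    };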
+ +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/plugin_utils.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +void PluginTensorRT::serializeBase(void*& buffer) { + serialize_value(&buffer, input_dims_); + serialize_value(&buffer, max_batch_size_); + serialize_value(&buffer, data_type_); + serialize_value(&buffer, data_format_); +} + +void PluginTensorRT::deserializeBase(void const*& serialData, + size_t& serialLength) { + deserialize_value(&serialData, &serialLength, &input_dims_); + deserialize_value(&serialData, &serialLength, &max_batch_size_); + deserialize_value(&serialData, &serialLength, &data_type_); + deserialize_value(&serialData, &serialLength, &data_format_); +} + +size_t PluginTensorRT::getBaseSerializationSize() { + return (serialized_size(input_dims_) + serialized_size(max_batch_size_) + + serialized_size(data_type_) + serialized_size(data_format_)); +} + +bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, + nvinfer1::PluginFormat format) const { + return ((type == nvinfer1::DataType::kFLOAT || + type == nvinfer1::DataType::kHALF) && + (format == nvinfer1::PluginFormat::kNCHW)); +} + +void PluginTensorRT::configureWithFormat(const nvinfer1::Dims* inputDims, + int nbInputs, + const nvinfer1::Dims* outputDims, + int nbOutputs, nvinfer1::DataType type, + nvinfer1::PluginFormat format, + int maxBatchSize) { + data_type_ = type; + data_format_ = format; + input_dims_.assign(inputDims, inputDims + nbInputs); + max_batch_size_ = maxBatchSize; +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h new file mode 100644 index 0000000000..8168646bde --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -0,0 +1,72 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/serialize.hpp" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class PluginTensorRT : public nvinfer1::IPluginExt { + public: + PluginTensorRT() {} + PluginTensorRT(const void* serialized_data, size_t length) {} + nvinfer1::Dims const& getInputDims(int index) const { + return input_dims_.at(index); + } + size_t getMaxBatchSize() const { return max_batch_size_; } + nvinfer1::DataType getDataType() const { return data_type_; } + nvinfer1::PluginFormat getDataFormat() const { return data_format_; } + virtual const char* getPluginVersion() const { return "1"; } + size_t getWorkspaceSize(int) const override { return 0; } + void terminate() override {} + virtual ~PluginTensorRT() {} + + // The following functions need to be overrided in the subclass. 
+  virtual nvinfer1::IPluginExt* clone() const = 0;
+  virtual const char* getPluginType() const = 0;
+  int initialize() override { return 0; }
+  bool supportsFormat(nvinfer1::DataType type,
+                      nvinfer1::PluginFormat format) const override;
+  void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs,
+                           const nvinfer1::Dims* outputDims, int nbOutputs,
+                           nvinfer1::DataType type,
+                           nvinfer1::PluginFormat format,
+                           int maxBatchSize) override;
+  virtual void serialize(void* buffer) override;
+  virtual size_t getSerializationSize() override;
+
+ protected:
+  void deserializeBase(void const*& serialData, size_t& serialLength);
+  size_t getBaseSerializationSize();
+  void serializeBase(void*& buffer);
+
+  std::vector<nvinfer1::Dims> input_dims_;
+  size_t max_batch_size_;
+  nvinfer1::DataType data_type_;
+  nvinfer1::PluginFormat data_format_;
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle

From 99dffb91d668d70b7c110f76de70d9666c5dc7d4 Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Thu, 8 Nov 2018 20:20:33 +0800
Subject: [PATCH 29/50] allow to repeatedly share and update BuildStrategy

test=develop
---
 paddle/fluid/framework/details/build_strategy.cc | 16 ++++++++++------
 paddle/fluid/framework/details/build_strategy.h  |  4 +++-
 paddle/fluid/pybind/pybind.cc                    |  9 ++++++---
 .../fluid/tests/unittests/test_pass_builder.py   |  2 +-
 4 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 48f94a1f05..132725fa7e 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -79,9 +79,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
   BuildStrategy strategy_;
 };
 
-std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy()
-    const {
+std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
+    bool from_user) const {
+  if (finalized_by_user_) {
+    return pass_builder_;
+  }
   pass_builder_.reset(new ParallelExecutorPassBuilder(*this));
+  if (from_user) {
+    finalized_by_user_ = true;
+  }
   return pass_builder_;
 }
 
@@ -95,10 +101,8 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
 #else
     const bool use_cuda) const {
 #endif
-  // Create a default one if not initialized by user.
-  if (!pass_builder_) {
-    CreatePassesFromStrategy();
-  }
+  // Create a default one if not finalized by user.
+  CreatePassesFromStrategy(false);
 
   std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
 
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 6c7b54db8f..e9deebd504 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -80,7 +80,8 @@ struct BuildStrategy {
   // from python side.
   // A new PassBuilder is created based on configs defined above and
   // passes are owned by the PassBuilder.
-  std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy() const;
+  std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy(
+      bool from_user) const;
 
   // Apply the passes built by the pass_builder_. The passes will be
   // applied to the Program and output an ir::Graph.
@@ -97,6 +98,7 @@ struct BuildStrategy {
 #endif
 
  private:
+  mutable bool finalized_by_user_ = false;
   mutable std::shared_ptr<ir::PassBuilder> pass_builder_;
 };
 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 238cc19189..b7776df904 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -855,10 +855,13 @@ All parameter, weight, gradient are variables in Paddle.
R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether to fuse elementwise_add_op and activation_op, it may make the execution faster. Default False)DOC") - .def("_create_passes_from_strategy", + .def("_finalize_strategy_and_create_passes", [](BuildStrategy &self) -> std::shared_ptr { - return self.CreatePassesFromStrategy(); - }); + return self.CreatePassesFromStrategy(true); + }, + R"DOC(Allow user to customized passes. Normally model-specific + optimization passes should be defined in this way. BuildStrategy + cannot be updated after being finalized.)DOC"); pe.def(py::init &, const std::unordered_set &, diff --git a/python/paddle/fluid/tests/unittests/test_pass_builder.py b/python/paddle/fluid/tests/unittests/test_pass_builder.py index 288c5f6a1f..65ad63dc01 100644 --- a/python/paddle/fluid/tests/unittests/test_pass_builder.py +++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py @@ -94,7 +94,7 @@ class TestPassBuilder(unittest.TestCase): def test_parallel_testing_with_new_strategy(self): build_strategy = fluid.BuildStrategy() - pass_builder = build_strategy._create_passes_from_strategy() + pass_builder = build_strategy._finalize_strategy_and_create_passes() origin_len = len(pass_builder.all_passes()) viz_pass = pass_builder.append_pass("graph_viz_pass") From 759ffca42330f40a5655dae304faa3d9057bc004 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 13 Nov 2018 13:15:12 +0800 Subject: [PATCH 30/50] some improvements test=develop --- paddle/fluid/framework/details/build_strategy.cc | 8 ++++---- paddle/fluid/framework/details/build_strategy.h | 11 +++++++++-- paddle/fluid/pybind/pybind.cc | 7 +++++++ 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 132725fa7e..37202f8695 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -80,13 +80,13 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { }; std::shared_ptr BuildStrategy::CreatePassesFromStrategy( - bool from_user) const { - if (finalized_by_user_) { + bool finalize_strategy) const { + if (is_finalized_) { return pass_builder_; } pass_builder_.reset(new ParallelExecutorPassBuilder(*this)); - if (from_user) { - finalized_by_user_ = true; + if (finalize_strategy) { + is_finalized_ = true; } return pass_builder_; } diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index e9deebd504..fc2641dbd4 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -75,13 +75,20 @@ struct BuildStrategy { bool remove_unnecessary_lock_{false}; + // NOTE: + // Before you add new options, think if it's a general strategy that works + // with other strategy. If not, the strategy should be created through + // CreatePassesFromStrategy and the pass can be managed separately. + // User normally doesn't need to call this API. // The PassBuilder allows for more customized insert, remove of passes // from python side. // A new PassBuilder is created based on configs defined above and // passes are owned by the PassBuilder. std::shared_ptr CreatePassesFromStrategy( - bool from_user) const; + bool finalize_strategy) const; + + bool IsFinalized() const { return is_finalized_; } // Apply the passes built by the pass_builder_. The passes will be // applied to the Program and output an ir::Graph. 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index b7776df904..68b80c6311 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -791,6 +791,7 @@ All parameter, weight, gradient are variables in Paddle.
           "reduce_strategy",
           [](const BuildStrategy &self) { return self.reduce_; },
           [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
             self.reduce_ = strategy;
           },
           R"DOC(The type is STR, there are two reduce strategies in ParallelExecutor,
@@ -804,6 +805,7 @@ All parameter, weight, gradient are variables in Paddle.
           [](const BuildStrategy &self) { return self.gradient_scale_; },
           [](BuildStrategy &self,
              BuildStrategy::GradientScaleStrategy strategy) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
             self.gradient_scale_ = strategy;
           },
           R"DOC(The type is STR, there are three ways of defining :math:`loss@grad` in
@@ -815,6 +817,7 @@ All parameter, weight, gradient are variables in Paddle.
           "debug_graphviz_path",
           [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
           [](BuildStrategy &self, const std::string &path) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
             self.debug_graphviz_path_ = path;
           },
           R"DOC(The type is STR, debug_graphviz_path indicate the path that
@@ -824,6 +827,7 @@ All parameter, weight, gradient are variables in Paddle.
           "enable_data_balance",
           [](const BuildStrategy &self) { return self.enable_data_balance_; },
           [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
             self.enable_data_balance_ = b;
           })  // FIXME(chengudo): enable_data_balance seems not important
       .def_property(
@@ -832,6 +836,7 @@ All parameter, weight, gradient are variables in Paddle.
             return self.enable_sequential_execution_;
           },
           [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
             self.enable_sequential_execution_ = b;
           },
           R"DOC(The type is BOOL. If set True, the execution order of ops would be the same as what is in the program. Default False.)DOC")
@@ -841,6 +846,7 @@ All parameter, weight, gradient are variables in Paddle.
             return self.remove_unnecessary_lock_;
           },
           [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
             self.remove_unnecessary_lock_ = b;
           },
           R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default False.)DOC")
@@ -850,6 +856,7 @@ All parameter, weight, gradient are variables in Paddle.
             return self.fuse_elewise_add_act_ops_;
           },
           [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
             self.fuse_elewise_add_act_ops_ = b;
           },
           R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether

From ea3538d8ddc7fb6df7559697614751fb4683cd53 Mon Sep 17 00:00:00 2001
From: baojun-nervana
Date: Mon, 12 Nov 2018 10:13:32 -0800
Subject: [PATCH 31/50] Added fused operator

test=develop
---
 paddle/fluid/framework/CMakeLists.txt     |   8 +-
 paddle/fluid/framework/executor.cc        |  22 ++-
 paddle/fluid/framework/ngraph_bridge.cc   |  39 ++++
 paddle/fluid/framework/ngraph_bridge.h    |  58 ++++++
 paddle/fluid/framework/ngraph_operator.cc | 216 ++++++++++++++++++++++
 paddle/fluid/framework/ngraph_operator.h  |  72 ++++++++
 python/paddle/fluid/__init__.py           |   8 +-
 7 files changed, 416 insertions(+), 7 deletions(-)
 create mode 100644 paddle/fluid/framework/ngraph_bridge.cc
 create mode 100644 paddle/fluid/framework/ngraph_bridge.h
 create mode 100644 paddle/fluid/framework/ngraph_operator.cc
 create mode 100644 paddle/fluid/framework/ngraph_operator.h

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 8442911406..50e0677c21 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -136,6 +136,10 @@ cc_library(version SRCS version.cc)
 cc_test(version_test SRCS version_test.cc DEPS version)
 
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
+cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto)
+cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
+  shape_inference data_transform lod_tensor profiler)
+
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 
@@ -163,10 +167,10 @@ if(WITH_DISTRIBUTE)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
-  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator)
   cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
-
+
 if (NOT WIN32)
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
         threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index fc6b325286..7c9c8331e2 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/ngraph_operator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/operators/detail/macros.h"
@@ -25,6 +26,7 @@ limitations under the License. */
 DECLARE_bool(benchmark);
 DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
+DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
 
 namespace paddle {
 namespace framework {
@@ -81,6 +83,24 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
   }
 }
 
+static void EnableFusedOp(ExecutorPrepareContext* ctx) {
+#ifdef PADDLE_WITH_NGRAPH
+  VLOG(3) << "use_ngraph=True";
+  auto intervals = FusedOperator::FusedOpIntervals(&ctx->ops_);
+  for (auto& interval : intervals) {
+    auto* fused_op = new FusedOperator(ctx->prog_, ctx->block_id_,
+                                       interval.at(0), interval.at(1));
+    *interval[0] = std::unique_ptr<OperatorBase>(fused_op);
+  }
+  for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
+    ctx->ops_.erase(it->at(0) + 1, it->at(1));
+  }
+#else
+  LOG(WARNING)
+      << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option";
+#endif
+}
+
 Executor::Executor(const platform::Place& place) : place_(place) {}
 
 void Executor::Close() {
@@ -338,6 +358,7 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
   for (auto& op_desc : block.AllOps()) {
     ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
   }
+  if (FLAGS_use_ngraph) EnableFusedOp(ctx.get());
   return ctx;
 }
 
@@ -485,6 +506,5 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) {
       << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
 #endif
 }
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc
new file mode 100644
index 0000000000..8177436d0b
--- /dev/null
+++ b/paddle/fluid/framework/ngraph_bridge.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_NGRAPH
+#include
+#include
+
+#include "paddle/fluid/framework/ngraph_bridge.h"
+
+#include "ngraph/ngraph.hpp"
+
+namespace paddle {
+namespace framework {
+
+std::map<std::string,
+         std::function<void(const std::shared_ptr<OperatorBase>&,
+                            std::shared_ptr<std::unordered_map<
+                                std::string, std::shared_ptr<ngraph::Node>>>)>>
+    NgraphBridge::NG_NODE_MAP = {};
+
+void NgraphBridge::build_graph(const std::shared_ptr<OperatorBase>& op) {
+  auto& op_type = op->Type();
+  NG_NODE_MAP[op_type](op, ngb_node_map);
+}
+
+}  // namespace framework
+}  // namespace paddle
+#endif
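[Editor's note — illustration, not part of the patch] NG_NODE_MAP is the
bridge's single dispatch point: it maps a fluid op type to a callback that
emits the matching nGraph node into the shared var->node map. The patch adds
the machinery but registers no ops yet. A hypothetical entry could look like
this (the op choice and the ngraph::op::Relu API are assumptions):

    // Sketch: register a builder for "relu", assuming one input "X" and one
    // output "Out", and that nGraph exposes ngraph::op::Relu.
    NgraphBridge::NG_NODE_MAP["relu"] =
        [](const std::shared_ptr<OperatorBase>& op,
           std::shared_ptr<std::unordered_map<
               std::string, std::shared_ptr<ngraph::Node>>> node_map) {
          auto x = node_map->at(op->Input("X"));
          (*node_map)[op->Output("Out")] =
              std::make_shared<ngraph::op::Relu>(x);
        };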
diff --git a/paddle/fluid/framework/ngraph_bridge.h b/paddle/fluid/framework/ngraph_bridge.h
new file mode 100644
index 0000000000..55bf0d21f3
--- /dev/null
+++ b/paddle/fluid/framework/ngraph_bridge.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_NGRAPH
+
+#include
+#include
+#include
+#include
+#include
+
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#include "ngraph/ngraph.hpp"
+
+namespace paddle {
+namespace framework {
+
+class NgraphBridge {
+ public:
+  static std::map<
+      std::string,
+      std::function<void(const std::shared_ptr<OperatorBase>&,
+                         std::shared_ptr<std::unordered_map<
+                             std::string, std::shared_ptr<ngraph::Node>>>)>>
+      NG_NODE_MAP;
+
+  explicit NgraphBridge(
+      std::shared_ptr<
+          std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+          var_node_map)
+      : ngb_node_map(var_node_map) {}
+
+  void build_graph(const std::shared_ptr<OperatorBase>& op);
+
+ private:
+  std::shared_ptr<
+      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+      ngb_node_map;
+};
+
+}  // namespace framework
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc
new file mode 100644
index 0000000000..70e6f97b4c
--- /dev/null
+++ b/paddle/fluid/framework/ngraph_operator.cc
@@ -0,0 +1,216 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_NGRAPH
+#include
+
+#include
+#include
+
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/ngraph_operator.h"
+#include "paddle/fluid/framework/shape_inference.h"
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/framework/var_type.h"
+
+namespace paddle {
+namespace framework {
+
+static std::map<proto::VarType::Type, ngraph::element::Type> pd2ng_type_map = {
+    {proto::VarType::FP32, ngraph::element::f32},
+    {proto::VarType::FP64, ngraph::element::f64},
+    {proto::VarType::INT32, ngraph::element::i32},
+    {proto::VarType::INT64, ngraph::element::i64},
+    {proto::VarType::BOOL, ngraph::element::boolean},
+};
+
+class NgraphOperator {
+ public:
+  explicit NgraphOperator(const Scope& scope, const platform::Place& place,
+                          const std::vector<std::shared_ptr<OperatorBase>>& ops,
+                          const std::unordered_map<
+                              std::string, ngraph::element::Type>& var_type_map,
+                          const std::unordered_set<std::string>& persist,
+                          const std::unordered_set<std::string>& fetches,
+                          const std::unordered_set<std::string>& post_op_inputs,
+                          int is_test_or_train)
+      : scope(scope),
+        place(place),
+        fused_ops(ops),
+        var_type_map(var_type_map),
+        persistables(persist),
+        fetches(fetches),
+        post_op_inputs(post_op_inputs),
+        is_test_or_train(is_test_or_train) {}
+
+  void Run(const Scope& scope, const platform::Place& place) const;
+
+ private:
+  static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
+      func_cache;
+  const Scope& scope;
+  const platform::Place& place;
+  std::vector<std::shared_ptr<OperatorBase>> fused_ops;
+  std::unordered_map<std::string, ngraph::element::Type> var_type_map;
+  std::unordered_set<std::string> persistables;
+  std::unordered_set<std::string> fetches;
+  std::unordered_set<std::string> post_op_inputs;
+  // 0 = default; 1 = (is_test && not is_complete)
+  // 2 = (is_test && is_complete)
+  // 3 = (is_training && not is_complete)
+  // 4 = (is_training && is_complete)
+  int is_test_or_train;
+};
+
+std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
+FusedOperator::FusedOpIntervals(
+    std::vector<std::unique_ptr<OperatorBase>>* ops) {
+  std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
+      intervals;
+  if (ops->empty()) {
+    return intervals;
+  }
+  size_t size = ops->size();
+  size_t left = 0;
+  while (left < size && ops->at(left)->Type() != kFeedOpType) {
+    ++left;
+  }
+  if (left == size) {
+    return intervals;
+  }
+  while (left < size && ops->at(left)->Type() == kFeedOpType) {
+    ++left;
+  }
+
+  size_t right = left;
+  while (right < size && ops->at(right)->Type() != kFetchOpType) {
+    ++right;
+  }
+  if (right == size) {
+    return intervals;
+  }
+  if (left >= right) return intervals;
+
+  // (left, right - 1) represents indices between feed and fetch
+  size_t pivot = left;
+  while (pivot < right) {
+    auto op_type = ops->at(pivot)->Type();
+    if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) ==
+        paddle::framework::NgraphBridge::NG_NODE_MAP.end()) {
+      ++pivot;
+    } else {
+      size_t start = pivot, end = start;
+      while (pivot < right &&
+             (paddle::framework::NgraphBridge::NG_NODE_MAP.find(
+                  ops->at(pivot)->Type()) !=
+              paddle::framework::NgraphBridge::NG_NODE_MAP.end())) {
+        ++pivot;
+        ++end;
+      }
+      std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>
+          interval = {ops->begin() + start, ops->begin() + end};
+      intervals.push_back(interval);
+    }
+  }  // end while
+
+  return intervals;
+}
+
+FusedOperator::FusedOperator(
+    const ProgramDesc& prog, size_t block_id,
+    std::vector<std::unique_ptr<OperatorBase>>::iterator start,
+    std::vector<std::unique_ptr<OperatorBase>>::iterator end,
+    const std::string& type, const VariableNameMap& inputs,
+    const VariableNameMap& outputs, const AttributeMap& attrs)
+    : OperatorBase(type, inputs, outputs, attrs), pdesc(prog), block(block_id) {
+  for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = start;
+       it != end; ++it) {
+    fused_ops.push_back(std::move(*it));
+  }
+
+  for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = end;
+       (*it)->Type() != kFetchOpType; ++it) {
+    for (auto& var_name_item : (*it)->Inputs()) {
+      for (auto& var_name : var_name_item.second) {
+        post_op_inputs.insert(var_name);
+      }
+    }
+  }
+
+  if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) {
+    is_complete = true;
+  }
+
+  process();
+}
+
+void FusedOperator::process() {
+  auto& bdesc = pdesc.Block(block);
+  for (auto& var : bdesc.AllVars()) {
+    if (!(var->GetType() == proto::VarType::SELECTED_ROWS ||
+          var->GetType() == proto::VarType::LOD_TENSOR ||
+          var->GetType() == proto::VarType::LOD_TENSOR_ARRAY)) {
+      continue;
+    }
+
+    auto var_name = var->Name();
+    if (var->Name() == framework::kEmptyVarName) {
+      continue;
+    }
+
+    if (var_name != "fetch" && var_name != "feed") {
+      auto pd_type = var->GetDataType();
+      if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) {
+        PADDLE_THROW("Data type of var %s not found in pd2ng_type_map",
+                     var_name);
+      }
+      var_type_map[var_name] = pd2ng_type_map[pd_type];
+    }
+
+    if (var->Persistable()) {
+      persistables.insert(var->Name());
+    }
+  }
+
+  for (auto* op : bdesc.AllOps()) {
+    if (op->Type() == kFetchOpType) {
+      std::string fetch_target_name = op->Input("X")[0];
+      fetches.insert(fetch_target_name);
+    }
+  }
+}
+
+void FusedOperator::RunImpl(const Scope& scope,
+                            const platform::Place& place) const {
+  int is_test_or_train = 1;
+  auto& bdesc = pdesc.Block(block);
+  for (auto* op : bdesc.AllOps()) {
+    if (op->Type().find("_grad") != std::string::npos) {
+      is_test_or_train = 3;
+      break;
+    }
+  }
+
+  if (is_complete) {
+    is_test_or_train = is_test_or_train == 1 ? 2 : 4;
+  }
+
+  NgraphOperator ngraph_op(scope, place, fused_ops, var_type_map, persistables,
+                           fetches, post_op_inputs, is_test_or_train);
+  ngraph_op.Run(scope, place);
+}
+
+}  // namespace framework
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h
new file mode 100644
index 0000000000..eb77c78115
--- /dev/null
+++ b/paddle/fluid/framework/ngraph_operator.h
@@ -0,0 +1,72 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_NGRAPH
+
+#include
+#include
+#include
+#include
+#include
+
+#include "paddle/fluid/framework/attribute.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/ngraph_bridge.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/variant.h"
+
+#include "ngraph/ngraph.hpp"
+
+namespace paddle {
+namespace framework {
+
+class FusedOperator : public OperatorBase {
+ public:
+  static std::vector<
+      std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
+  FusedOpIntervals(
+      std::vector<std::unique_ptr<OperatorBase>>* ops);
+
+  explicit FusedOperator(
+      const ProgramDesc& prog, size_t block_id,
+      std::vector<std::unique_ptr<OperatorBase>>::iterator start,
+      std::vector<std::unique_ptr<OperatorBase>>::iterator end,
+      const std::string& type = "fused_op", const VariableNameMap& inputs = {},
+      const VariableNameMap& outputs = {}, const AttributeMap& attrs = {});
+
+  void RunImpl(const Scope& scope, const platform::Place& place) const final;
+
+ private:
+  const ProgramDesc pdesc;
+  size_t block;
+  std::vector<std::shared_ptr<OperatorBase>> fused_ops;
+  std::unordered_map<std::string, ngraph::element::Type> var_type_map;
+  std::unordered_set<std::string> persistables;
+  std::unordered_set<std::string> fetches;
+  std::unordered_set<std::string> post_op_inputs;
+  bool is_complete = false;
+
+  void process();
+};
+}  // namespace framework
+}  // namespace paddle
+#endif
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 0b997009bf..dd57a8aac2 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -112,10 +112,10 @@ def __bootstrap__():
 
     read_env_flags = [
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
-        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
-        'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        'dist_threadpool_size', 'cpu_deterministic', 'eager_delete_tensor_gb',
-        'reader_queue_speed_test_mode'
+        'eager_delete_scope', 'use_mkldnn', 'use_ngraph',
+        'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory',
+        'paddle_num_threads', 'dist_threadpool_size', 'cpu_deterministic',
+        'eager_delete_tensor_gb', 'reader_queue_speed_test_mode'
     ]
     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')
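[Editor's note — illustration, not part of the patch] FusedOpIntervals is the
heart of this patch: it scans the op list between the feed and fetch ops and
returns maximal runs of consecutive ops that all have an NG_NODE_MAP entry;
EnableFusedOp then collapses each run into one FusedOperator. A toy trace of
the grouping (op names and which ops are bridged are hypothetical):

    // ops:       feed feed relu relu mul dropout relu fetch
    // bridged:             yes  yes yes  no     yes
    // intervals:          [relu relu mul]      [relu]
    // result:    feed feed FusedOperator dropout FusedOperator fetch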
From d38fd6a0fcd754907ff17fe896651c5274c7f672 Mon Sep 17 00:00:00 2001
From: nhzlx
Date: Tue, 13 Nov 2018 08:23:26 +0000
Subject: [PATCH 32/50] add plugin support and offer a simple split sample

---
 paddle/fluid/inference/analysis/analyzer.cc   |  2 +-
 .../api/api_tensorrt_subgraph_engine.cc       |  1 +
 .../inference/tensorrt/convert/CMakeLists.txt |  7 +-
 .../inference/tensorrt/convert/split_op.cc    | 73 +++++++++++++++
 .../tensorrt/convert/test_split_op.cc         | 53 +++++++++++
 paddle/fluid/inference/tensorrt/engine.cc     |  6 ++
 paddle/fluid/inference/tensorrt/engine.h      |  5 +
 .../inference/tensorrt/plugin/CMakeLists.txt  |  3 +-
 .../tensorrt/plugin/plugin_factory.cc         | 64 -------------
 .../tensorrt/plugin/plugin_factory.h          | 91 -------------------
 .../inference/tensorrt/plugin/plugin_utils.cc | 37 --------
 .../inference/tensorrt/plugin/plugin_utils.h  | 34 -------
 .../plugin/{serialize.hpp => serialize.h}     |  0
 .../tensorrt/plugin/split_op_plugin.cu        | 70 ++++----------
 .../tensorrt/plugin/split_op_plugin.h         | 61 ++++++++-----
 .../inference/tensorrt/plugin/trt_plugin.cc   |  4 +-
 .../inference/tensorrt/plugin/trt_plugin.h    |  8 +-
 17 files changed, 208 insertions(+), 311 deletions(-)
 create mode 100644 paddle/fluid/inference/tensorrt/convert/split_op.cc
 create mode 100644 paddle/fluid/inference/tensorrt/convert/test_split_op.cc
 delete mode 100644 paddle/fluid/inference/tensorrt/plugin/plugin_factory.cc
 delete mode 100644 paddle/fluid/inference/tensorrt/plugin/plugin_factory.h
 delete mode 100644 paddle/fluid/inference/tensorrt/plugin/plugin_utils.cc
 delete mode 100644 paddle/fluid/inference/tensorrt/plugin/plugin_utils.h
 rename paddle/fluid/inference/tensorrt/plugin/{serialize.hpp => serialize.h} (100%)

diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index a3440cfc78..cd6636a7eb 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -71,7 +71,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
     std::unordered_set<std::string> teller_set(
         {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
          "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
-         "elementwise_add", "dropout"});
+         "elementwise_add", "dropout", "split"});
     if (!node->IsFunction()) return false;
 
     const auto* func = static_cast<const Function*>(node);
diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
index 94b3933497..eceab6e2be 100644
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
@@ -186,3 +186,4 @@ USE_TRT_CONVERTER(batch_norm);
 USE_TRT_CONVERTER(concat);
 USE_TRT_CONVERTER(dropout);
 USE_TRT_CONVERTER(pad);
+USE_TRT_CONVERTER(split);
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index e34d5db6b8..ed4c398cee 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,7 +1,8 @@
 # Add TRT tests
 nv_library(tensorrt_converter
   SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
-batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc pad_op.cc
+batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
+pad_op.cc split_op.cc
   DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
 
 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
@@ -28,6 +29,8 @@ nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc DEPS
         ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL)
 nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine dropout_op SERIAL)
-
 nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine pad_op SERIAL)
+nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_plugin
+split_op concat_op SERIAL)
diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc
new file mode 100644
index 0000000000..60d07859f3
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc
@@ -0,0 +1,73 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * SplitOp.
+ */
+class SplitOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(40) << "convert a fluid split op to tensorrt split layer";
+
+    framework::OpDesc op_desc(op, nullptr);
+    // Declare inputs
+    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
+    auto input_dims = input->getDimensions();
+    int input_num = op_desc.Input("X").size();
+    size_t output_num = op_desc.Output("Out").size();
+
+    PADDLE_ENFORCE(input_num == 1);
+    int axis = boost::get<int>(op_desc.GetAttr("axis"));
+    std::vector<int> output_lengths =
+        boost::get<std::vector<int>>(op_desc.GetAttr("sections"));
+    PADDLE_ENFORCE(axis != 0);
+    if (axis < 0) {
+      axis += input_dims.nbDims;
+    } else {
+      axis -= 1;
+    }
+
+    PADDLE_ENFORCE(output_lengths.size() == output_num);
+
+    SplitPlugin* plugin = new SplitPlugin(axis, output_lengths);
+    nvinfer1::IPluginLayer* layer =
+        engine_->addPlugin(&input, input_num, plugin);
+
+    std::string layer_name = "split (Output: ";
+    for (size_t i = 0; i < output_num; i++) {
+      auto output_name = op_desc.Output("Out")[i];
+      layer->getOutput(i)->setName(output_name.c_str());
+      engine_->SetITensor(output_name, layer->getOutput(i));
+      layer_name += output_name;
+      if (test_mode) {
+        engine_->DeclareOutput(output_name);
+      }
+    }
+    layer->setName((layer_name + ")").c_str());
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(split, SplitOpConverter);
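[Editor's note — illustration, not part of the patch] The axis arithmetic in
the converter above is about TensorRT's implicit batch dimension: a fluid
split on axis k of a batched tensor maps to axis k-1 inside TRT, since TRT
tensors exclude the batch dim. A worked example (values illustrative):

    // fluid input [N, C, H, W], split attr axis = 1 (channels)
    // TRT sees    [C, H, W]    -> plugin axis_ = 0
    // sections = {2, 1} on C = 3 yields outputs [2, H, W] and [1, H, W]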
diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
new file mode 100644
index 0000000000..f81d011552
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(split_op, test) {
+  std::unordered_set<std::string> parameters({""});
+  framework::Scope scope;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("split_input", nvinfer1::DimsCHW(3, 2, 2));
+  validator.DeclOutputVar("split_out1", nvinfer1::DimsCHW(2, 2, 2));
+  validator.DeclOutputVar("split_out2", nvinfer1::DimsCHW(1, 2, 2));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("split");
+  desc.SetInput("X", {"split_input"});
+  desc.SetOutput("Out", {"split_out1", "split_out2"});
+
+  int num = 0;
+  int axis = 1;
+  std::vector<int> output_lengths = {2, 1};
+  desc.SetAttr("axis", axis);
+  desc.SetAttr("num", num);
+  desc.SetAttr("sections", output_lengths);
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(1);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(split);
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 9e0f958447..426bf169bb 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -254,6 +254,12 @@ void TensorRTEngine::freshDeviceId() {
   cudaSetDevice(device_);
 }
 
+nvinfer1::IPluginLayer *TensorRTEngine::addPlugin(
+    nvinfer1::ITensor *const *inputs, int nbInputs, PluginTensorRT *plugin) {
+  owned_plugin_.emplace_back(plugin);
+  return infer_network_.get()->addPluginExt(inputs, nbInputs, *plugin);
+}
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 828181200e..216606a291 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/inference/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 
 namespace paddle {
@@ -125,6 +126,8 @@ class TensorRTEngine : public EngineBase {
   void SetRuntimeBatch(size_t batch_size);
   int GetRuntimeBatch();
   int GetDevice() { return device_; }
+  nvinfer1::IPluginLayer* addPlugin(nvinfer1::ITensor* const* inputs,
+                                    int nbInputs, PluginTensorRT*);
 
   // A pointer to CPU memory is needed of the TRT weight.
   // Before TRT runs, fluid loads weight into GPU storage.
@@ -164,8 +167,10 @@ class TensorRTEngine : public EngineBase {
   std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_;
   std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
       itensor_map_;
+
   // The specific GPU id that the TensorRTEngine bound to.
   int device_;
+  std::vector<std::unique_ptr<PluginTensorRT>> owned_plugin_;
 
   // TensorRT related internal members
   template <typename T>
diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
index 1b91c864c9..71b7a55161 100644
--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
@@ -1,2 +1 @@
-nv_library(tensorrt_plugin SRCS plugin_factory.cc plugin_utils.cc
-trt_plugin.cc split_op_plugin.cu DEPS enforce)
+nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu DEPS enforce)
diff --git a/paddle/fluid/inference/tensorrt/plugin/plugin_factory.cc b/paddle/fluid/inference/tensorrt/plugin/plugin_factory.cc
deleted file mode 100644
index 5ebcd44611..0000000000
--- a/paddle/fluid/inference/tensorrt/plugin/plugin_factory.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/tensorrt/plugin/plugin_factory.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
-                                                    const void* serial_data,
-                                                    size_t serial_length) {
-  size_t parsed_byte = 0;
-  std::string encoded_op_name =
-      ExtractOpName(serial_data, serial_length, &parsed_byte);
-
-  if (!IsPlugin(encoded_op_name)) {
-    return nullptr;
-  }
-
-  auto plugin_ptr =
-      plugin_registry_[encoded_op_name].first(serial_data, serial_length);
-  owned_plugins_.emplace_back(plugin_ptr);
-
-  return plugin_ptr;
-}
-
-PluginTensorRT* PluginFactoryTensorRT::CreatePlugin(
-    const std::string& op_name) {
-  if (!IsPlugin(op_name)) return nullptr;
-
-  auto plugin_ptr = plugin_registry_[op_name].second();
-  owned_plugins_.emplace_back(plugin_ptr);
-
-  return plugin_ptr;
-}
-
-bool PluginFactoryTensorRT::RegisterPlugin(
-    const std::string& op_name, PluginDeserializeFunc deserialize_func,
-    PluginConstructFunc construct_func) {
-  if (IsPlugin(op_name)) return false;
-
-  auto ret = plugin_registry_.emplace(
-      op_name, std::make_pair(deserialize_func, construct_func));
-
-  return ret.second;
-}
-
-void PluginFactoryTensorRT::DestroyPlugins() { owned_plugins_.clear(); }
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/plugin_factory.h b/paddle/fluid/inference/tensorrt/plugin/plugin_factory.h
deleted file mode 100644
index 00435766f7..0000000000
--- a/paddle/fluid/inference/tensorrt/plugin/plugin_factory.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include
-#include
-
-#include "NvInfer.h"
-#include "paddle/fluid/inference/tensorrt/plugin/plugin_utils.h"
-#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-class PluginFactoryTensorRT : public nvinfer1::IPluginFactory {
- public:
-  static PluginFactoryTensorRT* GetInstance() {
-    static PluginFactoryTensorRT* factory_instance =
-        new PluginFactoryTensorRT();
-    return factory_instance;
-  }
-
-  // Deserialization method
-  PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data,
-                               size_t serial_length) override;
-
-  // Plugin construction, PluginFactoryTensorRT owns the plugin.
-  PluginTensorRT* CreatePlugin(const std::string& op_name);
-
-  bool RegisterPlugin(const std::string& op_name,
-                      PluginDeserializeFunc deserialize_func,
-                      PluginConstructFunc construct_func);
-
-  bool IsPlugin(const std::string& op_name) {
-    return plugin_registry_.find(op_name) != plugin_registry_.end();
-  }
-
-  size_t CountOwnedPlugins() { return owned_plugins_.size(); }
-
-  void DestroyPlugins();
-
- protected:
-  std::unordered_map<std::string,
-                     std::pair<PluginDeserializeFunc, PluginConstructFunc>>
-      plugin_registry_;
-  std::vector<std::unique_ptr<PluginTensorRT>> owned_plugins_;
-};
-
-class TrtPluginRegistrar {
- public:
-  TrtPluginRegistrar(const std::string& name,
-                     PluginDeserializeFunc deserialize_func,
-                     PluginConstructFunc construct_func) {
-    auto factory = PluginFactoryTensorRT::GetInstance();
-    // platform::PADDLE_ENFORCE(factory->RegisterPlugin(name, deserialize_func,
-    // construct_func), "Failed to register plugin [%s]", name);
-    // platform::PADDLE_ENFORCE(factory->RegisterPlugin(name, deserialize_func,
-    // construct_func));
-    factory->RegisterPlugin(name, deserialize_func, construct_func);
-  }
-};
-
-#define REGISTER_TRT_PLUGIN(name, deserialize_func, construct_func)    \
-  REGISTER_TRT_PLUGIN_UNIQ_HELPER(__COUNTER__, name, deserialize_func, \
-                                  construct_func)
-#define REGISTER_TRT_PLUGIN_UNIQ_HELPER(ctr, name, deserialize_func, \
-                                        construct_func)              \
-  REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func, construct_func)
-#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func, construct_func) \
-  static ::paddle::inference::tensorrt::TrtPluginRegistrar                    \
-      trt_plugin_registrar##ctr __attribute__((unused)) =                     \
-          ::paddle::inference::tensorrt::TrtPluginRegistrar(                  \
-              name, deserialize_func, construct_func)
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/plugin_utils.cc b/paddle/fluid/inference/tensorrt/plugin/plugin_utils.cc
deleted file mode 100644
index 2cc4162aa7..0000000000
--- a/paddle/fluid/inference/tensorrt/plugin/plugin_utils.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/inference/tensorrt/plugin/plugin_utils.h" -#include - -namespace paddle { -namespace inference { -namespace tensorrt { - -std::string ExtractOpName(const void* serial_data, size_t serial_length, - size_t* incremental) { - size_t op_name_char_count = *static_cast(serial_data); - *incremental = sizeof(size_t) + op_name_char_count; - - assert(serial_length >= *incremental); - - const char* buffer = static_cast(serial_data) + sizeof(size_t); - std::string op_name(buffer, op_name_char_count); - - return op_name; -} - -} // namespace tensorrt -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/plugin_utils.h b/paddle/fluid/inference/tensorrt/plugin/plugin_utils.h deleted file mode 100644 index fb6608c12a..0000000000 --- a/paddle/fluid/inference/tensorrt/plugin/plugin_utils.h +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "NvInfer.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" - -namespace paddle { -namespace inference { -namespace tensorrt { - -typedef std::function - PluginDeserializeFunc; -typedef std::function PluginConstructFunc; - -std::string ExtractOpName(const void* serial_data, size_t serial_length, - size_t* incremental); - -} // namespace tensorrt -} // namespace inference -} // namespze paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/serialize.hpp b/paddle/fluid/inference/tensorrt/plugin/serialize.h similarity index 100% rename from paddle/fluid/inference/tensorrt/plugin/serialize.hpp rename to paddle/fluid/inference/tensorrt/plugin/serialize.h diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 044c229b55..ed43c4d435 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" @@ -19,8 +20,6 @@ namespace paddle { namespace inference { namespace tensorrt { -SplitPlugin* CreateSplitPlugin() { return new SplitPlugin(); }; - nvinfer1::Dims SplitPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputDims, int nbInputs) { @@ -28,15 +27,16 @@ nvinfer1::Dims SplitPlugin::getOutputDimensions(int index, assert(index < this->getNbOutputs()); nvinfer1::Dims const& input_dims = inputDims[0]; nvinfer1::Dims output_dims = input_dims; - output_dims.d[axis_] = output_lenght_.at(index); + output_dims.d[axis_] = output_length_.at(index); return output_dims; } int SplitPlugin::initialize() { std::vector segment_offsets(1, 0); for (int i = 0; i < this->getNbOutputs(); ++i) { - segment_offsets.push_back(segment_offsets.back() + output_lenght_[i]); + segment_offsets.push_back(segment_offsets.back() + output_length_[i]); } + segment_offsets_ = segment_offsets; d_segment_offsets_ = segment_offsets; nvinfer1::Dims dims = this->getInputDims(0); nx_ = 1; @@ -51,60 +51,30 @@ int SplitPlugin::initialize() { return 0; } -template -__device__ int upper_bound(T const* vals, int n, T const& key) { - int i = 0; - while (n > 0) { - int m = n / 2; - int j = i + m; - if (!(key < vals[j])) { - i = j + 1; - n -= m + 1; - } else { - n = m; - } - } - return i; -} - -template -__global__ void split_kernel(int nsegment, - int const* __restrict__ segment_offsets, - T const* __restrict__ idata, T* const* odatas, - int nx, int srcny_, int nz) { - int x0 = threadIdx.x + blockIdx.x * blockDim.x; - int src_y0 = threadIdx.y + blockIdx.y * blockDim.y; - int z0 = threadIdx.z + blockIdx.z * blockDim.z; - for (int z = z0; z < nz; z += blockDim.z * gridDim.z) { - for (int src_y = src_y0; src_y < srcny_; src_y += blockDim.y * gridDim.y) { - for (int x = x0; x < nx; x += blockDim.x * gridDim.x) { - int segment = upper_bound(segment_offsets, nsegment, src_y) - 1; - int dst_y = src_y - segment_offsets[segment]; - int dstny_ = segment_offsets[segment + 1] - segment_offsets[segment]; - odatas[segment][x + nx * (dst_y + dstny_ * z)] = - idata[x + nx * (src_y + srcny_ * z)]; - } - } - } -} - int SplitPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) { auto const& input_dims = this->getInputDims(0); + int input_size = 0; int const* d_segment_offsets_ptr = thrust::raw_pointer_cast(&d_segment_offsets_[0]); float const* idata = reinterpret_cast(inputs[0]); float** odatas = reinterpret_cast(outputs); - int nz = nz_ * batchSize; - dim3 block(32, 16); - dim3 grid(std::min((nx_ - 1) / block.x + 1, 65535u), - std::min((ny_ - 1) / block.y + 1, 65535u), - std::min((nz_ - 1) / block.z + 1, 65535u)); - - split_kernel<<>>(d_segment_offsets_.size(), - d_segment_offsets_ptr, idata, odatas, - nx_, ny_, nz); + // kernel impl here. 
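+  // Descriptive note: instead of a custom CUDA kernel, each output i is
+  // filled with one cudaMemcpyAsync per batch sample. A sample occupies
+  // inputBatchOffset floats, and output i takes the
+  // (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ floats that start
+  // at offset segment_offsets_[i] * nx_ within that sample.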
+ int inputBatchOffset = nx_ * ny_ * nz_; + for (size_t i = 0; i < this->getNbOutputs(); i++) { + for (size_t j = 0; j < batchSize; j++) { + cudaMemcpyAsync( + odatas[i] + + j * (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * + sizeof(float), + inputs[0] + + (inputBatchOffset * j + segment_offsets_[i] * nx_) * + sizeof(float), + (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * sizeof(float), + cudaMemcpyDeviceToDevice, stream); + } + } return cudaGetLastError() != cudaSuccess; } diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index 406c822bb5..59be609111 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -1,8 +1,21 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #pragma once -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" namespace paddle { namespace inference { @@ -10,53 +23,55 @@ namespace tensorrt { class SplitPlugin : public PluginTensorRT { int axis_; - std::vector output_lenght_; + std::vector output_length_; int nx_, ny_, nz_; thrust::device_vector d_segment_offsets_; + std::vector segment_offsets_; protected: virtual size_t getSerializationSize() override { - return serialized_size(axis_) + serialized_size(output_lenght_) - + getBaseSerializationSize(); + return serialized_size(axis_) + serialized_size(output_length_) + + getBaseSerializationSize(); } virtual void serialize(void *buffer) override { serializeBase(buffer); serialize_value(&buffer, axis_); - serialize_value(&buffer, output_lenght_); + serialize_value(&buffer, output_length_); } public: - Split() {} - SplitPlugin(void const* serialData, size_t serialLength) { + SplitPlugin(int axis, std::vector const &output_lengths) + : axis_(axis), output_length_(output_lengths) { + assert(axis <= nvinfer1::Dims::MAX_DIMS); + } + + SplitPlugin(void const *serialData, size_t serialLength) { deserializeBase(serialData, serialLength); deserialize_value(&serialData, &serialLength, &axis_); - deserialize_value(&serialData, &serialLength, &output_lenght_); + deserialize_value(&serialData, &serialLength, &output_length_); } - SplitPlugin* clone() const override { - return new SplitPlugin(axis_, output_lenght_); + SplitPlugin *clone() const override { + return new SplitPlugin(axis_, output_length_); } - virtual const char* getPluginType() const override { return "split"; } - virtual int getNbOutputs() const override { return output_lenght_.size(); } + virtual const char *getPluginType() const override { return "split"; } + virtual int getNbOutputs() const override { return output_length_.size(); } virtual nvinfer1::Dims getOutputDimensions(int index, - const nvinfer1::Dims *inputs, int nbInputDims) override; + const nvinfer1::Dims *inputs, + int nbInputDims) override; virtual int initialize() 
override; - virtual int enqueue(int batchSize, - const void *const *inputs, void **outputs, + virtual int enqueue(int batchSize, const void *const *inputs, void **outputs, void *workspace, cudaStream_t stream) override; - void setAxis(int axis) { - axis_ = axis; - } + void setAxis(int axis) { axis_ = axis; } - void setOutputLengths(const std::vector & output_lengths) { + void setOutputLengths(const std::vector &output_lengths) { output_length_ = output_lengths; } - }; -} // tensorrt -} // inference -} // paddle +} // tensorrt +} // inference +} // paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc index 4eff6665d4..975a5ed162 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/plugin_utils.h" namespace paddle { namespace inference { @@ -41,8 +40,7 @@ size_t PluginTensorRT::getBaseSerializationSize() { bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const { - return ((type == nvinfer1::DataType::kFLOAT || - type == nvinfer1::DataType::kHALF) && + return ((type == nvinfer1::DataType::kFLOAT) && (format == nvinfer1::PluginFormat::kNCHW)); } diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index 8168646bde..44869b390f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -14,14 +14,14 @@ #pragma once -#include #include #include #include #include #include +#include "NvInfer.h" -#include "paddle/fluid/inference/tensorrt/plugin/serialize.hpp" +#include "paddle/fluid/inference/tensorrt/plugin/serialize.h" namespace paddle { namespace inference { @@ -53,8 +53,8 @@ class PluginTensorRT : public nvinfer1::IPluginExt { nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override; - virtual void serialize(void* buffer) override; - virtual size_t getSerializationSize() override; + virtual void serialize(void* buffer) = 0; + virtual size_t getSerializationSize() = 0; protected: void deserializeBase(void const*& serialData, size_t& serialLength); From 44ecf9a4816222df9fb673aa4ab9d4f74cb4acd3 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 13 Nov 2018 17:03:47 +0800 Subject: [PATCH 33/50] fix test=develop --- python/paddle/fluid/tests/unittests/test_dist_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 4b8a215190..97e7ee6229 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -105,7 +105,7 @@ class TestDistRunnerBase(object): build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce if args.batch_merge_repeat > 1: - pass_builder = build_stra._create_passes_from_strategy() + pass_builder = build_stra._finalize_strategy_and_create_passes() mypass = pass_builder.insert_pass( len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass") mypass.set_int("num_repeats", args.batch_merge_repeat) From d219818434cd7f3a952a4f597032a292c98a72da Mon Sep 17 00:00:00 2001 From: Dang Qingqing Date: Tue, 13 Nov 2018 13:27:43 +0800 Subject: [PATCH 34/50] Fix compiling in cuDNN 
v5. test=develop --- paddle/fluid/operators/conv_cudnn_op.cu.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 3083e622c3..3a4086274d 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -50,12 +50,18 @@ static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache"; static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = static_cast(1024) * 1024 * 1024; -static constexpr size_t kNUM_CUDNN_FWD_ALGS = - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT; +#if CUDNN_VERSION_MIN(6, 0, 5) +static constexpr size_t kNUM_CUDNN_FWD_ALGS = CUDNN_CONVOLUTION_FWD_ALGO_COUNT; static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT; static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT; +#else +// cuDNN v5 has no CUDNN_CONVOLUTION_FWD_ALGO_COUNT etc. +static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7; +static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4; +static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5; +#endif template class CUDNNConvOpKernel : public framework::OpKernel { From 0b96268057275615bce25b97708f9efd5c06bce1 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Tue, 13 Nov 2018 11:41:39 +0000 Subject: [PATCH 35/50] fix comments test=develop --- .../inference/tensorrt/convert/split_op.cc | 4 +- paddle/fluid/inference/tensorrt/engine.cc | 2 +- paddle/fluid/inference/tensorrt/engine.h | 2 +- .../inference/tensorrt/plugin/serialize.h | 52 +++++++++---------- .../tensorrt/plugin/split_op_plugin.cu | 3 -- .../tensorrt/plugin/split_op_plugin.h | 23 ++++---- .../inference/tensorrt/plugin/trt_plugin.cc | 20 +++---- .../inference/tensorrt/plugin/trt_plugin.h | 18 +++++-- 8 files changed, 64 insertions(+), 60 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 60d07859f3..12179cccc7 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -35,6 +35,7 @@ class SplitOpConverter : public OpConverter { int input_num = op_desc.Input("X").size(); size_t output_num = op_desc.Output("Out").size(); + // Get Attrs PADDLE_ENFORCE(input_num == 1); int axis = boost::get(op_desc.GetAttr("axis")); std::vector output_lengths = @@ -48,9 +49,10 @@ class SplitOpConverter : public OpConverter { PADDLE_ENFORCE(output_lengths.size() == output_num); + // SplitPlugin* plugin = new SplitPlugin(axis, output_lengths); nvinfer1::IPluginLayer* layer = - engine_->addPlugin(&input, input_num, plugin); + engine_->AddPlugin(&input, input_num, plugin); std::string layer_name = "split (Output: "; for (size_t i = 0; i < output_num; i++) { diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 426bf169bb..0e06a8f804 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -254,7 +254,7 @@ void TensorRTEngine::freshDeviceId() { cudaSetDevice(device_); } -nvinfer1::IPluginLayer *TensorRTEngine::addPlugin( +nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( nvinfer1::ITensor *const *inputs, int nbInputs, PluginTensorRT *plugin) { owned_plugin_.emplace_back(plugin); return infer_network_.get()->addPluginExt(inputs, nbInputs, *plugin); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 
216606a291..335acdf653 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -126,7 +126,7 @@ class TensorRTEngine : public EngineBase {
   void SetRuntimeBatch(size_t batch_size);
   int GetRuntimeBatch();
   int GetDevice() { return device_; }
-  nvinfer1::IPluginLayer* addPlugin(nvinfer1::ITensor* const* inputs,
+  nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
                                     int nbInputs, PluginTensorRT*);
 
   // A pointer to CPU memory is needed of the TRT weight.
diff --git a/paddle/fluid/inference/tensorrt/plugin/serialize.h b/paddle/fluid/inference/tensorrt/plugin/serialize.h
index 96df352feb..50c0b17d78 100644
--- a/paddle/fluid/inference/tensorrt/plugin/serialize.h
+++ b/paddle/fluid/inference/tensorrt/plugin/serialize.h
@@ -20,11 +20,11 @@
 #include
 
 template <typename T>
-inline void serialize_value(void** buffer, T const& value);
+inline void SerializeValue(void** buffer, T const& value);
 
 template <typename T>
-inline void deserialize_value(void const** buffer, size_t* buffer_size,
-                              T* value);
+inline void DeserializeValue(void const** buffer, size_t* buffer_size,
+                             T* value);
 
 namespace {
 
 template <typename T, class Enable = void>
 struct Serializer {};
 
 template <typename T>
 struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
                                              std::is_enum<T>::value ||
                                              std::is_pod<T>::value>::type> {
-  static size_t serialized_size(T const& value) { return sizeof(T); }
-  static void serialize(void** buffer, T const& value) {
-    ::memcpy(*buffer, &value, sizeof(T));
+  static size_t SerializedSize(T const& value) { return sizeof(T); }
+  static void Serialize(void** buffer, T const& value) {
+    std::memcpy(*buffer, &value, sizeof(T));
     reinterpret_cast<char*&>(*buffer) += sizeof(T);
   }
-  static void deserialize(void const** buffer, size_t* buffer_size, T* value) {
+  static void Deserialize(void const** buffer, size_t* buffer_size, T* value) {
     assert(*buffer_size >= sizeof(T));
-    ::memcpy(value, *buffer, sizeof(T));
+    std::memcpy(value, *buffer, sizeof(T));
     reinterpret_cast<char const*&>(*buffer) += sizeof(T);
     *buffer_size -= sizeof(T);
   }
 };
 
 template <>
 struct Serializer<const char*> {
-  static size_t serialized_size(const char* value) { return strlen(value) + 1; }
-  static void serialize(void** buffer, const char* value) {
-    ::strcpy(static_cast<char*>(*buffer), value);
+  static size_t SerializedSize(const char* value) { return strlen(value) + 1; }
+  static void Serialize(void** buffer, const char* value) {
+    std::strcpy(static_cast<char*>(*buffer), value);
     reinterpret_cast<char*&>(*buffer) += strlen(value) + 1;
   }
-  static void deserialize(void const** buffer, size_t* buffer_size,
+  static void Deserialize(void const** buffer, size_t* buffer_size,
                           const char** value) {
     *value = static_cast<const char*>(*buffer);
     size_t data_size = strnlen(*value, *buffer_size) + 1;
@@ -70,23 +70,23 @@ struct Serializer<std::vector<T>,
                   typename std::enable_if<std::is_arithmetic<T>::value ||
                                           std::is_enum<T>::value ||
                                           std::is_pod<T>::value>::type> {
-  static size_t serialized_size(std::vector<T> const& value) {
+  static size_t SerializedSize(std::vector<T> const& value) {
     return sizeof(value.size()) + value.size() * sizeof(T);
   }
-  static void serialize(void** buffer, std::vector<T> const& value) {
-    serialize_value(buffer, value.size());
+  static void Serialize(void** buffer, std::vector<T> const& value) {
+    SerializeValue(buffer, value.size());
     size_t nbyte = value.size() * sizeof(T);
-    ::memcpy(*buffer, value.data(), nbyte);
+    std::memcpy(*buffer, value.data(), nbyte);
     reinterpret_cast<char*&>(*buffer) += nbyte;
   }
-  static void deserialize(void const** buffer, size_t* buffer_size,
+  static void Deserialize(void const** buffer, size_t* buffer_size,
                           std::vector<T>* value) {
     size_t size;
-    deserialize_value(buffer, buffer_size, &size);
+    DeserializeValue(buffer, buffer_size, &size);
     value->resize(size);
     size_t nbyte = value->size() * sizeof(T);
     assert(*buffer_size >= nbyte);
-    ::memcpy(value->data(), *buffer, nbyte);
+    std::memcpy(value->data(), *buffer, nbyte);
     reinterpret_cast<char const*&>(*buffer) += nbyte;
     *buffer_size -= nbyte;
   }
 };
 
 }  // namespace
 
 template <typename T>
-inline size_t serialized_size(T const& value) {
-  return Serializer<T>::serialized_size(value);
+inline size_t SerializedSize(T const& value) {
+  return Serializer<T>::SerializedSize(value);
 }
 
 template <typename T>
-inline void serialize_value(void** buffer, T const& value) {
-  return Serializer<T>::serialize(buffer, value);
+inline void SerializeValue(void** buffer, T const& value) {
+  return Serializer<T>::Serialize(buffer, value);
 }
 
 template <typename T>
-inline void deserialize_value(void const** buffer, size_t* buffer_size,
-                              T* value) {
-  return Serializer<T>::deserialize(buffer, buffer_size, value);
+inline void DeserializeValue(void const** buffer, size_t* buffer_size,
+                             T* value) {
+  return Serializer<T>::Deserialize(buffer, buffer_size, value);
 }
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
index ed43c4d435..bd6a44dcc1 100644
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
@@ -37,7 +37,6 @@ int SplitPlugin::initialize() {
     segment_offsets.push_back(segment_offsets.back() + output_length_[i]);
   }
   segment_offsets_ = segment_offsets;
-  d_segment_offsets_ = segment_offsets;
   nvinfer1::Dims dims = this->getInputDims(0);
   nx_ = 1;
   for (int i = dims.nbDims - 1; i > axis_; --i) {
@@ -55,8 +54,6 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
                          void** outputs, void* workspace, cudaStream_t stream) {
   auto const& input_dims = this->getInputDims(0);
   int input_size = 0;
-  int const* d_segment_offsets_ptr =
-      thrust::raw_pointer_cast(&d_segment_offsets_[0]);
   float const* idata = reinterpret_cast<float const*>(inputs[0]);
   float** odatas = reinterpret_cast<float**>(outputs);
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
index 59be609111..7281e40c33 100644
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
@@ -14,7 +14,6 @@
 
 #pragma once
 
-#include <thrust/device_vector.h>
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 
 namespace paddle {
@@ -25,19 +24,21 @@ class SplitPlugin : public PluginTensorRT {
   int axis_;
   std::vector<int> output_length_;
   int nx_, ny_, nz_;
-  thrust::device_vector<int> d_segment_offsets_;
   std::vector<int> segment_offsets_;
 
  protected:
   virtual size_t getSerializationSize() override {
-    return serialized_size(axis_) + serialized_size(output_length_) +
-           getBaseSerializationSize();
+    return SerializedSize(axis_) + SerializedSize(output_length_) +
+           getBaseSerializationSize();
   }
 
+  // TRT will call this func when we need to serialize the configuration of
+  // tensorrt.
+  // It should not be called by users.
   virtual void serialize(void *buffer) override {
     serializeBase(buffer);
-    serialize_value(&buffer, axis_);
-    serialize_value(&buffer, output_length_);
+    SerializeValue(&buffer, axis_);
+    SerializeValue(&buffer, output_length_);
   }
 
  public:
@@ -46,10 +47,12 @@ class SplitPlugin : public PluginTensorRT {
     assert(axis <= nvinfer1::Dims::MAX_DIMS);
   }
 
+  // It was used for tensorrt deserialization.
+  // It should not be called by users.
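+  // (Illustrative round trip, assuming a configured SplitPlugin* p:
+  //    std::string buf(p->getSerializationSize(), '\0');
+  //    void* ptr = &buf[0];
+  //    p->serialize(ptr);
+  //    SplitPlugin restored(buf.data(), buf.size());)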
SplitPlugin(void const *serialData, size_t serialLength) { deserializeBase(serialData, serialLength); - deserialize_value(&serialData, &serialLength, &axis_); - deserialize_value(&serialData, &serialLength, &output_length_); + DeserializeValue(&serialData, &serialLength, &axis_); + DeserializeValue(&serialData, &serialLength, &output_length_); } SplitPlugin *clone() const override { @@ -64,12 +67,6 @@ class SplitPlugin : public PluginTensorRT { virtual int initialize() override; virtual int enqueue(int batchSize, const void *const *inputs, void **outputs, void *workspace, cudaStream_t stream) override; - - void setAxis(int axis) { axis_ = axis; } - - void setOutputLengths(const std::vector &output_lengths) { - output_length_ = output_lengths; - } }; } // tensorrt diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc index 975a5ed162..08016d84b1 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc @@ -19,23 +19,23 @@ namespace inference { namespace tensorrt { void PluginTensorRT::serializeBase(void*& buffer) { - serialize_value(&buffer, input_dims_); - serialize_value(&buffer, max_batch_size_); - serialize_value(&buffer, data_type_); - serialize_value(&buffer, data_format_); + SerializeValue(&buffer, input_dims_); + SerializeValue(&buffer, max_batch_size_); + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, data_format_); } void PluginTensorRT::deserializeBase(void const*& serialData, size_t& serialLength) { - deserialize_value(&serialData, &serialLength, &input_dims_); - deserialize_value(&serialData, &serialLength, &max_batch_size_); - deserialize_value(&serialData, &serialLength, &data_type_); - deserialize_value(&serialData, &serialLength, &data_format_); + DeserializeValue(&serialData, &serialLength, &input_dims_); + DeserializeValue(&serialData, &serialLength, &max_batch_size_); + DeserializeValue(&serialData, &serialLength, &data_type_); + DeserializeValue(&serialData, &serialLength, &data_format_); } size_t PluginTensorRT::getBaseSerializationSize() { - return (serialized_size(input_dims_) + serialized_size(max_batch_size_) + - serialized_size(data_type_) + serialized_size(data_format_)); + return (SerializedSize(input_dims_) + SerializedSize(max_batch_size_) + + SerializedSize(data_type_) + SerializedSize(data_format_)); } bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index 44869b390f..4d85e955a4 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -41,11 +41,7 @@ class PluginTensorRT : public nvinfer1::IPluginExt { size_t getWorkspaceSize(int) const override { return 0; } void terminate() override {} virtual ~PluginTensorRT() {} - - // The following functions need to be overrided in the subclass. - virtual nvinfer1::IPluginExt* clone() const = 0; - virtual const char* getPluginType() const = 0; - int initialize() override { return 0; } + // Check format support. The default is FLOAT32 and NCHW. 
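+  // (With the kHALF case removed from trt_plugin.cc above, only the
+  // kFLOAT / kNCHW combination is accepted for now.)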
bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const override; void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, @@ -53,12 +49,24 @@ class PluginTensorRT : public nvinfer1::IPluginExt { nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override; + + // *NOTE* The following functions need to be overrided in the subclass. + virtual nvinfer1::IPluginExt* clone() const = 0; + virtual const char* getPluginType() const = 0; + // Initialize the layer for execution. This is called when the engine is + // created. + int initialize() override { return 0; } + // Serialize the layer config to buffer. virtual void serialize(void* buffer) = 0; virtual size_t getSerializationSize() = 0; + virtual int enqueue(int batchSize, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) = 0; protected: + // Deserialize input_dims, max_batch_size, data_type, data_format void deserializeBase(void const*& serialData, size_t& serialLength); size_t getBaseSerializationSize(); + // Serialize input_dims, max_batch_size, data_type, data_format void serializeBase(void*& buffer); std::vector input_dims_; From 8e0616ebeed0a74d6efb836fcaff147862095203 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 13 Nov 2018 22:51:32 +0800 Subject: [PATCH 36/50] fix prelu test=develop --- python/paddle/fluid/layers/nn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d3623464e9..bbc2686091 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6811,7 +6811,7 @@ def prelu(x, mode, param_attr=None, name=None): alpha_shape = x.shape dtype = helper.input_dtype(input_param_name='x') alpha = helper.create_parameter( - attr=param_attr, + attr=helper.param_attr, shape=alpha_shape, dtype='float32', is_bias=False, From 51a538e0554ff08ee4cd80e1ecc849f564e3416e Mon Sep 17 00:00:00 2001 From: baojun-nervana Date: Tue, 13 Nov 2018 14:14:24 -0800 Subject: [PATCH 37/50] Fix style and use enum test=develop --- paddle/fluid/framework/ngraph_operator.cc | 80 ++++++++++++----------- paddle/fluid/framework/ngraph_operator.h | 18 ++--- 2 files changed, 51 insertions(+), 47 deletions(-) diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index 70e6f97b4c..d967b2780c 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -35,6 +35,13 @@ static std::map pd2ng_type_map = { {proto::VarType::BOOL, ngraph::element::boolean}, }; +typedef enum { /* nGraph support state on ops */ + FULL_TRAIN, /* Support full ops for train */ + PARTIAL_TRAIN, /* Support partial ops for train */ + FULL_TEST, /* Support full list of ops for test */ + PARTIAL_TEST /* Support partial list of ops for test */ +} op_state; + class NgraphOperator { public: explicit NgraphOperator(const Scope& scope, const platform::Place& place, @@ -44,33 +51,29 @@ class NgraphOperator { const std::unordered_set& persist, const std::unordered_set& fetches, const std::unordered_set& post_op_inputs, - int is_test_or_train) - : scope(scope), - place(place), - fused_ops(ops), - var_type_map(var_type_map), - persistables(persist), - fetches(fetches), - post_op_inputs(post_op_inputs), - is_test_or_train(is_test_or_train) {} + op_state ng_op_state) + : scope_(scope), + place_(place), + fused_ops_(ops), + var_type_map_(var_type_map), + persistables_(persist), + fetches_(fetches), + 
post_op_inputs_(post_op_inputs), + ng_op_state_(ng_op_state) {} void Run(const Scope& scope, const platform::Place& place) const; private: static std::unordered_map> func_cache; - const Scope& scope; - const platform::Place& place; - std::vector> fused_ops; - std::unordered_map var_type_map; - std::unordered_set persistables; - std::unordered_set fetches; - std::unordered_set post_op_inputs; - // 0 = default; 1 = (is_test && not is_complete) - // 2 = (is_test && is_complete) - // 3 = (is_training && not is_complete) - // 4 = (is_training && is_complete) - int is_test_or_train; + const Scope& scope_; + const platform::Place& place_; + std::vector> fused_ops_; + std::unordered_map var_type_map_; + std::unordered_set persistables_; + std::unordered_set fetches_; + std::unordered_set post_op_inputs_; + op_state ng_op_state_; }; std::vector>::iterator>> @@ -131,19 +134,19 @@ FusedOperator::FusedOperator( const ProgramDesc& prog, size_t block_id, std::vector>::iterator start, std::vector>::iterator end, - const std::string& type = "fused_op", const VariableNameMap& inputs = {}, - const VariableNameMap& outputs = {}, const AttributeMap& attrs = {}) + const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs), pdesc(prog), block(block_id) { for (std::vector>::iterator it = start; it != end; ++it) { - fused_ops.push_back(std::move(*it)); + fused_ops_.push_back(std::move(*it)); } for (std::vector>::iterator it = end; (*it)->Type() != kFetchOpType; ++it) { for (auto& var_name_item : (*it)->Inputs()) { for (auto& var_name : var_name_item.second) { - post_op_inputs.insert(var_name); + post_op_inputs_.insert(var_name); } } } @@ -152,11 +155,11 @@ FusedOperator::FusedOperator( is_complete = true; } - process(); + Process(); } -void FusedOperator::process() { - auto& bdesc = pdesc.Block(block); +void FusedOperator::Process() { + auto& bdesc = pdesc_.Block(block_); for (auto& var : bdesc.AllVars()) { if (!(var->GetType() == proto::VarType::SELECTED_ROWS || var->GetType() == proto::VarType::LOD_TENSOR || @@ -175,39 +178,40 @@ void FusedOperator::process() { PADDLE_THROW("Data type of var %s not found in pd2ng_type_map", var_name); } - var_type_map[var_name] = pd2ng_type_map[pd_type]; + var_type_map_[var_name] = pd2ng_type_map[pd_type]; } if (var->Persistable()) { - persistables.insert(var->Name()); + persistables_.insert(var->Name()); } } for (auto* op : bdesc.AllOps()) { if (op->Type() == kFetchOpType) { std::string fetch_target_name = op->Input("X")[0]; - fetches.insert(fetch_target_name); + fetches_.insert(fetch_target_name); } } } void FusedOperator::RunImpl(const Scope& scope, const platform::Place& place) const { - int is_test_or_train = 1; - auto& bdesc = pdesc.Block(block); + op_state ng_op_state = PARTIAL_TEST; + auto& bdesc = pdesc_.Block(block_); for (auto* op : bdesc.AllOps()) { if (op->Type().find("_grad") != std::string::npos) { - is_test_or_train = 3; + ng_op_state = PARTIAL_TRAIN; break; } } - if (is_complete) { - is_test_or_train = is_test_or_train == 1 ? 2 : 4; + if (is_full) { + ng_op_state = ng_op_state == PARTIAL_TEST ? 
FULL_TEST : FULL_TRAIN; } - NgraphOperator ngraph_op(scope, place, fused_ops, var_type_map, persistables, - fetches, post_op_inputs, is_test_or_train); + NgraphOperator ngraph_op(scope, place, fused_ops_, var_type_map_, + persistables_, fetches_, post_op_inputs_, + ng_op_state); ngraph_op.Run(scope, place); } diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h index eb77c78115..0f655cef1d 100644 --- a/paddle/fluid/framework/ngraph_operator.h +++ b/paddle/fluid/framework/ngraph_operator.h @@ -56,16 +56,16 @@ class FusedOperator : public OperatorBase { void RunImpl(const Scope& scope, const platform::Place& place) const final; private: - const ProgramDesc pdesc; - size_t block; - std::vector> fused_ops; - std::unordered_map var_type_map; - std::unordered_set persistables; - std::unordered_set fetches; - std::unordered_set post_op_inputs; - bool is_complete = false; + const ProgramDesc pdesc_; + size_t block_; + std::vector> fused_ops_; + std::unordered_map var_type_map_; + std::unordered_set persistables_; + std::unordered_set fetches_; + std::unordered_set post_op_inputs_; + bool is_full_ = false; - void process(); + void Process(); }; } // namespace framework } // namespace paddle From a61909ff47e559726bd51ad8694779b372e62636 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 14 Nov 2018 10:32:22 +0800 Subject: [PATCH 38/50] test=develop --- .../fluid/framework/ir/attention_lstm_fuse_pass.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index ecefab32bb..d61ff04bc7 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -212,11 +212,11 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0, float* out_data = out->mutable_data(platform::CPUPlace()); std::array tensors{ - {W_forget_w0.data(), W_input_w0.data(), - W_output_w0.data(), W_cell_w0.data()}}; + W_forget_w0.data(), W_input_w0.data(), + W_output_w0.data(), W_cell_w0.data()}; std::array tensors1{ - {W_forget_w1.data(), W_input_w1.data(), - W_output_w1.data(), W_cell_w1.data()}}; + W_forget_w1.data(), W_input_w1.data(), + W_output_w1.data(), W_cell_w1.data()}; for (int row = 0; row < D; row++) { for (int col = 0; col < 4; col++) { @@ -239,8 +239,8 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, const LoDTensor& B_output, const LoDTensor& B_cell, LoDTensor* out) { std::array tensors{ - {B_forget.data(), B_input.data(), B_output.data(), - B_cell.data()}}; + B_forget.data(), B_input.data(), B_output.data(), + B_cell.data()}; PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1); int D = B_forget.dims()[0]; From 447bf7c80b70dafb8369403c751dcb0572f88494 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 14 Nov 2018 11:26:33 +0800 Subject: [PATCH 39/50] test=develop --- cmake/inference_lib.cmake | 1 + paddle/fluid/inference/analysis/CMakeLists.txt | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index dc6906bfb3..729bdcb3dc 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -188,6 +188,7 @@ copy(inference_lib DEPS ${inference_deps} ${src_dir}/${module}/api/paddle_*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} + ) set(module "platform") 
copy(platform_lib DEPS profiler_py_proto diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 07a45ece02..344aecaae5 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -21,10 +21,6 @@ cc_library(analysis SRCS cc_test(test_dot SRCS dot_tester.cc DEPS analysis) -if(WIN32) - target_link_libraries(inference_analyzer shlwapi) -endif(WIN32) - function (inference_analysis_test TARGET) if(WITH_TESTING) set(options "") From 42c48c3a82201b871f8c90341074ebd9791901b0 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 14 Nov 2018 11:37:43 +0800 Subject: [PATCH 40/50] fix --- cmake/inference_lib.cmake | 1 + paddle/fluid/inference/analysis/CMakeLists.txt | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index dc6906bfb3..729bdcb3dc 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -188,6 +188,7 @@ copy(inference_lib DEPS ${inference_deps} ${src_dir}/${module}/api/paddle_*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} + ) set(module "platform") copy(platform_lib DEPS profiler_py_proto diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 07a45ece02..344aecaae5 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -21,10 +21,6 @@ cc_library(analysis SRCS cc_test(test_dot SRCS dot_tester.cc DEPS analysis) -if(WIN32) - target_link_libraries(inference_analyzer shlwapi) -endif(WIN32) - function (inference_analysis_test TARGET) if(WITH_TESTING) set(options "") From 08d1dc84a97f4a40daf82de42006a8e97cd81bcf Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 14 Nov 2018 11:40:05 +0800 Subject: [PATCH 41/50] fix --- paddle/fluid/framework/ir/node.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index 57ee426f73..f34ce62b1e 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -23,7 +23,6 @@ namespace ir { #else const char Node::kControlDepVarName[] = "__control_var"; #endif -int Node::count_ = 0; std::unique_ptr CreateNodeForTest(const std::string& name, Node::Type type) { From bae3659714ac7e033c220bb7c3df9400b6c02992 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 14 Nov 2018 14:47:47 +0800 Subject: [PATCH 42/50] more test test=develop --- paddle/fluid/pybind/pybind.cc | 6 +++--- python/paddle/fluid/tests/unittests/test_pass_builder.py | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 68b80c6311..50b7a08876 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -650,9 +650,9 @@ All parameter, weight, gradient are variables in Paddle. 
[](ir::Pass &self, const std::string &name, const std::string &attr) { self.Set(name, new std::string(attr)); }) - .def("set_int", [](ir::Pass &self, const std::string &name, int val) { - self.Set(name, new int(val)); - }); + .def("set_int", [](ir::Pass &self, const std::string &name, + int val) { self.Set(name, new int(val)); }) + .def("type", &ir::Pass::Type); py::class_> pb( m, "PassBuilder"); diff --git a/python/paddle/fluid/tests/unittests/test_pass_builder.py b/python/paddle/fluid/tests/unittests/test_pass_builder.py index 65ad63dc01..5a3ec8ff01 100644 --- a/python/paddle/fluid/tests/unittests/test_pass_builder.py +++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py @@ -94,7 +94,12 @@ class TestPassBuilder(unittest.TestCase): def test_parallel_testing_with_new_strategy(self): build_strategy = fluid.BuildStrategy() + self.assertFalse(build_strategy.fuse_elewise_add_act_ops) + build_strategy.fuse_elewise_add_act_ops = True pass_builder = build_strategy._finalize_strategy_and_create_passes() + self.assertTrue("fuse_elewise_add_act_pass" in + [p.type() for p in pass_builder.all_passes()]) + origin_len = len(pass_builder.all_passes()) viz_pass = pass_builder.append_pass("graph_viz_pass") From 83ddafb515c664ae0d8e37c1e1ed423c077b829e Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 14 Nov 2018 14:50:31 +0800 Subject: [PATCH 43/50] Splict cicheks jobs and expose anakin options (#14327) * Split cichecks test=develop * feat(Anakin): expose anakin options to paddle cmake option Expose ANAKIN_BUILD_FAT_BIN, ANAKIN_BUILD_CROSS_PLANTFORM to Paddle cmake option test=develop --- CMakeLists.txt | 2 ++ cmake/external/anakin.cmake | 8 +++++--- paddle/scripts/paddle_build.sh | 15 +++++++++++++++ 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 291a960b14..bd53604075 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,6 +66,8 @@ option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF) option(WITH_ANAKIN "Compile with Anakin library" OFF) +option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF) +option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(ON_INFER "Turn on inference optimization." 
OFF) diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index 84354c446e..06fc6061bc 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -58,19 +58,21 @@ ExternalProject_Add( -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER} + -DBUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN} + -DBUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR} ) message(STATUS "Anakin for inference is enabled") message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}") - +add_dependencies(extern_anakin protobuf mklml) add_library(anakin_shared SHARED IMPORTED GLOBAL) set_property(TARGET anakin_shared PROPERTY IMPORTED_LOCATION ${ANAKIN_SHARED_LIB}) -add_dependencies(anakin_shared extern_anakin protobuf mklml) +add_dependencies(anakin_shared extern_anakin) add_library(anakin_saber SHARED IMPORTED GLOBAL) set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB}) -add_dependencies(anakin_saber extern_anakin protobuf mklml) +add_dependencies(anakin_saber extern_anakin) list(APPEND external_project_dependencies anakin_shared anakin_saber) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index a51c9becd4..32f9bca645 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -156,6 +156,8 @@ function cmake_gen() { -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} + -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF} + -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON} -DPY_VERSION=${PY_VERSION:-2.7} -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} ======================================== @@ -188,6 +190,8 @@ EOF -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \ -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ + -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}\ + -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON}\ -DPY_VERSION=${PY_VERSION:-2.7} \ -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} @@ -777,6 +781,17 @@ function main() { test_fluid_lib assert_api_spec_approvals ;; + assert_api) + assert_api_not_changed ${PYTHON_ABI:-""} + ;; + test_inference) + gen_capi_package + gen_fluid_lib + test_fluid_lib + ;; + assert_api_approvals) + assert_api_spec_approvals + ;; maccheck) cmake_gen ${PYTHON_ABI:-""} build_mac From 1a9008c420cf95e38d49959c313705ebf3d3ff8c Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 14 Nov 2018 17:09:21 +0800 Subject: [PATCH 44/50] code style fix test=develop --- .../framework/ir/attention_lstm_fuse_pass.cc | 6 +- paddle/fluid/framework/ir/node.cc | 4 +- paddle/fluid/framework/ir/node.h | 4 +- paddle/fluid/framework/ir/pass.h | 36 ++++++------ paddle/fluid/framework/operator.cc | 6 +- paddle/fluid/inference/api/helper.h | 2 +- .../fluid/operators/elementwise_op_function.h | 4 +- paddle/fluid/operators/grid_sampler_op.h | 4 +- paddle/fluid/platform/init.cc | 6 +- paddle/fluid/platform/port.h | 56 +++++++++---------- paddle/fluid/platform/variant.h | 2 +- paddle/fluid/pybind/pybind.cc | 4 +- 12 files changed, 67 insertions(+), 67 deletions(-) diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index 
64d585c222..c436dd414d 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -213,10 +213,10 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0, float* out_data = out->mutable_data(platform::CPUPlace()); std::array tensors{ W_forget_w0.data(), W_input_w0.data(), - W_output_w0.data(), W_cell_w0.data()}; + W_output_w0.data(), W_cell_w0.data()}; std::array tensors1{ W_forget_w1.data(), W_input_w1.data(), - W_output_w1.data(), W_cell_w1.data()}; + W_output_w1.data(), W_cell_w1.data()}; for (int row = 0; row < D; row++) { for (int col = 0; col < 4; col++) { @@ -240,7 +240,7 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, LoDTensor* out) { std::array tensors{ B_forget.data(), B_input.data(), B_output.data(), - B_cell.data()}; + B_cell.data()}; PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1); int D = B_forget.dims()[0]; diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index f34ce62b1e..50d9113088 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -19,9 +19,9 @@ namespace framework { namespace ir { // msvc15 don't support constexpr in correct way. #if !defined(_WIN32) - constexpr char Node::kControlDepVarName[]; +constexpr char Node::kControlDepVarName[]; #else - const char Node::kControlDepVarName[] = "__control_var"; +const char Node::kControlDepVarName[] = "__control_var"; #endif std::unique_ptr CreateNodeForTest(const std::string& name, diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 21dd43bc1d..d2a393b3f1 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -56,9 +56,9 @@ class Node { enum class Type { kOperation, kVariable }; #if !defined(_WIN32) // msvc not support constexpr correctly. - static constexpr char kControlDepVarName[] = "__control_var"; + static constexpr char kControlDepVarName[] = "__control_var"; #else - static const char kControlDepVarName[]; + static const char kControlDepVarName[]; #endif Type NodeType() const { return type_; } diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index e8dd48a535..615b539695 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -197,26 +197,26 @@ struct PassRegistrar : public Registrar { msg) // Register a new pass that can be applied on the IR. 
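// For illustration, a registration would look like
//   REGISTER_PASS(graph_viz_pass, GraphVizPass);
// at namespace scope, with USE_PASS(graph_viz_pass) in any translation unit
// that needs the pass linked in.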
-#define REGISTER_PASS(pass_type, pass_class) \ - STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ - __reg_pass__##pass_type, \ - "REGISTER_PASS must be called in global namespace"); \ - static ::paddle::framework::ir::PassRegistrar \ - __pass_registrar_##pass_type##__(#pass_type); \ - int TouchPassRegistrar_##pass_type() { \ - __pass_registrar_##pass_type##__.Touch(); \ - return 0; \ - } \ - static ::paddle::framework::ir::PassRegistrar \ - &__pass_tmp_registrar_##pass_type##__ UNUSED = \ +#define REGISTER_PASS(pass_type, pass_class) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __reg_pass__##pass_type, \ + "REGISTER_PASS must be called in global namespace"); \ + static ::paddle::framework::ir::PassRegistrar \ + __pass_registrar_##pass_type##__(#pass_type); \ + int TouchPassRegistrar_##pass_type() { \ + __pass_registrar_##pass_type##__.Touch(); \ + return 0; \ + } \ + static ::paddle::framework::ir::PassRegistrar \ + &__pass_tmp_registrar_##pass_type##__ UNUSED = \ __pass_registrar_##pass_type##__ -#define USE_PASS(pass_type) \ - STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ - __use_pass_itself_##pass_type, \ - "USE_PASS must be called in global namespace"); \ - extern int TouchPassRegistrar_##pass_type(); \ - static int use_pass_itself_##pass_type##_ UNUSED = \ +#define USE_PASS(pass_type) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __use_pass_itself_##pass_type, \ + "USE_PASS must be called in global namespace"); \ + extern int TouchPassRegistrar_##pass_type(); \ + static int use_pass_itself_##pass_type##_ UNUSED = \ TouchPassRegistrar_##pass_type() } // namespace ir diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 36fe5724ea..6bd744edc2 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -150,9 +150,9 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { #endif } - // The profile has a process-wide mutex, results in serious performance issue - // in concurrency scenerio. Here use an `if` to fix this issue. - // Please not remove the `if`, ask @Superjomn if there are any concern. +// The profile has a process-wide mutex, results in serious performance issue +// in concurrency scenerio. Here use an `if` to fix this issue. +// Please not remove the `if`, ask @Superjomn if there are any concern. 
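+// (With profiling disabled, the `if` below lets Run() skip the profiler path
+// and its mutex entirely.)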
#ifndef _WIN32 if (platform::IsProfileEnabled()) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index ba72fba8be..6f9d663121 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -20,9 +20,9 @@ #else #endif -#include #include #include // NOLINT +#include #include #include #include diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index d7444bcfe0..7bb6934e14 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -112,7 +112,7 @@ class RowwiseTransformIterator } RowwiseTransformIterator &operator+(int n) { - while(n-- > 0) { + while (n-- > 0) { ++i_; if (UNLIKELY(i_ == n_)) { i_ = 0; @@ -161,7 +161,7 @@ class MidWiseTransformIterator } MidWiseTransformIterator &operator+(int n) { - while(n-- > 0) { + while (n-- > 0) { ++j_; if (UNLIKELY(j_ == post_)) { ++i_; diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h index 00fba457bb..08a6043eb0 100644 --- a/paddle/fluid/operators/grid_sampler_op.h +++ b/paddle/fluid/operators/grid_sampler_op.h @@ -67,10 +67,10 @@ static void CalcGridLocations(const platform::CPUDeviceContext& ctx, Tensor half_ymax; half_xmax.mutable_data({n, h, w}, ctx.GetPlace()); auto half_xmax_t = - EigenTensor::From(half_xmax).setConstant(0.5 * x_max); + EigenTensor::From(half_xmax).setConstant(0.5 * x_max); half_ymax.mutable_data({n, h, w}, ctx.GetPlace()); auto half_ymax_t = - EigenTensor::From(half_ymax).setConstant(0.5 * y_max); + EigenTensor::From(half_ymax).setConstant(0.5 * y_max); // scale grid to [0, h-1/w-1] auto grid_x_t = EigenTensor::From(grid_x); diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 6156067645..84d1b852cb 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -115,9 +115,9 @@ void InitDevices(bool init_p2p, const std::vector devices) { // windows has no support for openblas multi-thread #ifdef _WIN32 - if (FLAGS_paddle_num_threads > 1) { - FLAGS_paddle_num_threads = 1; - } + if (FLAGS_paddle_num_threads > 1) { + FLAGS_paddle_num_threads = 1; + } #endif #ifndef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index d3a6e28549..8823e97b0b 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -24,38 +24,38 @@ #include "glog/logging.h" #if !defined(_WIN32) - #include // dladdr - #include // backtrace - #include - #include // std::accumulate +#include // dladdr +#include // backtrace +#include +#include // std::accumulate #else - #include - #include // _popen, _pclose - #include - #include // std::accumulate in msvc - #ifndef S_ISDIR // windows port for sys/stat.h - #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) - #endif // S_ISDIR - - static void *dlsym(void *handle, const char *symbol_name) { - FARPROC found_symbol; - found_symbol = GetProcAddress((HMODULE)handle, symbol_name); - - if (found_symbol == NULL) { - throw std::runtime_error(std::string(symbol_name) + " not found."); - } - return reinterpret_cast(found_symbol); +#include // _popen, _pclose +#include +#include +#include // std::accumulate in msvc +#ifndef S_ISDIR // windows port for sys/stat.h +#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) +#endif // S_ISDIR + +static void *dlsym(void *handle, const char *symbol_name) { + FARPROC 
found_symbol; + found_symbol = GetProcAddress((HMODULE)handle, symbol_name); + + if (found_symbol == NULL) { + throw std::runtime_error(std::string(symbol_name) + " not found."); } + return reinterpret_cast(found_symbol); +} - static void *dlopen(const char *filename, int flag) { - std::string file_name(filename); - file_name.replace(0, file_name.size() - 1, '/', '\\'); - HMODULE hModule = LoadLibrary(file_name.c_str()); - if (!hModule) { - throw std::runtime_error(file_name + " not found."); - } - return reinterpret_cast(hModule); +static void *dlopen(const char *filename, int flag) { + std::string file_name(filename); + file_name.replace(0, file_name.size() - 1, '/', '\\'); + HMODULE hModule = LoadLibrary(file_name.c_str()); + if (!hModule) { + throw std::runtime_error(file_name + " not found."); } + return reinterpret_cast(hModule); +} #endif // !_WIN32 diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h index 1b10db8669..42bff087d2 100644 --- a/paddle/fluid/platform/variant.h +++ b/paddle/fluid/platform/variant.h @@ -46,7 +46,7 @@ limitations under the License. */ // some platform-independent defintion #if defined(_WIN32) #define UNUSED -#define __builtin_expect(EXP, C) (EXP) +#define __builtin_expect(EXP, C) (EXP) #else #define UNUSED __attribute__((unused)) #endif diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 6cba3395bf..592c40cf1c 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -352,7 +352,7 @@ All parameter, weight, gradient are variables in Paddle. [](Variable &self) { return self.GetMutable(); }, py::return_value_policy::reference) #if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32)) - .def("get_communicator", + .def("get_communicator", [](Variable &self) -> platform::Communicator * { return self.GetMutable(); }, @@ -364,7 +364,7 @@ All parameter, weight, gradient are variables in Paddle. }, py::return_value_policy::reference) #endif -; + ; #if !defined(_WIN32) py::class_(m, "Reader", "") From e2a1cd19f1602fff49fe5fccf54b96dd99ddcd90 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 14 Nov 2018 17:17:17 +0800 Subject: [PATCH 45/50] code style fix test=develop --- python/paddle/trainer_config_helpers/networks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 1e961b936f..b5cde7bac7 100644 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -1719,7 +1719,7 @@ def inputs(layers, *args): if len(args) != 0: layers.extend(args) - Inputs(* [l.name for l in layers]) + Inputs(*[l.name for l in layers]) def outputs(layers, *args): @@ -1769,7 +1769,7 @@ def outputs(layers, *args): assert len(layers) > 0 if HasInputsSet(): # input already set - Outputs(* [l.name for l in layers]) + Outputs(*[l.name for l in layers]) return # just return outputs. 
From e2a1cd19f1602fff49fe5fccf54b96dd99ddcd90 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Wed, 14 Nov 2018 17:17:17 +0800
Subject: [PATCH 45/50] code style fix test=develop

---
 python/paddle/trainer_config_helpers/networks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index 1e961b936f..b5cde7bac7 100644
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -1719,7 +1719,7 @@ def inputs(layers, *args):
     if len(args) != 0:
         layers.extend(args)

-    Inputs(* [l.name for l in layers])
+    Inputs(*[l.name for l in layers])


 def outputs(layers, *args):
@@ -1769,7 +1769,7 @@ def outputs(layers, *args):
     assert len(layers) > 0

     if HasInputsSet():  # input already set
-        Outputs(* [l.name for l in layers])
+        Outputs(*[l.name for l in layers])
         return  # just return outputs.

     if len(layers) != 1:

From 228e1934b81c1d25555c269c28923aef8d192154 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Wed, 14 Nov 2018 17:17:17 +0800
Subject: [PATCH 46/50] code style fix test=develop

---
 python/paddle/trainer_config_helpers/networks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index 1e961b936f..b5cde7bac7 100644
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -1719,7 +1719,7 @@ def inputs(layers, *args):
     if len(args) != 0:
         layers.extend(args)

-    Inputs(* [l.name for l in layers])
+    Inputs(*[l.name for l in layers])


 def outputs(layers, *args):
@@ -1769,7 +1769,7 @@ def outputs(layers, *args):
     assert len(layers) > 0

     if HasInputsSet():  # input already set
-        Outputs(* [l.name for l in layers])
+        Outputs(*[l.name for l in layers])
         return  # just return outputs.

     if len(layers) != 1:

From 15bdb7ef14f7ce9ff4c72d31a17ab9a1d03204d7 Mon Sep 17 00:00:00 2001
From: nhzlx
Date: Wed, 14 Nov 2018 10:31:17 +0000
Subject: [PATCH 47/50] delete error uploaded files test=develop

---
 .../tensorrt/plugin/.trt_plugin_utils.h.swp      | Bin 12288 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 paddle/fluid/inference/tensorrt/plugin/.trt_plugin_utils.h.swp

diff --git a/paddle/fluid/inference/tensorrt/plugin/.trt_plugin_utils.h.swp b/paddle/fluid/inference/tensorrt/plugin/.trt_plugin_utils.h.swp
deleted file mode 100644
index 08d1434089f792131d0e6a545ad8675b3ba4892c..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 12288
[about 1.5 KB of base85-encoded binary payload for the deleted vim swap file
omitted]

From b361579f09840910fd5ab6c3118b74b67f4939b6 Mon Sep 17 00:00:00 2001
From: Jacek Czaja
Date: Wed, 14 Nov 2018 12:20:36 +0100
Subject: [PATCH 48/50] - Softmax for Inference is enabled when ON_INFER is
 set test=develop

---
 paddle/fluid/operators/math/softmax.cc            |  6 ++-
 paddle/fluid/operators/math/softmax.cu            | 11 +++--
 paddle/fluid/operators/math/softmax.h             |  2 +-
 paddle/fluid/operators/math/softmax_impl.h        | 41 +++++++++++++++++--
 paddle/fluid/operators/softmax_op.h               |  7 +++-
 .../operators/softmax_with_cross_entropy_op.h     |  4 +-
 6 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/operators/math/softmax.cc b/paddle/fluid/operators/math/softmax.cc
index 78c65af24a..fa2018178f 100644
--- a/paddle/fluid/operators/math/softmax.cc
+++ b/paddle/fluid/operators/math/softmax.cc
@@ -19,8 +19,10 @@ namespace paddle {
 namespace operators {
 namespace math {

-template class SoftmaxFunctor<platform::CPUDeviceContext, float>;
-template class SoftmaxFunctor<platform::CPUDeviceContext, double>;
+template class SoftmaxFunctor<platform::CPUDeviceContext, float, true>;
+template class SoftmaxFunctor<platform::CPUDeviceContext, float, false>;
+template class SoftmaxFunctor<platform::CPUDeviceContext, double, true>;
+template class SoftmaxFunctor<platform::CPUDeviceContext, double, false>;
 template class SoftmaxGradFunctor<platform::CPUDeviceContext, float>;
 template class SoftmaxGradFunctor<platform::CPUDeviceContext, double>;

diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu
index ce183ed364..2e9669049e 100644
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -98,9 +98,14 @@ template class SoftmaxGradCUDNNFunctor<float>;
 template class SoftmaxGradCUDNNFunctor<double>;
 template class SoftmaxGradCUDNNFunctor<platform::float16>;

-template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, float>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, double>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16,
+                              false>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16,
+                              true>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, float, false>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, float, true>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, double, false>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, double, true>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext,
                                   platform::float16>;

diff --git a/paddle/fluid/operators/math/softmax.h b/paddle/fluid/operators/math/softmax.h
--- a/paddle/fluid/operators/math/softmax.h
+++ b/paddle/fluid/operators/math/softmax.h
@@ -21,7 +21,7 @@ namespace math {

-template <typename DeviceContext, typename T>
+template <typename DeviceContext, typename T, bool is_test>
 class SoftmaxFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor* X,

diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h
index dd9971ba09..7cf98f2725 100644
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -32,10 +32,10 @@ struct ValueClip {
   }
 };

-template <typename DeviceContext, typename T>
-void SoftmaxFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
-                                                  const framework::Tensor* X,
-                                                  framework::Tensor* Y) {
+template <typename DeviceContext, typename T, bool is_test>
+void SoftmaxFunctor<DeviceContext, T, is_test>::operator()(
+    const DeviceContext& context, const framework::Tensor* X,
+    framework::Tensor* Y) {
   auto logits = EigenMatrix<T>::From(*X);
   auto softmax = EigenMatrix<T>::From(*Y);

@@ -65,6 +65,39 @@ void SoftmaxFunctor<DeviceContext, T, is_test>::operator()(
       .broadcast(one_by_class));
 }

+template <typename DeviceContext, typename T>
+class SoftmaxFunctor<DeviceContext, T, true> {
+  void operator()(const DeviceContext& context, const framework::Tensor* X,
+                  framework::Tensor* Y) {
+    auto logits = EigenMatrix<T>::From(*X);
+    auto softmax = EigenMatrix<T>::From(*Y);
+
+    const int kBatchDim = 0;
+    const int kClassDim = 1;
+
+    const int batch_size = logits.dimension(kBatchDim);
+    const int num_classes = logits.dimension(kClassDim);
+
+    Eigen::DSizes<int, 1> along_class(kClassDim);
+    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+
+    auto shifted_logits = (logits -
+                           logits.maximum(along_class)
+                               .eval()
+                               .reshape(batch_by_one)
+                               .broadcast(one_by_class));
+
+    softmax.device(*context.eigen_device()) = shifted_logits.exp();
+    softmax.device(*context.eigen_device()) = (softmax *
+                                               softmax.sum(along_class)
+                                                   .inverse()
+                                                   .eval()
+                                                   .reshape(batch_by_one)
+                                                   .broadcast(one_by_class));
+  }
+};
+
 template <typename DeviceContext, typename T>
 void SoftmaxGradFunctor<DeviceContext, T>::operator()(
     const DeviceContext& context, const framework::Tensor* y,

diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h
index cf1eeb017d..2fea8a65bc 100644
--- a/paddle/fluid/operators/softmax_op.h
+++ b/paddle/fluid/operators/softmax_op.h
@@ -35,8 +35,13 @@ class SoftmaxKernel : public framework::OpKernel<T> {
     Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1);
     Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1);

-    math::SoftmaxFunctor<DeviceContext, T>()(
+#ifdef ON_INFER
+    math::SoftmaxFunctor<DeviceContext, T, true>()(
         context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
+#else
+    math::SoftmaxFunctor<DeviceContext, T, false>()(
+        context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
+#endif
   }
 };

diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
index e9aba3b37b..c0530e3d8b 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
@@ -42,8 +42,8 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {

     auto& dev_ctx =
         context.template device_context<platform::CPUDeviceContext>();
-    math::SoftmaxFunctor<platform::CPUDeviceContext, T>()(dev_ctx, logits,
-                                                          softmax);
+    math::SoftmaxFunctor<platform::CPUDeviceContext, T, false>()(
+        dev_ctx, logits, softmax);
     math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()(
         dev_ctx, loss, softmax, labels, context.Attr<bool>("soft_label"),
         context.Attr<int>("ignore_index"));
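Patch 48 selects the inference softmax at compile time through a bool template parameter. A self-contained sketch of that dispatch pattern (illustration only; the names and the -64 clip threshold are invented stand-ins for ValueClip, and the input is assumed non-empty):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

template <bool is_test>
struct SoftmaxSketch {
  // Primary template: training path, clips the shifted logit so exp() cannot
  // underflow into denormals (mirrors ValueClip in softmax_impl.h).
  void operator()(const std::vector<float>& x, std::vector<float>* y) const {
    float x_max = *std::max_element(x.begin(), x.end());
    y->resize(x.size());
    float sum = 0.f;
    for (std::size_t i = 0; i < x.size(); ++i) {
      float shifted = std::max(x[i] - x_max, -64.f);  // value clip
      sum += ((*y)[i] = std::exp(shifted));
    }
    for (float& v : *y) v /= sum;
  }
};

template <>
struct SoftmaxSketch<true> {
  // Inference specialization: identical math minus the clamp, one branch
  // fewer per element, chosen entirely at compile time.
  void operator()(const std::vector<float>& x, std::vector<float>* y) const {
    float x_max = *std::max_element(x.begin(), x.end());
    y->resize(x.size());
    float sum = 0.f;
    for (std::size_t i = 0; i < x.size(); ++i) {
      sum += ((*y)[i] = std::exp(x[i] - x_max));
    }
    for (float& v : *y) v /= sum;
  }
};

// Usage mirroring softmax_op.h in the patch:
//   #ifdef ON_INFER
//     SoftmaxSketch<true>()(x, &y);
//   #else
//     SoftmaxSketch<false>()(x, &y);
//   #endif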
From 0ef2a37c0e3675be44e4bb556ca601d7d43c79a7 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Wed, 14 Nov 2018 19:57:44 +0800
Subject: [PATCH 49/50] merge from develop

---
 .../fluid/inference/analysis/CMakeLists.txt | 27 ++++++++-----------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 344aecaae5..eb89fc5e11 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -21,22 +21,17 @@ cc_library(analysis SRCS

 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)

-function (inference_analysis_test TARGET)
-  if(WITH_TESTING)
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS ARGS EXTRA_DEPS)
-    cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    set(mem_opt "")
-    if(WITH_GPU)
-      set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
-    endif()
-    cc_test(${TARGET}
-      SRCS "${analysis_test_SRCS}"
-      DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
-      ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS})
-    set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
-  endif(WITH_TESTING)
+function(inference_analysis_test TARGET)
+  if(WITH_TESTING)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS ARGS EXTRA_DEPS)
+    cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    inference_base_test(${TARGET}
+                        SRCS ${analysis_test_SRCS}
+                        DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
+                        ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR} ${analysis_test_ARGS})
+  endif()
 endfunction(inference_analysis_test)

 inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api)
From 9e6b1c5f974bdb42c8ec5dc323f76d405e8017d8 Mon Sep 17 00:00:00 2001
From: Yiqun Liu
Date: Thu, 15 Nov 2018 10:28:52 +0800
Subject: [PATCH 50/50] Refine tester of TensorRT engine (#14390)

* Refine the tester for MixedRTPredictor.
  test=develop

* Enable the profiler in TensorRT engine.

* Support the use of combined inference model in TensorRT unittest, and print
  the shape of feed targets.

---
 .../api/analysis_predictor_tester.cc          |   2 +-
 .../api/demo_ci/simple_on_word2vec.cc         |   2 +-
 .../api/demo_ci/trt_mobilenet_demo.cc         |   2 +-
 .../inference/api/paddle_analysis_config.h    |   2 +
 .../fluid/inference/api/paddle_pass_builder.h |   4 +-
 .../fluid/inference/tests/api/CMakeLists.txt  |   3 +-
 .../tests/api/analyzer_dam_tester.cc          |   7 +-
 .../tests/api/analyzer_lac_tester.cc          |   6 +-
 .../tests/api/analyzer_ner_tester.cc          |   6 +-
 .../tests/api/analyzer_resnet50_tester.cc     |   6 +-
 .../tests/api/analyzer_rnn1_tester.cc         |  10 +-
 .../tests/api/analyzer_rnn2_tester.cc         |   6 +-
 .../tests/api/analyzer_seq_conv1_tester.cc    |   6 +-
 .../analyzer_text_classification_tester.cc    |   9 +-
 .../tests/api/analyzer_vis_tester.cc          |   6 +-
 .../inference/tests/api/config_printer.h      |  79 ++++++
 .../fluid/inference/tests/api/tester_helper.h |  87 +++++--
 .../inference/tests/api/trt_models_tester.cc  | 245 +++++++++---------
 18 files changed, 315 insertions(+), 173 deletions(-)
 create mode 100644 paddle/fluid/inference/tests/api/config_printer.h

diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index 1e6f75e364..d67305670c 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -15,7 +15,7 @@
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include
 #include
-#include <thread>
+#include <thread>  // NOLINT
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"

diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
index 6ae5198dab..3dd1d3c838 100644
--- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
@@ -23,7 +23,7 @@ limitations under the License. */
 #include
 #include  //NOLINT

-#include "utils.h"
+#include "utils.h"  // NOLINT

 DEFINE_string(dirname, "", "Directory of the inference model.");
 DEFINE_bool(use_gpu, false, "Whether use gpu.");

diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
index 72d20bc59e..0eb620ea51 100644
--- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

-http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,

diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 82c04e9f3f..2ac736df7c 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -49,6 +49,8 @@ struct AnalysisConfig : public NativeConfig {
   void EnableTensorRtEngine(int workspace_size = 1 << 20,
                             int max_batch_size = 1);

+  bool use_tensorrt() const { return use_tensorrt_; }
+
   // NOTE this is just for internal development, please not use it.
   // NOT stable yet.
   void EnableMKLDNN();

diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index 8aad5c5984..80658d3085 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -91,7 +91,7 @@ class CpuPassStrategy : public PassStrategy {

   virtual ~CpuPassStrategy() = default;

-  virtual void EnableMKLDNN() override {
+  void EnableMKLDNN() override {
     // TODO(Superjomn) Consider the way to mix CPU with GPU.
 #ifdef PADDLE_WITH_MKLDNN
     passes_.insert(passes_.begin(), "mkldnn_placement_pass");
@@ -123,7 +123,7 @@ class GpuPassStrategy : public PassStrategy {
   GpuPassStrategy(const GpuPassStrategy &other)
       : PassStrategy(other.AllPasses()) {}

-  virtual void EnableMKLDNN() override;
+  void EnableMKLDNN() override;

   virtual ~GpuPassStrategy() = default;
 };

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index fc3e44ffd7..4915f28f43 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -108,8 +108,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
   if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
     inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
   endif()
-  inference_analysis_test(test_trt_models SRCS trt_models_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor
-    ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
+    ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
 endif()

diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
index ceac5dc7e1..d1adc08667 100644
--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -178,7 +178,8 @@ TEST(Analyzer_dam, profile) {
   std::vector<PaddleTensor> outputs;
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     PADDLE_ENFORCE_GT(outputs.size(), 0);
@@ -216,7 +217,9 @@ TEST(Analyzer_dam, compare) {
   SetInput(&input_slots_all);

   if (FLAGS_use_analysis) {
-    CompareNativeAndAnalysis(cfg, input_slots_all);
+    CompareNativeAndAnalysis(
+        reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+        input_slots_all);
   }
 }

diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
index 5fb551810f..310852e2f7 100644
--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@@ -133,7 +133,8 @@ TEST(Analyzer_LAC, profile) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     // the first inference result
@@ -175,7 +176,8 @@ TEST(Analyzer_LAC, compare) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 } // namespace analysis

diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
index d91f7c314d..3a5f844de3 100644
--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -121,7 +121,8 @@ TEST(Analyzer_Chinese_ner, profile) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     // the first inference result
@@ -160,7 +161,8 @@ TEST(Analyzer_Chinese_ner, compare) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 } // namespace inference

diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
index 5c92096d9d..2b936175ed 100644
--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
@@ -45,7 +45,8 @@ void profile(bool use_mkldnn = false) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);
 }

 TEST(Analyzer_resnet50, profile) { profile(); }
@@ -74,7 +75,8 @@ void compare(bool use_mkldnn = false) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 TEST(Analyzer_resnet50, compare) { compare(); }

diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
index 612ae121b2..1ae2b4b03a 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -233,8 +233,8 @@ TEST(Analyzer_rnn1, profile) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  LOG(INFO) << "to test prediction";
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);
 }

 // Check the fuse status
@@ -261,7 +261,8 @@ TEST(Analyzer_rnn1, compare) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 // Test Multi-Thread.
@@ -272,7 +273,8 @@ TEST(Analyzer_rnn1, multi_thread) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, 4 /* multi_thread */);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, 4 /* multi_thread */);
 }

 // Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing

diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
index e0eb919bd8..e2985006f0 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
@@ -132,7 +132,8 @@ TEST(Analyzer_rnn2, profile) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     // the first inference result
@@ -153,7 +154,8 @@ TEST(Analyzer_rnn2, compare) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 } // namespace inference

diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
index f590ef2796..858191184a 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
@@ -161,7 +161,8 @@ TEST(Analyzer_seq_conv1, profile) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     // the first inference result
@@ -199,7 +200,8 @@ TEST(Analyzer_seq_conv1, compare) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 } // namespace inference

diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
index 05bffede47..34a241f070 100644
--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
@@ -74,7 +74,8 @@ TEST(Analyzer_Text_Classification, profile) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

   if (FLAGS_num_threads == 1) {
     // Get output
@@ -101,7 +102,8 @@ TEST(Analyzer_Text_Classification, compare) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) {
@@ -112,7 +114,8 @@ TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 } // namespace inference
diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
index 8fafd25b78..16e1011dda 100644
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -94,7 +94,8 @@ void profile(bool use_mkldnn = false) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     const float ocr_result_data[] = {
@@ -136,7 +137,8 @@ void compare(bool use_mkldnn = false) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 TEST(Analyzer_vis, compare) { compare(); }

diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h
new file mode 100644
index 0000000000..aa0c4b1d04
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/config_printer.h
@@ -0,0 +1,79 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <sstream>
+#include <string>
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+
+namespace paddle {
+namespace inference {
+
+thread_local int num_spaces = 0;
+
+static std::string GenSpaces(int num_spaces) {
+  std::ostringstream os;
+  for (int i = 0; i < num_spaces; ++i) {
+    os << " ";
+  }
+  return os.str();
+}
+
+std::ostream &operator<<(std::ostream &os,
+                         const PaddlePredictor::Config &config) {
+  os << GenSpaces(num_spaces) << "PaddlePredictor::Config {\n";
+  num_spaces++;
+  os << GenSpaces(num_spaces) << "model_dir: " << config.model_dir << "\n";
+  num_spaces--;
+  os << GenSpaces(num_spaces) << "}\n";
+  return os;
+}
+
+std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
+  os << GenSpaces(num_spaces) << "NativeConfig {\n";
+  num_spaces++;
+  os << *reinterpret_cast<const PaddlePredictor::Config *>(&config);
+  os << GenSpaces(num_spaces) << "use_gpu: " << config.use_gpu << "\n";
+  os << GenSpaces(num_spaces) << "device: " << config.device << "\n";
+  os << GenSpaces(num_spaces)
+     << "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n";
+  os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n";
+  os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
+  os << GenSpaces(num_spaces)
+     << "specify_input_name: " << config.specify_input_name << "\n";
+  num_spaces--;
+  os << GenSpaces(num_spaces) << "}\n";
+  return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+                         const contrib::AnalysisConfig &config) {
+  os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n";
+  num_spaces++;
+  os << *reinterpret_cast<const NativeConfig *>(&config);
+  os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim
+     << "\n";
+  os << GenSpaces(num_spaces)
+     << "use_feed_fetch_ops: " << config.use_feed_fetch_ops << "\n";
+  os << GenSpaces(num_spaces) << "use_tensorrt: " << config.use_tensorrt()
+     << "\n";
+  os << GenSpaces(num_spaces) << "use_mkldnn: " << config.use_mkldnn() << "\n";
+  num_spaces--;
+  os << GenSpaces(num_spaces) << "}\n";
+  return os;
+}
+
+} // namespace inference
+} // namespace paddle
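A quick usage sketch (not part of the patch) for the stream operators config_printer.h just defined. The operators live in paddle::inference, so the call site below sits in that namespace as the testers do; the model path is a placeholder.

#include <iostream>
#include "paddle/fluid/inference/tests/api/config_printer.h"

namespace paddle {
namespace inference {

void DumpConfig() {
  contrib::AnalysisConfig config(/*use_gpu=*/true);
  config.model_dir = "/path/to/model";  // placeholder path
  config.EnableTensorRtEngine(1 << 10, /*max_batch_size=*/1);
  // Each operator<< above first streams the reinterpret_cast'ed base config,
  // so the output nests PaddlePredictor::Config inside NativeConfig inside
  // contrib::AnalysisConfig, indented one level per GenSpaces step.
  std::cout << config;
}

}  // namespace inference
}  // namespace paddle

int main() {
  paddle::inference::DumpConfig();
  return 0;
}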
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index ab4ab20b58..a404691413 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -19,13 +19,16 @@
 #include
 #include  // NOLINT
 #include
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
+
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/tests/api/config_printer.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 #include "paddle/fluid/platform/profiler.h"

@@ -38,10 +41,18 @@ DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
 DEFINE_bool(use_analysis, true,
             "Running the inference program in analysis mode.");

+DECLARE_bool(profile);
+
 namespace paddle {
 namespace inference {

-using contrib::AnalysisConfig;
+void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
+  if (use_analysis) {
+    LOG(INFO) << *reinterpret_cast<const contrib::AnalysisConfig *>(config);
+    return;
+  }
+  LOG(INFO) << *config;
+}

 void CompareResult(const std::vector<PaddleTensor> &outputs,
                    const std::vector<PaddleTensor> &ref_outputs) {
@@ -77,12 +88,13 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
 }

 std::unique_ptr<PaddlePredictor> CreateTestPredictor(
-    const AnalysisConfig &config, bool use_analysis = true) {
+    const PaddlePredictor::Config *config, bool use_analysis = true) {
   if (use_analysis) {
-    return CreatePaddlePredictor<contrib::AnalysisConfig>(config);
-  } else {
-    return CreatePaddlePredictor<NativeConfig>(config);
+    return CreatePaddlePredictor<contrib::AnalysisConfig>(
+        *(reinterpret_cast<const contrib::AnalysisConfig *>(config)));
   }
+  return CreatePaddlePredictor<NativeConfig>(
+      *(reinterpret_cast<const NativeConfig *>(config)));
 }

 size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }
@@ -111,11 +123,23 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
 }

 void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
-                       const std::string &dirname) {
+                       const std::string &dirname, bool is_combined = true,
+                       std::string model_filename = "model",
+                       std::string params_filename = "params") {
   // Set fake_image_data
   PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
-  std::vector<std::vector<int64_t>> feed_target_shapes =
-      GetFeedTargetShapes(dirname, true, "model", "params");
+  std::vector<std::vector<int64_t>> feed_target_shapes = GetFeedTargetShapes(
+      dirname, is_combined, model_filename, params_filename);
+  std::ostringstream os;
+  for (size_t i = 0; i < feed_target_shapes.size(); ++i) {
+    os << "feed target " << i << ": {" << feed_target_shapes[i][0];
+    for (size_t j = 1; j < feed_target_shapes[i].size(); ++j) {
+      os << ", " << feed_target_shapes[i][j];
+    }
+    os << "}\n";
+  }
+  LOG(INFO) << os.str();
+
   int dim1 = feed_target_shapes[0][1];
   int dim2 = feed_target_shapes[0][2];
   int dim3 = feed_target_shapes[0][3];
@@ -139,25 +163,43 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
 }

 void TestOneThreadPrediction(
-    const AnalysisConfig &config,
+    const PaddlePredictor::Config *config,
     const std::vector<std::vector<PaddleTensor>> &inputs,
     std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
   int batch_size = FLAGS_batch_size;
   int num_times = FLAGS_repeat;
   auto predictor = CreateTestPredictor(config, use_analysis);
-  Timer timer;
-  timer.tic();
-  for (int i = 0; i < num_times; i++) {
-    for (size_t j = 0; j < inputs.size(); j++) {
-      predictor->Run(inputs[j], outputs);
+
+  // warmup run
+  LOG(INFO) << "Warm up run...";
+  {
+    Timer warmup_timer;
+    warmup_timer.tic();
+    predictor->Run(inputs[0], outputs, batch_size);
+    PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1);
+#if !defined(_WIN32)
+    if (FLAGS_profile) {
+      paddle::platform::ResetProfiler();
+    }
+#endif
+  }
+
+  LOG(INFO) << "Run " << num_times << " times...";
+  {
+    Timer run_timer;
+    run_timer.tic();
+    for (int i = 0; i < num_times; i++) {
+      for (size_t j = 0; j < inputs.size(); j++) {
+        predictor->Run(inputs[j], outputs, batch_size);
+      }
     }
+    PrintTime(batch_size, num_times, 1, 0, run_timer.toc() / num_times,
+              inputs.size());
   }
-  PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times,
-            inputs.size());
 }

 void TestMultiThreadPrediction(
-    const AnalysisConfig &config,
+    const PaddlePredictor::Config *config,
     const std::vector<std::vector<PaddleTensor>> &inputs,
     std::vector<PaddleTensor> *outputs, int num_threads,
     bool use_analysis = true) {
@@ -200,12 +242,11 @@ void TestMultiThreadPrediction(
   }
 }

-void TestPrediction(const AnalysisConfig &config,
+void TestPrediction(const PaddlePredictor::Config *config,
                     const std::vector<std::vector<PaddleTensor>> &inputs,
                     std::vector<PaddleTensor> *outputs, int num_threads,
                     bool use_analysis = FLAGS_use_analysis) {
-  LOG(INFO) << "use_analysis: " << use_analysis
-            << ", use_mkldnn: " << config.use_mkldnn();
+  PrintConfig(config, use_analysis);
   if (num_threads == 1) {
     TestOneThreadPrediction(config, inputs, outputs, use_analysis);
   } else {
@@ -215,9 +256,9 @@ void TestPrediction(const PaddlePredictor::Config *config,
 }

 void CompareNativeAndAnalysis(
-    const AnalysisConfig &config,
+    const PaddlePredictor::Config *config,
     const std::vector<std::vector<PaddleTensor>> &inputs) {
-  LOG(INFO) << "use_mkldnn: " << config.use_mkldnn();
+  PrintConfig(config, true);
   std::vector<PaddleTensor> native_outputs, analysis_outputs;
   TestOneThreadPrediction(config, inputs, &native_outputs, false);
   TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
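The warm-up-then-measure pattern just added to TestOneThreadPrediction, reduced to a self-contained sketch (illustration only; the workload and iteration counts are invented stand-ins for predictor->Run and FLAGS_repeat). Running the payload once before starting the timer keeps one-time costs such as lazy initialization out of the reported average, which is exactly why the patch resets the profiler after the warm-up run.

#include <chrono>
#include <cmath>
#include <cstdio>

// Stand-in for predictor->Run(...): any deterministic workload will do.
static double Workload(int n) {
  double acc = 0.0;
  for (int i = 1; i <= n; ++i) acc += std::sqrt(static_cast<double>(i));
  return acc;
}

int main() {
  const int num_times = 10;  // stand-in for FLAGS_repeat

  // Warm-up run: not timed, absorbs one-time setup costs.
  Workload(1 << 20);

  auto start = std::chrono::steady_clock::now();
  double sink = 0.0;
  for (int i = 0; i < num_times; ++i) {
    sink += Workload(1 << 20);
  }
  auto stop = std::chrono::steady_clock::now();
  std::chrono::duration<double, std::milli> elapsed = stop - start;
  // Report the average per-iteration latency, as PrintTime does.
  std::printf("avg latency: %.3f ms (sink=%g)\n",
              elapsed.count() / num_times, sink);
  return 0;
}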
diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc
index 71423154f8..922feba10f 100644
--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -1,148 +1,149 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */

 #include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+
 #include "paddle/fluid/inference/tests/api/tester_helper.h"

 namespace paddle {
-using paddle::contrib::AnalysisConfig;
-
-DEFINE_string(dirname, "", "Directory of the inference model.");
-
-NativeConfig GetConfigNative() {
-  NativeConfig config;
-  config.model_dir = FLAGS_dirname;
-  // LOG(INFO) << "dirname  " << config.model_dir;
-  config.fraction_of_gpu_memory = 0.15;
-  config.use_gpu = true;
-  config.device = 0;
-  return config;
-}
-
-void PrepareTRTConfig(AnalysisConfig *config) {
-  config->model_dir = FLAGS_dirname + "/" + "mobilenet";
-  config->fraction_of_gpu_memory = 0.15;
-  config->EnableTensorRtEngine(1 << 10, 5);
-  config->pass_builder()->DeletePass("conv_bn_fuse_pass");
-  config->pass_builder()->DeletePass("fc_fuse_pass");
-  config->pass_builder()->TurnOnDebug();
+namespace inference {
+
+DEFINE_bool(use_tensorrt, true, "Test the performance of TensorRT engine.");
+DEFINE_string(prog_filename, "", "Name of model file.");
+DEFINE_string(param_filename, "", "Name of parameters file.");
+
+template <typename ConfigType>
+void SetConfig(ConfigType* config, std::string model_dir, bool use_gpu,
+               bool use_tensorrt = false, int batch_size = -1) {
+  if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
+    config->prog_file = model_dir + "/" + FLAGS_prog_filename;
+    config->param_file = model_dir + "/" + FLAGS_param_filename;
+  } else {
+    config->model_dir = model_dir;
+  }
+  if (use_gpu) {
+    config->use_gpu = true;
+    config->device = 0;
+    config->fraction_of_gpu_memory = 0.15;
+  }
 }

-void PrepareInputs(std::vector<PaddleTensor> *tensors, int batch_size) {
-  PADDLE_ENFORCE_EQ(tensors->size(), 1UL);
-  auto &tensor = tensors->front();
-  int height = 224;
-  int width = 224;
-  float *data = new float[batch_size * 3 * height * width];
-  memset(data, 0, sizeof(float) * (batch_size * 3 * height * width));
-  data[0] = 1.0f;
-
-  // Prepare inputs
-  tensor.name = "input_0";
-  tensor.shape = std::vector<int>({batch_size, 3, height, width});
-  tensor.data = PaddleBuf(static_cast<void *>(data),
-                          sizeof(float) * (batch_size * 3 * height * width));
-  tensor.dtype = PaddleDType::FLOAT32;
+template <>
+void SetConfig<contrib::AnalysisConfig>(contrib::AnalysisConfig* config,
+                                        std::string model_dir, bool use_gpu,
+                                        bool use_tensorrt, int batch_size) {
+  if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
+    config->prog_file = model_dir + "/" + FLAGS_prog_filename;
+    config->param_file = model_dir + "/" + FLAGS_param_filename;
+  } else {
+    config->model_dir = model_dir;
+  }
+  if (use_gpu) {
+    config->use_gpu = true;
+    config->device = 0;
+    config->fraction_of_gpu_memory = 0.15;
+    if (use_tensorrt) {
+      config->EnableTensorRtEngine(1 << 10, batch_size);
+      config->pass_builder()->DeletePass("conv_bn_fuse_pass");
+      config->pass_builder()->DeletePass("fc_fuse_pass");
+      config->pass_builder()->TurnOnDebug();
+    } else {
+      config->enable_ir_optim = true;
+    }
+  }
 }

-void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
-  auto config0 = GetConfigNative();
-  config0.model_dir = model_dirname;
-
-  AnalysisConfig config1(true);
-  PrepareTRTConfig(&config1);
-  config1.model_dir = model_dirname;
-
-  auto predictor0 = CreatePaddlePredictor(config0);
-  auto predictor1 = CreatePaddlePredictor(config1);
-
-  // Prepare inputs
-  std::vector<PaddleTensor> paddle_tensor_feeds(1);
-  PrepareInputs(&paddle_tensor_feeds, batch_size);
-
-  // Prepare outputs
-  std::vector<PaddleTensor> outputs0;
-  std::vector<PaddleTensor> outputs1;
-  CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0));
-  CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
-
-  const size_t num_elements = outputs0.front().data.length() / sizeof(float);
-  const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
-  EXPECT_EQ(num_elements, num_elements1);
-
-  auto *data0 = static_cast<float *>(outputs0.front().data.data());
-  auto *data1 = static_cast<float *>(outputs1.front().data.data());
-
-  ASSERT_GT(num_elements, 0UL);
-  for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
-    EXPECT_NEAR(data0[i], data1[i], 1e-3);
+void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) {
+  std::vector<std::vector<PaddleTensor>> inputs_all;
+  if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
+    SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename,
+                      FLAGS_param_filename);
+  } else {
+    SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
   }
-}

-TEST(trt_models_test, mobilenet) {
-  CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "mobilenet");
-}
-TEST(trt_models_test, resnet50) {
-  CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "resnet50");
-}
-TEST(trt_models_test, resnext50) {
-  CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "resnext50");
+  std::vector<PaddleTensor> outputs;
+  if (use_analysis || use_tensorrt) {
+    contrib::AnalysisConfig config(true);
+    SetConfig(&config, model_dir, true, use_tensorrt,
+              FLAGS_batch_size);
+    TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&config),
+                   inputs_all, &outputs, FLAGS_num_threads, true);
+  } else {
+    NativeConfig config;
+    SetConfig(&config, model_dir, true, false);
+    TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&config),
+                   inputs_all, &outputs, FLAGS_num_threads, false);
+  }
 }

-TEST(trt_models_test, raw_gpu) {
-  std::string model_dir = FLAGS_dirname + "/" + "mobilenet";
-  auto config0 = GetConfigNative();
-  config0.model_dir = model_dir;
-  int batch_size = 2;
-
-  AnalysisConfig config1(true);
-  config1.fraction_of_gpu_memory = 0.1;
-  config1.enable_ir_optim = true;
-  config1.model_dir = model_dir;
+void compare(std::string model_dir, bool use_tensorrt) {
+  std::vector<std::vector<PaddleTensor>> inputs_all;
+  if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
+    SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename,
+                      FLAGS_param_filename);
+  } else {
+    SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
  }

-  auto predictor0 = CreatePaddlePredictor(config0);
-  auto predictor1 = CreatePaddlePredictor(config1);
+  std::vector<PaddleTensor> native_outputs;
+  NativeConfig native_config;
+  SetConfig(&native_config, model_dir, true, false,
+            FLAGS_batch_size);
+  TestOneThreadPrediction(
+      reinterpret_cast<const PaddlePredictor::Config *>(&native_config),
+      inputs_all, &native_outputs, false);
+
+  std::vector<PaddleTensor> analysis_outputs;
+  contrib::AnalysisConfig analysis_config(true);
+  SetConfig(&analysis_config, model_dir, true,
+            use_tensorrt, FLAGS_batch_size);
+  TestOneThreadPrediction(
+      reinterpret_cast<const PaddlePredictor::Config *>(&analysis_config),
+      inputs_all, &analysis_outputs, true);
+
+  CompareResult(native_outputs, analysis_outputs);
+}

-  // Prepare inputs
-  std::vector<PaddleTensor> paddle_tensor_feeds(1);
-  PrepareInputs(&paddle_tensor_feeds, batch_size);
+TEST(TensorRT_mobilenet, compare) {
+  std::string model_dir = FLAGS_infer_model + "/mobilenet";
+  compare(model_dir, /* use_tensorrt */ true);
+}

-  // Prepare outputs
-  std::vector<PaddleTensor> outputs0;
-  std::vector<PaddleTensor> outputs1;
-  CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0));
-  CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
+TEST(TensorRT_resnet50, compare) {
+  std::string model_dir = FLAGS_infer_model + "/resnet50";
+  compare(model_dir, /* use_tensorrt */ true);
+}

-  const size_t num_elements = outputs0.front().data.length() / sizeof(float);
-  const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
-  EXPECT_EQ(num_elements, num_elements1);
+TEST(TensorRT_resnext50, compare) {
+  std::string model_dir = FLAGS_infer_model + "/resnext50";
+  compare(model_dir, /* use_tensorrt */ true);
+}

-  auto *data0 = static_cast<float *>(outputs0.front().data.data());
-  auto *data1 = static_cast<float *>(outputs1.front().data.data());
+TEST(TensorRT_resnext50, profile) {
+  std::string model_dir = FLAGS_infer_model + "/resnext50";
+  profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt);
+}

-  ASSERT_GT(num_elements, 0UL);
-  for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
-    EXPECT_NEAR(data0[i], data1[i], 1e-3);
-  }
+TEST(TensorRT_mobilenet, analysis) {
+  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
+  compare(model_dir, /* use_tensorrt */ false);
 }

+} // namespace inference
 } // namespace paddle

 USE_PASS(tensorrt_subgraph_pass);
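To close, a self-contained sketch (illustration only, not Paddle code) of the elementwise-tolerance comparison that both the old EXPECT_NEAR loop and the new CompareResult/compare() path rely on: run the same input through a reference engine and an optimized engine, then require agreement within the testers' 1e-3 bound. The float-vs-double "engines" below are invented stand-ins for the native and TensorRT predictors.

#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

// Reference path: accumulates in double (stands in for the native predictor).
std::vector<float> RunReference(const std::vector<float>& in) {
  std::vector<float> out(in.size());
  double acc = 0.0;
  for (size_t i = 0; i < in.size(); ++i) {
    acc += in[i];
    out[i] = static_cast<float>(acc);
  }
  return out;
}

// Optimized path: accumulates in float (stands in for the TensorRT predictor).
std::vector<float> RunOptimized(const std::vector<float>& in) {
  std::vector<float> out(in.size());
  float acc = 0.f;
  for (size_t i = 0; i < in.size(); ++i) {
    acc += in[i];
    out[i] = acc;
  }
  return out;
}

int main() {
  std::vector<float> input(1000, 0.001f);
  std::vector<float> ref = RunReference(input);
  std::vector<float> opt = RunOptimized(input);
  assert(ref.size() == opt.size());
  for (size_t i = 0; i < ref.size(); ++i) {
    // Same spirit as EXPECT_NEAR(data0[i], data1[i], 1e-3) in the old tester
    // and CompareResult in tester_helper.h.
    assert(std::fabs(ref[i] - opt[i]) < 1e-3f);
  }
  std::puts("outputs agree within 1e-3");
  return 0;
}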