From 5a6d7fe2ff6c946b2d9fe7816a9ee8a321c1b9fa Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Tue, 18 Dec 2018 11:36:42 +0800
Subject: [PATCH 01/28] add mkl, ctc support for windows

---
 CMakeLists.txt                                |  12 +--
 cmake/cuda.cmake                              |   3 +
 cmake/cudnn.cmake                             |   1 +
 cmake/external/cub.cmake                      |   2 +-
 cmake/external/dlpack.cmake                   |   2 +-
 cmake/external/mkldnn.cmake                   |  43 +++++++---
 cmake/external/mklml.cmake                    |  83 +++++++++++--------
 cmake/external/python.cmake                   |   8 +-
 cmake/external/warpctc.cmake                  |  30 +++++--
 cmake/external/xbyak.cmake                    |   4 +-
 cmake/generic.cmake                           |   6 +-
 cmake/inference_lib.cmake                     |  16 ++--
 cmake/operators.cmake                         |   2 +-
 cmake/simd.cmake                              |  73 ++++++++--------
 paddle/fluid/framework/CMakeLists.txt         |   3 +-
 .../framework/details/all_reduce_op_handle.cc |   2 +-
 paddle/fluid/framework/mixed_vector.h         |  10 +--
 paddle/fluid/framework/op_registry.h          |   3 +-
 .../inference/api/demo_ci/CMakeLists.txt      |  15 +++-
 .../fluid/memory/detail/system_allocator.cc   |   1 -
 paddle/fluid/operators/CMakeLists.txt         |   7 +-
 paddle/fluid/operators/cum_op.h               |   2 +
 .../elementwise/elementwise_mul_mkldnn_op.cc  |   3 +
 .../operators/math/detail/lstm_cpu_kernel.h   |   6 ++
 paddle/fluid/operators/math/jit_gen.h         |   3 +
 paddle/fluid/platform/cpu_info.cc             |   7 +-
 paddle/fluid/platform/dynload/CMakeLists.txt  |   2 -
 paddle/fluid/platform/dynload/cudnn.cc        |   4 +
 paddle/fluid/platform/dynload/cudnn.h         |   2 +-
 .../fluid/platform/dynload/dynamic_loader.cc  |  16 ++++
 .../fluid/platform/dynload/dynamic_loader.h   |   6 ++
 paddle/fluid/platform/dynload/mklml.h         |   2 +-
 paddle/fluid/platform/dynload/tensorrt.h      |   2 +-
 paddle/fluid/platform/dynload/warpctc.h       |   2 +-
 paddle/fluid/platform/port.h                  |   5 +-
 paddle/fluid/train/demo/CMakeLists.txt        |  18 +++-
 python/CMakeLists.txt                         |  16 ++--
 python/paddle/fluid/__init__.py               |   9 +-
 python/paddle/fluid/framework.py              |  18 ++--
 python/setup.py.in                            |  38 +++++----
 40 files changed, 315 insertions(+), 172 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 653ae4ffe5..efdb451f65 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -125,16 +125,12 @@ if(ANDROID OR IOS)
     add_definitions(-DPADDLE_MOBILE_INFERENCE)
 endif()

-if (APPLE OR WIN32)
+if (APPLE)
     set(WITH_MKL OFF CACHE STRING
-        "Disable MKL for building on mac and windows" FORCE)
+        "Disable MKL for building on mac" FORCE)
 endif()

 if (WIN32)
-    set(WITH_DSO OFF CACHE STRING
-            "Disable DSO when compiling for Windows" FORCE)
-    set(WITH_MKL OFF CACHE STRING
-            "Disable MKL when compiling for Windows" FORCE)
     set(WITH_DISTRIBUTE OFF CACHE STRING
             "Disable DISTRIBUTE when compiling for Windows" FORCE)
     set(WITH_C_API OFF CACHE STRING
@@ -207,10 +203,10 @@ include(external/xxhash)    # download xxhash
 include(external/dlpack)
 include(external/snappy)    # download snappy
 include(external/snappystream) # download snappystream
+include(external/warpctc)   # download, build, install warpctc

 if (NOT WIN32)
-# there is no official support of warpctc, nccl, cupti in windows
-include(external/warpctc)   # download, build, install warpctc
+# there is no official support of nccl, cupti in windows
 include(cupti)
 include(external/gzstream)
 endif (NOT WIN32)
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 414e92eb27..5be7be6413 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -139,10 +139,12 @@ endfunction()
 message(STATUS "CUDA detected: " ${CUDA_VERSION})
 if (${CUDA_VERSION} LESS 7.0)
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
+  add_definitions("-DPADDLE_CUDA_BINVER=\"60\"")
 elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
   list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
   list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+  add_definitions("-DPADDLE_CUDA_BINVER=\"70\"")
 elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
   list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
@@ -150,6 +152,7 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
   # CUDA 8 may complain that sm_20 is no longer supported. Suppress the
   # warning for now.
   list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
+  add_definitions("-DPADDLE_CUDA_BINVER=\"80\"")
 endif()

 include_directories(${CUDA_INCLUDE_DIRS})
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index 09bec347db..96a9917e76 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -89,6 +89,7 @@ if(CUDNN_FOUND)
   if(NOT CUDNN_MAJOR_VERSION)
     set(CUDNN_VERSION "???")
   else()
+    add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"")
     math(EXPR CUDNN_VERSION
         "${CUDNN_MAJOR_VERSION} * 1000 +
              ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake
index c94849cf4b..f06728de91 100644
--- a/cmake/external/cub.cmake
+++ b/cmake/external/cub.cmake
@@ -32,4 +32,4 @@ endif()

 add_dependencies(cub extern_cub)

-LIST(APPEND externl_project_dependencies cub)
+LIST(APPEND external_project_dependencies cub)
diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake
index 94d8fcc668..4587475d79 100644
--- a/cmake/external/dlpack.cmake
+++ b/cmake/external/dlpack.cmake
@@ -28,4 +28,4 @@ endif()

 add_dependencies(dlpack extern_dlpack)

-LIST(APPEND externl_project_dependencies dlpack)
+LIST(APPEND external_project_dependencies dlpack)
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index b280db23b9..c29375cd05 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -23,15 +23,14 @@ SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn)
 SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
 SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)

-IF(WIN32 OR APPLE)
+IF(APPLE)
     MESSAGE(WARNING
-        "Windows or Mac is not supported with MKLDNN in Paddle yet."
+        "Mac is not supported with MKLDNN in Paddle yet."
         "Force WITH_MKLDNN=OFF")
-    SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE)
+    SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in MacOS" FORCE)
     return()
 ENDIF()

-SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
 MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
 SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
@@ -44,10 +43,14 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
 ELSE()
     MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
 ENDIF()
-SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds")
-SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
-SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
-SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
+
+IF(NOT WIN32)
+    SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds")
+    SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
+    SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
+    SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
+ENDIF(NOT WIN32)
+
 ExternalProject_Add(
     ${MKLDNN_PROJECT}
     ${EXTERNAL_PROJECT_LOG_ARGS}
@@ -58,8 +61,15 @@ ExternalProject_Add(
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
     CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+    CMAKE_ARGS      -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+    CMAKE_ARGS      -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+    CMAKE_ARGS      -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+    CMAKE_ARGS      -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
     CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
     CMAKE_ARGS      -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
     CMAKE_ARGS      -DMKLROOT=${MKLML_ROOT}
     CMAKE_ARGS      -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
     CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
@@ -67,6 +77,11 @@ ExternalProject_Add(
     CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
                      -DMKLROOT:PATH=${MKLML_ROOT}
 )
+if(WIN32)
+    SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE)
+else(WIN32)
+    SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
+endif(WIN32)
 ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
@@ -85,10 +100,14 @@ ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})

 # copy the real so.0 lib to install dir
 # it can be directly contained in wheel or capi
-SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
-ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
-    COMMAND cp ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
-    DEPENDS mkldnn)
+if(WIN32)
+    SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/lib/mkldnn.dll)
+else(WIN32)
+    SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
+    ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
+        COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
+        DEPENDS mkldnn)
+endif(WIN32)
 ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})

 IF(WITH_C_API)
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index dc5427acd4..3da552e319 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -16,59 +16,76 @@ IF(NOT ${WITH_MKLML})
     return()
 ENDIF(NOT ${WITH_MKLML})

-IF(WIN32 OR APPLE)
+IF(APPLE)
     MESSAGE(WARNING
-        "Windows or Mac is not supported with MKLML in Paddle yet."
+        "Mac is not supported with MKLML in Paddle yet."
         "Force WITH_MKLML=OFF")
     SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE)
     return()
 ENDIF()

 INCLUDE(ExternalProject)
-
-SET(MKLML_PROJECT       "extern_mklml")
-IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
-    MESSAGE(STATUS "use pre defined download url")
-    SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE)
-    SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
-ENDIF()
-MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
-SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
-SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")
 SET(MKLML_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
 SET(MKLML_INSTALL_DIR   ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
 SET(MKLML_ROOT          ${MKLML_INSTALL_DIR})
 SET(MKLML_INC_DIR       ${MKLML_ROOT}/include)
 SET(MKLML_LIB_DIR       ${MKLML_ROOT}/lib)
-SET(MKLML_LIB           ${MKLML_LIB_DIR}/libmklml_intel.so)
-SET(MKLML_IOMP_LIB      ${MKLML_LIB_DIR}/libiomp5.so)
+if(WIN32)
+    SET(MKLML_LIB             ${MKLML_LIB_DIR}/mklml.lib)
+    SET(MKLML_IOMP_LIB        ${MKLML_LIB_DIR}/libiomp5md.lib)
+    SET(MKLML_SHARED_LIB      ${MKLML_LIB_DIR}/mklml.dll)
+    SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
+else()
+    SET(MKLML_LIB             ${MKLML_LIB_DIR}/libmklml_intel.so)
+    SET(MKLML_IOMP_LIB        ${MKLML_LIB_DIR}/libiomp5.so)
+    SET(MKLML_SHARED_LIB      ${MKLML_LIB_DIR}/libmklml_intel.so)
+    SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
+endif()
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")

-INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
+if(WIN32)
+    MESSAGE(WARNING
+        "Please download the MKLML and put it at " ${THIRD_PARTY_PATH}/install/mklml)
+else()
+    SET(MKLML_PROJECT       "extern_mklml")
+    IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
+        MESSAGE(STATUS "use pre defined download url")
+        SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE)
+        SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
+    ENDIF()
+    MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
+    SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
+    SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")

-FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt
-    "PROJECT(MKLML)\n"
-    "cmake_minimum_required(VERSION 3.0)\n"
-    "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n"
-    "        DESTINATION ${MKLML_DST_DIR})\n")
+    FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt
+        "PROJECT(MKLML)\n"
+        "cmake_minimum_required(VERSION 3.0)\n"
+        "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n"
+        "        DESTINATION ${MKLML_DST_DIR})\n")

-ExternalProject_Add(
-    ${MKLML_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX                ${MKLML_SOURCE_DIR}
-    DOWNLOAD_DIR          ${MKLML_DOWNLOAD_DIR}
-    DOWNLOAD_COMMAND      wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz
-                          && tar zxf ${MKLML_VER}.tgz
-    DOWNLOAD_NO_PROGRESS  1
-    UPDATE_COMMAND        ""
-    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT}
-    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT}
-)
+    ExternalProject_Add(
+        ${MKLML_PROJECT}
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        PREFIX                ${MKLML_SOURCE_DIR}
+        DOWNLOAD_DIR          ${MKLML_DOWNLOAD_DIR}
+        DOWNLOAD_COMMAND      wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz
+                              && tar zxf ${MKLML_VER}.tgz
+        DOWNLOAD_NO_PROGRESS  1
+        UPDATE_COMMAND        ""
+        CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT}
+        CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT}
+    )
+endif()
+
+
+INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
 ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
-ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
+if(NOT WIN32)
+    ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
+endif()
 LIST(APPEND external_project_dependencies mklml)

 IF(WITH_C_API)
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index a3599dd798..edfb655541 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -23,9 +23,12 @@ FIND_PACKAGE(PythonLibs ${PY_VERSION})

 if(WIN32)
   execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
-"from distutils import sysconfig as s;import sys;import struct;
+"from distutils import sysconfig as s;import sys;import struct;import sysconfig;
 print(sys.prefix);
 print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION'));
+print(sysconfig.get_platform());
+print(sysconfig.get_config_var('py_version_nodot'));
+print(sysconfig.get_config_var('SOABI'));
 "
     RESULT_VARIABLE _PYTHON_SUCCESS
     OUTPUT_VARIABLE _PYTHON_VALUES
@@ -41,6 +44,9 @@ print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION'));
   string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES})
   list(GET _PYTHON_VALUES 0 PYTHON_PREFIX)
   list(GET _PYTHON_VALUES 1 PYTHON_LIBRARY_SUFFIX)
+  list(GET _PYTHON_VALUES 2 SYS_PLATFORM)
+  list(GET _PYTHON_VALUES 3 PYTHON_SHORT_VERSION_NODOT)
+  list(GET _PYTHON_VALUES 4 PYTHON_SOABI)

   # Make sure all directory separators are '/'
   string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX})
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 07e1137e16..7b937c93fe 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -26,25 +26,33 @@ SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
 # Used in unit test test_WarpCTCLayer
 SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib"
     CACHE PATH "Warp-ctc Library Directory" FORCE)
-SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
-    CACHE FILEPATH "Warp-ctc Library" FORCE)

-IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" )
+IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR WIN32)
     SET(USE_OMP OFF)
 ELSE()
     SET(USE_OMP ON)
 ENDIF()

+IF(WIN32)
+    SET(WARPCTC_REPOSITORY "https://github.com/wopeizl/warp-ctc.git")
+ELSE()
+    SET(WARPCTC_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git")
+ENDIF()
+
 ExternalProject_Add(
     extern_warpctc
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/dzhwinter/warp-ctc.git"
+    GIT_REPOSITORY  ${WARPCTC_REPOSITORY}
     PREFIX          ${WARPCTC_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                     -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
                     -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                     -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
                     -DWITH_GPU=${WITH_GPU}
                     -DWITH_OMP=${USE_OMP}
@@ -59,6 +67,18 @@ ExternalProject_Add(
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                      -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
 )
+IF(WIN32)
+    IF(NOT EXISTS "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}")
+        add_custom_command(TARGET extern_warpctc POST_BUILD
+            COMMAND cmake -E copy ${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} ${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}
+        )
+    ENDIF()
+    SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
+        CACHE FILEPATH "Warp-ctc Library" FORCE)
+else(WIN32)
+    SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
+        CACHE FILEPATH "Warp-ctc Library" FORCE)
+ENDIF(WIN32)
 MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
 INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})  # For warpctc code to include its headers.
diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake
index 384c2f9328..42e39fb813 100644
--- a/cmake/external/xbyak.cmake
+++ b/cmake/external/xbyak.cmake
@@ -13,8 +13,8 @@
 # limitations under the License.

 set(WITH_XBYAK ON)
-if(WIN32 OR APPLE)
-    SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE)
+if(APPLE)
+    SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in MacOS" FORCE)
     return()
 endif()
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index a8b9dcfcf5..c6fe2e970d 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -267,7 +267,11 @@ function(cc_library TARGET_NAME)
         list(APPEND cc_library_DEPS dynload_mklml)
       endif()
       add_dependencies(${TARGET_NAME} mklml)
-      target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
+      if(WIN32)
+        target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB})
+      else(WIN32)
+        target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
+      endif(WIN32)
     endif()
     # remove link to python, see notes at:
     # https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 5aa7a8a752..a5b70b3c33 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -115,20 +115,20 @@ if (NOT PROTOBUF_FOUND OR WIN32)
             )
 endif ()

-if (NOT CBLAS_FOUND)
-    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas")
-    copy(openblas_lib
-            SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
-            DSTS ${dst_dir} ${dst_dir}
-            DEPS extern_openblas
-            )
-elseif (WITH_MKLML)
+if (WITH_MKLML)
     set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml")
     copy(mklml_lib
            SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
            DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
            DEPS mklml
            )
+elseif (NOT CBLAS_FOUND OR WIN32)
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas")
+    copy(openblas_lib
+            SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
+            DSTS ${dst_dir} ${dst_dir}
+            DEPS extern_openblas
+            )
 endif ()

 if (WITH_MKLDNN)
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 2ced43f9e6..70d159b4f3 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -84,7 +84,7 @@ function(op_library TARGET)
     endif()
     if (WIN32)
     # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
-    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op")
+    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op")
       if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
         return()
       endif()
diff --git a/cmake/simd.cmake b/cmake/simd.cmake
index 86096d4fea..566dc75fda 100644
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@@ -57,46 +57,43 @@ int main()
     return 0;
 }" SSE3_FOUND)

-# disable AVX by default on windows
-if(NOT WIN32)
-    # Check AVX
-    set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
-    set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-    CHECK_CXX_SOURCE_RUNS("
-    #include <immintrin.h>
-    int main()
-    {
-        __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
-        __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
-        __m256 result = _mm256_add_ps (a, b);
-        return 0;
-    }" AVX_FOUND)
+# Check AVX
+set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
+set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+CHECK_CXX_SOURCE_RUNS("
+#include <immintrin.h>
+int main()
+{
+    __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
+    __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
+    __m256 result = _mm256_add_ps (a, b);
+    return 0;
+}" AVX_FOUND)

-    # Check AVX 2
-    set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
-    set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-    CHECK_CXX_SOURCE_RUNS("
-    #include <immintrin.h>
-    int main()
-    {
-        __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
-        __m256i result = _mm256_abs_epi32 (a);
-        return 0;
-    }" AVX2_FOUND)
+# Check AVX 2
+set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
+set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+CHECK_CXX_SOURCE_RUNS("
+#include <immintrin.h>
+int main()
+{
+    __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
+    __m256i result = _mm256_abs_epi32 (a);
+    return 0;
+}" AVX2_FOUND)

-    # Check AVX512F
-    set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
-    set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-    CHECK_CXX_SOURCE_RUNS("
-    #include <immintrin.h>
-    int main()
-    {
-        __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
-                                      13, -5, 6, -7, 9, 2, -6, 3);
-        __m512i result = _mm512_abs_epi32 (a);
-        return 0;
-    }" AVX512F_FOUND)
-endif(NOT WIN32)
+# Check AVX512F
+set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
+set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+CHECK_CXX_SOURCE_RUNS("
+#include <immintrin.h>
+int main()
+{
+    __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
+                                  13, -5, 6, -7, 9, 2, -6, 3);
+    __m512i result = _mm512_abs_epi32 (a);
+    return 0;
+}" AVX512F_FOUND)

 set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
 mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 225dfb3e70..90083f690f 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -15,8 +15,7 @@ function(windows_symbolic TARGET)
     file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc)

     add_custom_command(OUTPUT ${final_path}/.${src}.cu
-      COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu
-      COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cc" "${final_path}/.${src}.cu"
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu"
       COMMENT "create hidden file of ${src}.cu")
     add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
   endforeach()
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 9eaff1f560..de7c845884 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -50,7 +50,7 @@ void AllReduceOpHandle::RunImpl() {

   // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR,
   // this is a distributed or inter-process call, find a better way.
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   if (NoDummyInputSize() == 1 &&
       local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) {
 #else
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
index 6940250c3f..c3a044d22c 100644
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -215,8 +215,8 @@ class Vector {
     auto stream = dev_ctx->stream();
     void *src = gpu_->ptr();
     void *dst = cpu_.data();
-    memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
-                 gpu_->size(), stream);
+    paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
+                         gpu_->size(), stream);
     dev_ctx->Wait();
   }

@@ -261,8 +261,8 @@ class Vector {
     auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
         platform::DeviceContextPool::Instance().Get(place));
     auto stream = dev_ctx->stream();
-    memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
-                 gpu_->size(), stream);
+    paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
+                         gpu_->size(), stream);
   }

   void ImmutableCPU() const {
@@ -284,7 +284,7 @@ class Vector {
   bool IsInCPU() const { return flag_ & kDataInCPU; }

   mutable std::vector<T> cpu_;
-  mutable memory::AllocationPtr gpu_;
+  mutable paddle::memory::AllocationPtr gpu_;
   mutable int flag_;

   mutable std::mutex mtx_;
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index 6d39bb3c52..2c1648c81f 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -23,7 +23,8 @@ limitations under the License. */
 #include
 #include
-#include "glog/logging.h"  // For VLOG()
+#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
+#include "glog/logging.h"               // For VLOG()
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/details/op_registry.h"
 #include "paddle/fluid/framework/framework.pb.h"
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index 8d0d96d391..f42ee9a697 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -89,12 +89,21 @@ endif()

 if(WITH_MKL)
   include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
-  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
-               ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
+  if(NOT WIN32)
+    set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
+                 ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
+  else(WIN32)
+    set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml${CMAKE_SHARED_LIBRARY_SUFFIX}
+                 ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5md${CMAKE_SHARED_LIBRARY_SUFFIX})
+  endif(WIN32)
   set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
   if(EXISTS ${MKLDNN_PATH})
     include_directories("${MKLDNN_PATH}/include")
-    set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
+    if(WIN32)
+      set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib)
+    else(WIN32)
+      set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
+    endif(WIN32)
   endif()
 else()
   set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX})
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 3e8fb83e9d..307c348822 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -17,7 +17,6 @@ limitations under the License. */
 #ifdef _WIN32
 #include
-#include <windows.h>   // VirtualLock/VirtualUnlock
 #else
 #include <sys/mman.h>  // for mlock and munlock
 #endif
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 257bfc0a3f..95ad67e33e 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -44,9 +44,8 @@ endif()

 register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})

-
 # warpctc_op needs cudnn 7 above
-if (WITH_GPU AND NOT WIN32)
+if (WITH_GPU)
     if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
         op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
     else()
@@ -64,9 +63,7 @@ endif()

 set(COMMON_OP_DEPS ${OP_HEADER_DEPS})

 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
-if (NOT WIN32)
-    set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
-endif()
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
 if (WITH_GPU)
diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h
index 999fdcff90..7c0fda4169 100644
--- a/paddle/fluid/operators/cum_op.h
+++ b/paddle/fluid/operators/cum_op.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+
+#include
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
index c600d1e3d7..bf9aef9135 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
@@ -19,6 +19,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/mkldnn_helper.h"

 #include "paddle/fluid/operators/math/jit_kernel.h"
+#if defined(_WIN32) && defined(_WINSOCKAPI_)
+#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */
+#endif
 #include "xbyak/xbyak.h"
 #include "xbyak/xbyak_util.h"
diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
index ccbd05c82a..2e3779ff08 100644
--- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
@@ -17,6 +17,12 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"

+#if defined(_WIN32)
+#if defined(__AVX2__) || defined(__AVX__)
+inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); }
+#endif
+#endif
+
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/jit_gen.h b/paddle/fluid/operators/math/jit_gen.h
index 6abf3434cc..2bc740e598 100644
--- a/paddle/fluid/operators/math/jit_gen.h
+++ b/paddle/fluid/operators/math/jit_gen.h
@@ -18,6 +18,9 @@ limitations under the License. */
 #include
 #include "paddle/fluid/platform/macros.h"

+#if defined(_WIN32) && defined(_WINSOCKAPI_)
+#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */
+#endif
 #define XBYAK_USE_MMAP_ALLOCATOR
 #include "xbyak/xbyak.h"
 #include "xbyak/xbyak_util.h"
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index f9a32bfa4c..1642c17809 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -14,6 +14,10 @@ limitations under the License. */

 #include "paddle/fluid/platform/cpu_info.h"

+#if defined(_WIN32)
+#define NOMINMAX  // msvc max/min macro conflict with std::min/max
+#endif
+
 #ifdef PADDLE_WITH_XBYAK
 #include "xbyak/xbyak.h"
 #include "xbyak/xbyak_util.h"
@@ -22,9 +26,8 @@ limitations under the License. */
 #ifdef __APPLE__
 #include
 #include
-
 #elif defined(_WIN32)
-#define NOMINMAX  // msvc max/min macro conflict with std::min/max
+#define WIN32_LEAN_AND_MEAN
 #include <windows.h>
 #else
 #include
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 5939c500c9..07159d4a12 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -16,9 +16,7 @@ if (CUPTI_FOUND)
     list(APPEND CUDA_SRCS cupti.cc)
 endif(CUPTI_FOUND)
 nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
-if (NOT WIN32)
 cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
-endif(NOT WIN32)
 if (WITH_MKLML)
   cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
 endif()
diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc
index f3cd3b2bbe..91d9a1ef01 100644
--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
@@ -38,6 +38,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
 CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
 #endif

+#ifdef CUDNN_DNN_ROUTINE_EACH_R6
+CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP);
+#endif
+
 #ifdef CUDNN_DNN_ROUTINE_EACH_R7
 CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
 #endif
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 550fe2edee..2f4f8101e4 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -34,7 +34,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
 #define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                            \
   struct DynLoad__##__name {                                               \
     template <typename... Args>                                            \
-    auto operator()(Args... args) -> decltype(__name(args...)) {           \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {       \
       using cudnn_func = decltype(&::__name);                              \
       std::call_once(cudnn_dso_flag, []() {                                \
         cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index cc5cda6106..15d5168366 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -53,6 +53,12 @@ namespace platform {
 namespace dynload {
 static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;

+#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll";
+static constexpr char* win_curand_lib = "curand64_" PADDLE_CUDA_BINVER ".dll";
+static constexpr char* win_cudnn_lib = "cudnn64_" PADDLE_CUDNN_BINVER ".dll";
+#endif
+
 static inline std::string join(const std::string& part1,
                                const std::string& part2) {
   // directory separator
@@ -165,6 +171,8 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
 void* GetCublasDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib);
 #else
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
 #endif
@@ -173,6 +181,8 @@ void* GetCublasDsoHandle() {
 void* GetCUDNNDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false);
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib);
 #else
   return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false);
 #endif
@@ -193,6 +203,8 @@ void* GetCUPTIDsoHandle() {
 void* GetCurandDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib);
 #else
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
 #endif
@@ -201,6 +213,8 @@ void* GetCurandDsoHandle() {
 void* GetWarpCTCDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "warpctc.dll");
 #else
   return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so");
 #endif
@@ -225,6 +239,8 @@ void* GetTensorRtDsoHandle() {
 void* GetMKLMLDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll");
 #else
   return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so");
 #endif
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h
index 84fd2ce998..edb4c649ad 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@@ -18,6 +18,12 @@ namespace paddle {
 namespace platform {
 namespace dynload {

+#ifndef _WIN32
+#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__))
+#else
+#define DECLARE_TYPE(__name, ...) decltype(auto)
+#endif
+
 void* GetCublasDsoHandle();
 void* GetCUDNNDsoHandle();
 void* GetCUPTIDsoHandle();
diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h
index f0a9736623..944b00bae1 100644
--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -34,7 +34,7 @@ extern void* mklml_dso_handle;
 #define DYNAMIC_LOAD_MKLML_WRAP(__name)                                    \
   struct DynLoad__##__name {                                               \
     template <typename... Args>                                            \
-    auto operator()(Args... args) -> decltype(__name(args...)) {           \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {       \
       using mklmlFunc = decltype(&::__name);                               \
       std::call_once(mklml_dso_flag, []() {                                \
         mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \
diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h
index 5d67658b94..751aa54b1a 100644
--- a/paddle/fluid/platform/dynload/tensorrt.h
+++ b/paddle/fluid/platform/dynload/tensorrt.h
@@ -33,7 +33,7 @@ extern void* tensorrt_dso_handle;
 #define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name)                   \
   struct DynLoad__##__name {                                         \
     template <typename... Args>                                      \
-    auto operator()(Args... args) -> decltype(__name(args...)) {     \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
       using tensorrt_func = decltype(__name(args...)) (*)(Args...);  \
       std::call_once(tensorrt_dso_flag, []() {                       \
         tensorrt_dso_handle =                                        \
diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h
index 18ed9956f1..bc1977b05d 100644
--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
@@ -34,7 +34,7 @@ extern void* warpctc_dso_handle;
 #define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                                    \
   struct DynLoad__##__name {                                                 \
     template <typename... Args>                                              \
-    auto operator()(Args... args) -> decltype(__name(args...)) {             \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {         \
      using warpctcFunc = decltype(&::__name);                                \
      std::call_once(warpctc_dso_flag, []() {                                 \
        warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
index ad070171df..41388d8959 100644
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -37,6 +37,10 @@
 #define GOOGLE_GLOG_DLL_DECL
 #include <stdio.h>   // _popen, _pclose
 #include
+#ifdef _WINSOCKAPI_
+/* Prevent inclusion of winsock.h in windows.h */
+#define WIN32_LEAN_AND_MEAN
+#endif
 #include <windows.h>
 #include <numeric>   // std::accumulate in msvc
 #ifndef S_ISDIR      // windows port for sys/stat.h
@@ -55,7 +59,6 @@ static void *dlsym(void *handle, const char *symbol_name) {

 static void *dlopen(const char *filename, int flag) {
   std::string file_name(filename);
-  file_name.replace(0, file_name.size() - 1, '/', '\\');
   HMODULE hModule = LoadLibrary(file_name.c_str());
   if (!hModule) {
     throw std::runtime_error(file_name + " not found.");
diff --git a/paddle/fluid/train/demo/CMakeLists.txt b/paddle/fluid/train/demo/CMakeLists.txt
index eabb51d370..af033fa740 100644
--- a/paddle/fluid/train/demo/CMakeLists.txt
+++ b/paddle/fluid/train/demo/CMakeLists.txt
@@ -35,16 +35,26 @@ add_executable(demo_trainer demo_trainer.cc)

 if(WITH_MKLDNN)
   include_directories("${PADDLE_LIB}/third_party/install/mkldnn/include")
-  set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0)
-endif()
+  if(WIN32)
+    set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/mkldnn.lib)
+  else(WIN32)
+    set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0)
+  endif(WIN32)
+endif(WITH_MKLDNN)

 if(WITH_MKL)
   include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
-  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so)
+  if(WIN32)
+    set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/mklml.lib)
+  else(WIN32)
+    set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so)
+  endif(WIN32)
 else()
   if(APPLE)
     set(MATH_LIB cblas)
-  else(APPLE)
+  elseif(WIN32)
+    set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.lib)
+  else()
     set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
   endif(APPLE)
 endif()
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 139176b0d6..078d543ba2 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -48,12 +48,18 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
 IF(WIN32)
     # Python would use the .pyd by default under Windows series platform
     set(FLUID_DST_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/)
-    get_filename_component(openblas_refpath ${CBLAS_LIBRARIES} DIRECTORY)
     set(FLUID_CORE ${FLUID_DST_DIR}/core.pyd)
-    add_custom_command(OUTPUT ${FLUID_CORE}
-            COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
-            COMMAND cmake -E copy ${openblas_refpath}/openblas.dll ${FLUID_DST_DIR}
-            DEPENDS paddle_pybind)
+    if(NOT WITH_MKLDNN)
+        get_filename_component(openblas_refpath ${CBLAS_LIBRARIES} DIRECTORY)
+        add_custom_command(OUTPUT ${FLUID_CORE}
+                COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
+                COMMAND cmake -E copy ${openblas_refpath}/openblas.dll ${FLUID_DST_DIR}
+                DEPENDS paddle_pybind)
+    else(NOT WITH_MKLDNN)
+        add_custom_command(OUTPUT ${FLUID_CORE}
+                COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
+                DEPENDS paddle_pybind)
+    endif(NOT WITH_MKLDNN)
 ELSE()
     set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so)
     add_custom_command(OUTPUT ${FLUID_CORE}
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 2dea71d7af..fd788d0929 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -102,6 +102,12 @@ def __bootstrap__():
     import sys
     import os
     import platform
+
+    if os.name == 'nt':
+        third_lib_path = os.path.abspath(os.path.dirname(__file__)) + os.sep + '..' + os.sep + 'libs'
+        os.environ['path'] += ';' + third_lib_path
+        sys.path.append(third_lib_path)
+
     from . import core

     in_test = 'unittest' in sys.modules
@@ -128,13 +134,12 @@ def __bootstrap__():
         'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
         'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
         'allocator_strategy', 'reader_queue_speed_test_mode',
-        'print_sub_graph_dir', 'pe_profile_fname'
+        'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')

     if os.name != 'nt':
-        read_env_flags.append('warpctc_dir')
         read_env_flags.append('cpu_deterministic')

     if core.is_compiled_with_dist():
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 0897920594..da74fd41fc 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -16,6 +16,7 @@ from __future__ import print_function

 import collections
 import contextlib
+import os
 import re
 import six
 import sys
@@ -27,11 +28,18 @@ from .proto import framework_pb2
 try:
     from . import core
 except ImportError as e:
-    raise ImportError(
-        """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
-    if you encounters \"libmkldnn.so not found\" errors. If you have python
-    installed in other directory, replace \"/usr/local/lib\" with your own
-    directory. The original error is: \n""" + cpt.get_exception_message(e))
+    if os.name == 'nt':
+        raise ImportError(
+            """NOTE: You may need to run \"set PATH=c:\python27\lib;%PATH%\"
+        if you encounter \"mkldnn.dll not found\" errors. If you have python
+        installed in other directory, replace \"c:\python27\lib\" with your own
+        directory. The original error is: \n""" + cpt.get_exception_message(e))
+    else:
+        raise ImportError(
+            """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
+        if you encounter \"libmkldnn.so not found\" errors. If you have python
+        installed in other directory, replace \"/usr/local/lib\" with your own
+        directory. The original error is: \n""" + cpt.get_exception_message(e))
 except Exception as e:
     raise e
 from . import unique_name
diff --git a/python/setup.py.in b/python/setup.py.in
index 6562046641..f4613dd72d 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -158,27 +158,29 @@ if '${WITH_FLUID_ONLY}'== 'OFF':

 # put all thirdparty libraries in paddle.libs
 libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs'
-if os.name != 'nt':
-    package_data['paddle.libs']= []
-    package_data['paddle.libs']=['libwarpctc' + ext_name]
-    shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
+
+package_data['paddle.libs']= []
+package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name]
+shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
+
 if '${WITH_MKL}' == 'ON':
-    shutil.copy('${MKLML_LIB}', libs_path)
-    shutil.copy('${MKLML_IOMP_LIB}', libs_path)
-    package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name]
+    shutil.copy('${MKLML_SHARED_LIB}', libs_path)
+    shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path)
+    package_data['paddle.libs']+=[('libmklml_intel' if os.name != 'nt' else 'mklml') + ext_name, ('libiomp5' if os.name != 'nt' else 'libiomp5md') + ext_name]
 if '${WITH_MKLDNN}' == 'ON':
     if '${CMAKE_BUILD_TYPE}' == 'Release':
-        # only change rpath in Release mode.
-        # TODO(typhoonzero): use install_name_tool to patch mkl libs once
-        # we can support mkl on mac.
-        #
-        # change rpath of libmkldnn.so.0, add $ORIGIN/ to it.
-        # The reason is that all thirdparty libraries in the same directory,
-        # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so.
-        command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
-        if os.system(command) != 0:
-            raise Exception("patch libmkldnn.so failed, command: %s" % command)
-    package_data['paddle.libs']+=['libmkldnn.so.0']
+        if os.name != 'nt':
+            # only change rpath in Release mode.
+            # TODO(typhoonzero): use install_name_tool to patch mkl libs once
+            # we can support mkl on mac.
+            #
+            # change rpath of libmkldnn.so.0, add $ORIGIN/ to it.
+            # The reason is that all thirdparty libraries in the same directory,
+            # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so.
+            command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
+            if os.system(command) != 0:
+                raise Exception("patch libmkldnn.so failed, command: %s" % command)
+    package_data['paddle.libs']+=['libmkldnn.so.0' if os.name != 'nt' else ('mkldnn' + ext_name)]
     shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
 if '${WITH_NGRAPH}' == 'ON':
     # only change rpath in Release mode,

From 001891aea69a96a725c8026a82d5a7dd45ed558f Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Tue, 18 Dec 2018 13:15:54 +0800
Subject: [PATCH 02/28] fix code style

test=develop
---
 python/paddle/fluid/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index fd788d0929..ecf75f7282 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -104,7 +104,8 @@ def __bootstrap__():
     import platform

     if os.name == 'nt':
-        third_lib_path = os.path.abspath(os.path.dirname(__file__)) + os.sep + '..' + os.sep + 'libs'
+        third_lib_path = os.path.abspath(os.path.dirname(
+            __file__)) + os.sep + '..' + os.sep + 'libs'
         os.environ['path'] += ';' + third_lib_path
         sys.path.append(third_lib_path)

     from . import core

From b601f2de8d3d1a336810438c521714749f8a19a6 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Tue, 18 Dec 2018 13:38:08 +0800
Subject: [PATCH 03/28] include the mkl fix only

test=develop
---
 CMakeLists.txt                                |  4 +-
 cmake/cuda.cmake                              |  3 -
 cmake/cudnn.cmake                             |  1 -
 cmake/operators.cmake                         |  2 +-
 cmake/simd.cmake                              | 73 ++++++++++---------
 paddle/fluid/framework/CMakeLists.txt         |  3 +-
 paddle/fluid/framework/mixed_vector.h         | 10 +--
 paddle/fluid/framework/op_registry.h          |  3 +-
 .../fluid/memory/detail/system_allocator.cc   |  1 +
 paddle/fluid/operators/CMakeLists.txt         |  7 +-
 paddle/fluid/operators/cum_op.h               |  2 -
 .../elementwise/elementwise_mul_mkldnn_op.cc  |  3 -
 .../operators/math/detail/lstm_cpu_kernel.h   |  6 --
 paddle/fluid/operators/math/jit_gen.h         |  3 -
 paddle/fluid/platform/dynload/CMakeLists.txt  |  2 +
 paddle/fluid/platform/dynload/cudnn.cc        |  4 -
 .../fluid/platform/dynload/dynamic_loader.cc  | 16 ----
 python/setup.py.in                            |  9 +--
 18 files changed, 61 insertions(+), 91 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index efdb451f65..aa9446a694 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -203,10 +203,10 @@ include(external/xxhash)    # download xxhash
 include(external/dlpack)
 include(external/snappy)    # download snappy
 include(external/snappystream) # download snappystream
-include(external/warpctc)   # download, build, install warpctc

 if (NOT WIN32)
-# there is no official support of nccl, cupti in windows
+# there is no official support of warpctc, nccl, cupti in windows
+include(external/warpctc)   # download, build, install warpctc
 include(cupti)
 include(external/gzstream)
 endif (NOT WIN32)
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 5be7be6413..414e92eb27 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -139,12 +139,10 @@ endfunction()
 message(STATUS "CUDA detected: " ${CUDA_VERSION})
 if (${CUDA_VERSION} LESS 7.0)
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
-  add_definitions("-DPADDLE_CUDA_BINVER=\"60\"")
 elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
   list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
   list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
-  add_definitions("-DPADDLE_CUDA_BINVER=\"70\"")
 elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
   list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
@@ -152,7 +150,6 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
   # CUDA 8 may complain that sm_20 is no longer supported. Suppress the
   # warning for now.
   list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
-  add_definitions("-DPADDLE_CUDA_BINVER=\"80\"")
 endif()

 include_directories(${CUDA_INCLUDE_DIRS})
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index 96a9917e76..09bec347db 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -89,7 +89,6 @@ if(CUDNN_FOUND)
   if(NOT CUDNN_MAJOR_VERSION)
     set(CUDNN_VERSION "???")
   else()
-    add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"")
     math(EXPR CUDNN_VERSION
         "${CUDNN_MAJOR_VERSION} * 1000 +
              ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 70d159b4f3..2ced43f9e6 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -84,7 +84,7 @@ function(op_library TARGET)
     endif()
     if (WIN32)
     # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
-    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op")
+    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op")
       if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
         return()
       endif()
diff --git a/cmake/simd.cmake b/cmake/simd.cmake
index 566dc75fda..86096d4fea 100644
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@@ -57,43 +57,46 @@ int main()
     return 0;
 }" SSE3_FOUND)

-# Check AVX
-set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
-set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
-    __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
-    __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
-    __m256 result = _mm256_add_ps (a, b);
-    return 0;
-}" AVX_FOUND)
+# disable AVX by default on windows
+if(NOT WIN32)
+    # Check AVX
+    set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
+    set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+    CHECK_CXX_SOURCE_RUNS("
+    #include <immintrin.h>
+    int main()
+    {
+        __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
+        __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
+        __m256 result = _mm256_add_ps (a, b);
+        return 0;
+    }" AVX_FOUND)

-# Check AVX 2
-set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
-set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
-    __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
-    __m256i result = _mm256_abs_epi32 (a);
-    return 0;
-}" AVX2_FOUND)
+    # Check AVX 2
+    set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
+    set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+    CHECK_CXX_SOURCE_RUNS("
+    #include <immintrin.h>
+    int main()
+    {
+        __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
+        __m256i result = _mm256_abs_epi32 (a);
+        return 0;
+    }" AVX2_FOUND)

-# Check AVX512F
-set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
-set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
-    __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
-                                  13, -5, 6, -7, 9, 2, -6, 3);
-    __m512i result = _mm512_abs_epi32 (a);
-    return 0;
-}" AVX512F_FOUND)
+    # Check AVX512F
+    set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
+    set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+    CHECK_CXX_SOURCE_RUNS("
+    #include <immintrin.h>
+    int main()
+    {
+        __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
+                                      13, -5, 6, -7, 9, 2, -6, 3);
+        __m512i result = _mm512_abs_epi32 (a);
+        return 0;
+    }" AVX512F_FOUND)
+endif(NOT WIN32)

 set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
 mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 90083f690f..225dfb3e70 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -15,7 +15,8 @@ function(windows_symbolic TARGET)
     file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc)

     add_custom_command(OUTPUT ${final_path}/.${src}.cu
-      COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu"
+      COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu
+      COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cc" "${final_path}/.${src}.cu"
       COMMENT "create hidden file of ${src}.cu")
     add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
   endforeach()
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
index c3a044d22c..6940250c3f 100644
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -215,8 +215,8 @@ class Vector {
     auto stream = dev_ctx->stream();
     void *src = gpu_->ptr();
     void *dst = cpu_.data();
-    paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
-                         gpu_->size(), stream);
+    memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
+                 gpu_->size(), stream);
     dev_ctx->Wait();
   }

@@ -261,8 +261,8 @@ class Vector {
     auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
         platform::DeviceContextPool::Instance().Get(place));
     auto stream = dev_ctx->stream();
-    paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
-                         gpu_->size(), stream);
+    memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
+                 gpu_->size(), stream);
   }

   void ImmutableCPU() const {
@@ -284,7 +284,7 @@ class Vector {
   bool IsInCPU() const { return flag_ & kDataInCPU; }

   mutable std::vector<T> cpu_;
-  mutable paddle::memory::AllocationPtr gpu_;
+  mutable memory::AllocationPtr gpu_;
   mutable int flag_;

   mutable std::mutex mtx_;
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index 2c1648c81f..6d39bb3c52 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -23,8 +23,7 @@ limitations under the License. */
 #include
 #include
-#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
-#include "glog/logging.h"               // For VLOG()
+#include "glog/logging.h"  // For VLOG()
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/details/op_registry.h"
 #include "paddle/fluid/framework/framework.pb.h"
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 307c348822..3e8fb83e9d 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #ifdef _WIN32
 #include
+#include <windows.h>   // VirtualLock/VirtualUnlock
 #else
 #include <sys/mman.h>  // for mlock and munlock
 #endif
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 95ad67e33e..257bfc0a3f 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -44,8 +44,9 @@ endif()

 register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})

+
 # warpctc_op needs cudnn 7 above
-if (WITH_GPU)
+if (WITH_GPU AND NOT WIN32)
     if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
         op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
     else()
@@ -63,7 +64,9 @@ endif()

 set(COMMON_OP_DEPS ${OP_HEADER_DEPS})

 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
+if (NOT WIN32)
+    set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
+endif()
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
 if (WITH_GPU)
diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h
index 7c0fda4169..999fdcff90 100644
--- a/paddle/fluid/operators/cum_op.h
+++ b/paddle/fluid/operators/cum_op.h
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-
-#include
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
index bf9aef9135..c600d1e3d7 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
@@ -19,9 +19,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/mkldnn_helper.h"

 #include "paddle/fluid/operators/math/jit_kernel.h"
-#if defined(_WIN32) && defined(_WINSOCKAPI_)
-#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */
-#endif
 #include "xbyak/xbyak.h"
 #include "xbyak/xbyak_util.h"
diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
index 2e3779ff08..ccbd05c82a 100644
--- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
@@ -17,12 +17,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"

-#if defined(_WIN32)
-#if defined(__AVX2__) || defined(__AVX__)
-inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); }
-#endif
-#endif
-
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/jit_gen.h b/paddle/fluid/operators/math/jit_gen.h
index 2bc740e598..6abf3434cc 100644
--- a/paddle/fluid/operators/math/jit_gen.h
+++ b/paddle/fluid/operators/math/jit_gen.h
@@ -18,9 +18,6 @@ limitations under the License. */
 #include
 #include "paddle/fluid/platform/macros.h"

-#if defined(_WIN32) && defined(_WINSOCKAPI_)
-#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */
-#endif
 #define XBYAK_USE_MMAP_ALLOCATOR
 #include "xbyak/xbyak.h"
 #include "xbyak/xbyak_util.h"
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 07159d4a12..5939c500c9 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -16,7 +16,9 @@ if (CUPTI_FOUND)
     list(APPEND CUDA_SRCS cupti.cc)
 endif(CUPTI_FOUND)
 nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
+if (NOT WIN32)
 cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
+endif(NOT WIN32)
 if (WITH_MKLML)
   cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
 endif()
diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc
index 91d9a1ef01..f3cd3b2bbe 100644
--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
@@ -38,10 +38,6 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
 CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
 #endif

-#ifdef CUDNN_DNN_ROUTINE_EACH_R6
-CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP);
-#endif
-
 #ifdef CUDNN_DNN_ROUTINE_EACH_R7
 CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
 #endif
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index 15d5168366..cc5cda6106 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -53,12 +53,6 @@ namespace platform {
 namespace dynload {
 static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;

-#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll";
-static constexpr char* win_curand_lib = "curand64_" PADDLE_CUDA_BINVER ".dll";
-static constexpr char* win_cudnn_lib = "cudnn64_" PADDLE_CUDNN_BINVER ".dll";
-#endif
-
 static inline std::string join(const std::string& part1,
                                const std::string& part2) {
   // directory separator
@@ -171,8 +165,6 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
 void* GetCublasDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
-#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib);
 #else
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
 #endif
@@ -181,8 +173,6 @@ void* GetCublasDsoHandle() {
 void* GetCUDNNDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false);
-#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib);
 #else
   return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false);
 #endif
@@ -203,8 +193,6 @@ void* GetCUPTIDsoHandle() {
 void* GetCurandDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
-#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib);
 #else
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
 #endif
@@ -213,8 +201,6 @@ void* GetCurandDsoHandle() {
 void* GetWarpCTCDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib");
-#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "warpctc.dll"); #else return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so"); #endif @@ -239,8 +225,6 @@ void* GetTensorRtDsoHandle() { void* GetMKLMLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll"); #else return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); #endif diff --git a/python/setup.py.in b/python/setup.py.in index f4613dd72d..ff3aca5714 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -158,11 +158,10 @@ if '${WITH_FLUID_ONLY}'== 'OFF': # put all thirdparty libraries in paddle.libs libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' - -package_data['paddle.libs']= [] -package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name] -shutil.copy('${WARPCTC_LIBRARIES}', libs_path) - +if os.name != 'nt': + package_data['paddle.libs']= [] + package_data['paddle.libs']=['libwarpctc' + ext_name] + shutil.copy('${WARPCTC_LIBRARIES}', libs_path) if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_SHARED_LIB}', libs_path) shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path) From fa135bbf525fba34e16f9c1e80a35382c5b1c983 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 18 Dec 2018 15:38:33 +0800 Subject: [PATCH 04/28] Fix the mkl build script on windows test=develop --- cmake/external/mklml.cmake | 58 +++++++++---------- .../elementwise/elementwise_mul_mkldnn_op.cc | 3 + .../operators/math/detail/lstm_cpu_kernel.h | 6 ++ paddle/fluid/operators/math/jit_gen.h | 3 + python/CMakeLists.txt | 17 +----- python/setup.py.in | 6 ++ 6 files changed, 48 insertions(+), 45 deletions(-) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 3da552e319..505f8b3834 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -44,40 +44,36 @@ else() endif() SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") -if(WIN32) - MESSAGE(WARNING - "Please download the MKLML and and put it at " ${THIRD_PARTY_PATH}/install/mklml) -else() - SET(MKLML_PROJECT "extern_mklml") - IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL)) - MESSAGE(STATUS "use pre defined download url") - SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE) - SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) +IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL)) + MESSAGE(STATUS "use pre defined download url") + if(WIN32) + SET(MKLML_VER "mklml_win_2019.0.1.20180928" CACHE STRING "" FORCE) + SET(MKLML_URL "https://github.com/intel/mkl-dnn/releases/download/v0.17/${MKLML_VER}.zip" CACHE STRING "" FORCE) + else() + SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE) + SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) ENDIF() - MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") - SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") - SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") - - FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(MKLML)\n" - "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n" - " DESTINATION ${MKLML_DST_DIR})\n") - - ExternalProject_Add( - ${MKLML_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${MKLML_SOURCE_DIR} - DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} - 
DOWNLOAD_COMMAND wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz - && tar zxf ${MKLML_VER}.tgz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT} - ) endif() +SET(MKLML_PROJECT "extern_mklml") +MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") +SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") +SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") + +ExternalProject_Add( + ${MKLML_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${MKLML_SOURCE_DIR} + URL ${MKLML_URL} + DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND + ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/include ${MKLML_INC_DIR} && + ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/lib ${MKLML_LIB_DIR} +) INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index c600d1e3d7..bf9aef9135 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -19,6 +19,9 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/operators/math/jit_kernel.h" +#if defined(_WIN32) && defined(_WINSOCKAPI_) +#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */ +#endif #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h index ccbd05c82a..2e3779ff08 100644 --- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h @@ -17,6 +17,12 @@ limitations under the License. */ #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/lstm_compute.h" +#if defined(_WIN32) +#if defined(__AVX2__) || defined(__AVX__) +inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); } +#endif +#endif + namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/jit_gen.h b/paddle/fluid/operators/math/jit_gen.h index 6abf3434cc..2bc740e598 100644 --- a/paddle/fluid/operators/math/jit_gen.h +++ b/paddle/fluid/operators/math/jit_gen.h @@ -18,6 +18,9 @@ limitations under the License. 
*/
#include
#include "paddle/fluid/platform/macros.h"
+#if defined(_WIN32) && defined(_WINSOCKAPI_)
+#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */
+#endif
#define XBYAK_USE_MMAP_ALLOCATOR
#include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h"
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 078d543ba2..72c0d03e52 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -49,23 +49,12 @@ IF(WIN32)
# Python would use the .pyd by default under Windows series platform
set(FLUID_DST_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/)
set(FLUID_CORE ${FLUID_DST_DIR}/core.pyd)
- if(NOT WITH_MKLDNN)
- get_filename_component(openblas_refpath ${CBLAS_LIBRARIES} DIRECTORY)
- add_custom_command(OUTPUT ${FLUID_CORE}
- COMMAND cmake -E copy $ ${FLUID_CORE}
- COMMAND cmake -E copy ${openblas_refpath}/openblas.dll ${FLUID_DST_DIR}
- DEPENDS paddle_pybind)
- else(NOT WITH_MKLDNN)
- add_custom_command(OUTPUT ${FLUID_CORE}
- COMMAND cmake -E copy $ ${FLUID_CORE}
- DEPENDS paddle_pybind)
- endif(NOT WITH_MKLDNN)
ELSE()
set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so)
- add_custom_command(OUTPUT ${FLUID_CORE}
- COMMAND cmake -E copy $ ${FLUID_CORE}
- DEPENDS paddle_pybind)
ENDIF()
+add_custom_command(OUTPUT ${FLUID_CORE}
+ COMMAND cmake -E copy $ ${FLUID_CORE}
+ DEPENDS paddle_pybind)
add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE})
IF(WIN32)
diff --git a/python/setup.py.in b/python/setup.py.in
index ff3aca5714..8973d883e4 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -166,6 +166,12 @@ if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_SHARED_LIB}', libs_path)
shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path)
package_data['paddle.libs']+=[('libmklml_intel' if os.name != 'nt' else 'mklml') + ext_name, ('libiomp5' if os.name != 'nt' else 'libiomp5md') + ext_name]
+else:
+ # copy the openblas.dll
+ if os.name == 'nt':
+ shutil.copy('${CBLAS_LIBRARIES}', libs_path)
+ package_data['paddle.libs']+=['openblas' + ext_name]
+
if '${WITH_MKLDNN}' == 'ON':
if '${CMAKE_BUILD_TYPE}' == 'Release':
if os.name != 'nt':
From 17fb3253c30f4ebeb8f6058a6e770344e52a0fad Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Tue, 18 Dec 2018 15:41:54 +0800
Subject: [PATCH 05/28] keep the mkl win's version consistent with Linux's
 test=develop

---
 cmake/external/mklml.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 505f8b3834..96676f0be8 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -47,8 +47,8 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
MESSAGE(STATUS "use pre defined download url")
if(WIN32)
- SET(MKLML_VER "mklml_win_2019.0.1.20180928" CACHE STRING "" FORCE)
- SET(MKLML_URL "https://github.com/intel/mkl-dnn/releases/download/v0.17/${MKLML_VER}.zip" CACHE STRING "" FORCE)
+ SET(MKLML_VER "mklml_win_2019.0.20180710" CACHE STRING "" FORCE)
+ SET(MKLML_URL "https://github.com/intel/mkl-dnn/releases/download/v0.16/${MKLML_VER}.zip" CACHE STRING "" FORCE)
else()
SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
ENDIF()
From fdab7f749e68c86cc732c7570bbe327d630f5dc9 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Tue, 18 Dec 2018 17:04:48 +0800
Subject: [PATCH 06/28] fix the setup script issue
 test=develop

---
 python/setup.py.in | 9 ++++-----
 1 file changed, 4
insertions(+), 5 deletions(-) diff --git a/python/setup.py.in b/python/setup.py.in index 8973d883e4..bfbaa1d015 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -138,8 +138,6 @@ if '${WITH_FLUID_ONLY}'== 'OFF': '${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] package_data={'paddle.fluid': ['core' + (ext_name if os.name != 'nt' else '.pyd')]} -if os.name == 'nt': - package_data['paddle.fluid'] += ['openblas' + ext_name] if '${WITH_FLUID_ONLY}'== 'OFF': package_data['paddle.v2.master']=['libpaddle_master' + ext_name] @@ -162,15 +160,16 @@ if os.name != 'nt': package_data['paddle.libs']= [] package_data['paddle.libs']=['libwarpctc' + ext_name] shutil.copy('${WARPCTC_LIBRARIES}', libs_path) + if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_SHARED_LIB}', libs_path) shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path) package_data['paddle.libs']+=[('libmklml_intel' if os.name != 'nt' else 'mklml') + ext_name, ('libiomp5' if os.name != 'nt' else 'libiomp5md') + ext_name] else: - # copy the openblas.dll if os.name == 'nt': - shutil.copy('${CBLAS_LIBRARIES}', libs_path) - package_data['paddle.libs']+=['openblas' + ext_name] + # copy the openblas.dll + shutil.copy(os.path.dirname('${CBLAS_LIBRARIES}') + '/openblas' + ext_name, libs_path) + package_data['paddle.fluid'] += ['openblas' + ext_name] if '${WITH_MKLDNN}' == 'ON': if '${CMAKE_BUILD_TYPE}' == 'Release': From 2f3b5054ad9a4fb0f62450c6dca912e0c1306471 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 19 Dec 2018 10:31:40 +0800 Subject: [PATCH 07/28] fix build script --- python/setup.py.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/setup.py.in b/python/setup.py.in index bfbaa1d015..521d108b2c 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -156,8 +156,8 @@ if '${WITH_FLUID_ONLY}'== 'OFF': # put all thirdparty libraries in paddle.libs libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' +package_data['paddle.libs']= [] if os.name != 'nt': - package_data['paddle.libs']= [] package_data['paddle.libs']=['libwarpctc' + ext_name] shutil.copy('${WARPCTC_LIBRARIES}', libs_path) @@ -169,7 +169,7 @@ else: if os.name == 'nt': # copy the openblas.dll shutil.copy(os.path.dirname('${CBLAS_LIBRARIES}') + '/openblas' + ext_name, libs_path) - package_data['paddle.fluid'] += ['openblas' + ext_name] + package_data['paddle.libs'] += ['openblas' + ext_name] if '${WITH_MKLDNN}' == 'ON': if '${CMAKE_BUILD_TYPE}' == 'Release': From 40a94a138fd76e1143d129f86199a000a64edb2c Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 19 Dec 2018 14:59:06 +0800 Subject: [PATCH 08/28] remove irrelevant fix for mkl test=develop --- cmake/external/cub.cmake | 2 +- cmake/external/dlpack.cmake | 2 +- cmake/external/python.cmake | 8 +------- paddle/fluid/framework/details/all_reduce_op_handle.cc | 2 +- paddle/fluid/platform/dynload/dynamic_loader.cc | 2 ++ 5 files changed, 6 insertions(+), 10 deletions(-) diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake index f06728de91..c94849cf4b 100644 --- a/cmake/external/cub.cmake +++ b/cmake/external/cub.cmake @@ -32,4 +32,4 @@ endif() add_dependencies(cub extern_cub) -LIST(APPEND external_project_dependencies cub) +LIST(APPEND externl_project_dependencies cub) diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake index 4587475d79..94d8fcc668 100644 --- a/cmake/external/dlpack.cmake +++ b/cmake/external/dlpack.cmake @@ -28,4 +28,4 @@ endif() add_dependencies(dlpack extern_dlpack) -LIST(APPEND external_project_dependencies dlpack) +LIST(APPEND 
externl_project_dependencies dlpack) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index 5aa447d58d..623c53f4f7 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -23,12 +23,9 @@ FIND_PACKAGE(PythonLibs ${PY_VERSION} REQUIRED) if(WIN32) execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" -"from distutils import sysconfig as s;import sys;import struct;import sysconfig; +"from distutils import sysconfig as s;import sys;import struct; print(sys.prefix); print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION')); -print(sysconfig.get_platform()); -print(sysconfig.get_config_var('py_version_nodot')); -print(sysconfig.get_config_var('SOABI')); " RESULT_VARIABLE _PYTHON_SUCCESS OUTPUT_VARIABLE _PYTHON_VALUES @@ -44,9 +41,6 @@ print(sysconfig.get_config_var('SOABI')); string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES}) list(GET _PYTHON_VALUES 0 PYTHON_PREFIX) list(GET _PYTHON_VALUES 1 PYTHON_LIBRARY_SUFFIX) - list(GET _PYTHON_VALUES 2 SYS_PLATFORM) - list(GET _PYTHON_VALUES 3 PYTHON_SHORT_VERSION_NODOT) - list(GET _PYTHON_VALUES 4 PYTHON_SOABI) # Make sure all directory separators are '/' string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX}) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index de7c845884..9eaff1f560 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -50,7 +50,7 @@ void AllReduceOpHandle::RunImpl() { // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, // this is a distributed or inter-process call, find a better way. -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#ifdef PADDLE_WITH_CUDA if (NoDummyInputSize() == 1 && local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) { #else diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index eddebfe92a..990e44cd21 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -227,6 +227,8 @@ void* GetTensorRtDsoHandle() { void* GetMKLMLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll"); #else return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); #endif From 1cc9d59838a560ce24cfd8d4f517427316b482c2 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 19 Dec 2018 16:48:48 +0800 Subject: [PATCH 09/28] disable xbyak on windows test=develop --- cmake/external/mklml.cmake | 4 +--- cmake/external/xbyak.cmake | 4 ++-- .../operators/elementwise/elementwise_mul_mkldnn_op.cc | 3 --- paddle/fluid/operators/math/detail/lstm_cpu_kernel.h | 6 ------ paddle/fluid/operators/math/jit_gen.h | 3 --- 5 files changed, 3 insertions(+), 17 deletions(-) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 96676f0be8..1a766f3c3a 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -79,9 +79,7 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) -if(NOT WIN32) - ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) -endif() +ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) LIST(APPEND external_project_dependencies mklml) IF(WITH_C_API) diff --git a/cmake/external/xbyak.cmake 
b/cmake/external/xbyak.cmake index 42e39fb813..384c2f9328 100644 --- a/cmake/external/xbyak.cmake +++ b/cmake/external/xbyak.cmake @@ -13,8 +13,8 @@ # limitations under the License. set(WITH_XBYAK ON) -if(APPLE) - SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in MacOS" FORCE) +if(WIN32 OR APPLE) + SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE) return() endif() diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index bf9aef9135..c600d1e3d7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -19,9 +19,6 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/operators/math/jit_kernel.h" -#if defined(_WIN32) && defined(_WINSOCKAPI_) -#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */ -#endif #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h index 2e3779ff08..ccbd05c82a 100644 --- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h @@ -17,12 +17,6 @@ limitations under the License. */ #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/lstm_compute.h" -#if defined(_WIN32) -#if defined(__AVX2__) || defined(__AVX__) -inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); } -#endif -#endif - namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/jit_gen.h b/paddle/fluid/operators/math/jit_gen.h index 2bc740e598..6abf3434cc 100644 --- a/paddle/fluid/operators/math/jit_gen.h +++ b/paddle/fluid/operators/math/jit_gen.h @@ -18,9 +18,6 @@ limitations under the License. */ #include #include "paddle/fluid/platform/macros.h" -#if defined(_WIN32) && defined(_WINSOCKAPI_) -#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */ -#endif #define XBYAK_USE_MMAP_ALLOCATOR #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" From da42cf205598081b57c77a2eb44e8ec1a528eab5 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 19 Dec 2018 17:05:12 +0800 Subject: [PATCH 10/28] fix build issue when xbyak is disabled on windows test=develop --- paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index c600d1e3d7..a3cf78dac5 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -19,8 +19,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/operators/math/jit_kernel.h" +#ifdef PADDLE_WITH_XBYAK #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" +#endif namespace paddle { namespace operators { From 0b4f742e8a650fe598ef38ebce39919afe133bc4 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 19 Dec 2018 17:38:48 +0800 Subject: [PATCH 11/28] fix the build issue test=develop --- .../operators/elementwise/elementwise_mul_mkldnn_op.cc | 4 ++++ paddle/fluid/operators/math/detail/lstm_cpu_kernel.h | 6 ++++++ paddle/fluid/operators/math/jit_kernel_blas.cc | 2 ++ 3 files changed, 12 insertions(+) diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index a3cf78dac5..b45e08174c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -83,8 +83,12 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { UpdateDataFormat(ctx, const_cast(x), "x_data_format"); UpdateDataFormat(ctx, const_cast(y), "y_data_format"); +#ifdef PADDLE_WITH_XBYAK Xbyak::util::Cpu cpu; const bool is_avx512_enabled = cpu.has(Xbyak::util::Cpu::tAVX512F); +#else + const bool is_avx512_enabled = platform::MayIUse(platform::avx512f); +#endif // PADDLE_WITH_XBYAK const bool are_dims_divisable = !(x_int_dims[1] % 16); const bool is_x_format_correct = x->format() == memory::format::nChw16c; const bool is_y_format_correct = y->format() == memory::format::nc; diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h index ccbd05c82a..2e3779ff08 100644 --- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h @@ -17,6 +17,12 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/lstm_compute.h" +#if defined(_WIN32) +#if defined(__AVX2__) || defined(__AVX__) +inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); } +#endif +#endif + namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index 8cf588efba..ea54367c3b 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -212,8 +212,10 @@ class EltwiseMulnChw16cNCKernelImpl private: std::unique_ptr jitcode_{nullptr}; +#endif }; +#ifdef PADDLE_WITH_XBYAK template <> bool EltwiseMulnChw16cNCKernelImpl::useJIT(int d) { return true; From 9f55f1ff50fd4b7db87689185cb0f3255cf72dc0 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 19 Dec 2018 20:19:07 +0800 Subject: [PATCH 12/28] use the platform api to decide the specific instruction support or not test=develop --- .../fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index b45e08174c..db47b8c460 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -83,12 +83,7 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { UpdateDataFormat(ctx, const_cast(x), "x_data_format"); UpdateDataFormat(ctx, const_cast(y), "y_data_format"); -#ifdef PADDLE_WITH_XBYAK - Xbyak::util::Cpu cpu; - const bool is_avx512_enabled = cpu.has(Xbyak::util::Cpu::tAVX512F); -#else const bool is_avx512_enabled = platform::MayIUse(platform::avx512f); -#endif // PADDLE_WITH_XBYAK const bool are_dims_divisable = !(x_int_dims[1] % 16); const bool is_x_format_correct = x->format() == memory::format::nChw16c; const bool is_y_format_correct = y->format() == memory::format::nc; From b6d7f0e5ec15035226d8cf4bdf8a2171b49ab9ed Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 20 Dec 2018 13:22:44 +0800 Subject: [PATCH 13/28] use the CDN as the source location test=develop --- cmake/external/mklml.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 1a766f3c3a..1ee553e2f5 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -48,7 +48,7 @@ IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL)) MESSAGE(STATUS "use pre defined download url") if(WIN32) SET(MKLML_VER "mklml_win_2019.0.20180710" CACHE STRING "" FORCE) - SET(MKLML_URL "https://github.com/intel/mkl-dnn/releases/download/v0.16/${MKLML_VER}.zip" CACHE STRING "" FORCE) + SET(MKLML_URL "https://paddle-windows.bj.bcebos.com/mklml/${MKLML_VER}.zip" CACHE STRING "" FORCE) else() SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE) SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) From 969ad966bab631aa6521b205edf826acbc3646d4 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 19 Dec 2018 22:11:45 +0800 Subject: [PATCH 14/28] all converted test=develop --- paddle/fluid/framework/operator.cc | 22 ++++++++++ paddle/fluid/framework/operator.h | 65 ++++++++++++++++++++++++++++-- 2 files changed, 84 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 
4b520a393f..fec311e3ee 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -476,6 +476,28 @@ const Tensor* ExecutionContext::LegacyInput( template <> const std::vector ExecutionContext::MultiInput( const std::string& name) const { + auto it = ctx_.inputs.find(name); + if (it == ctx_.inputs.end()) { + return {}; + } + const std::vector& vars = it->second; + std::vector res; + res.reserve(vars.size()); + std::transform(vars.begin(), vars.end(), std::back_inserter(res), + [&](Variable* var) -> const Tensor* { + if (var == nullptr) return nullptr; + PADDLE_ENFORCE( + var->IsType(), + "should be LoDTensor, but the received type is %s", + var->Type().name()); + return &(var->Get()); + }); + return res; +} + +template <> +const std::vector ExecutionContext::LegacyMultiInput( + const std::string& name) const { auto names = op().Inputs(name); std::vector res; res.reserve(names.size()); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 39190d07b4..1fe2daacf1 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -197,8 +197,31 @@ class ExecutionContext { const std::vector MultiInputVar( const std::string& name) const { - auto names = op_.Inputs(name); + auto it = ctx_.inputs.find(name); + if (it == ctx_.inputs.end()) { + return {}; + } std::vector res; + res.reserve(it->second.size()); + std::transform(it->second.begin(), it->second.end(), + std::back_inserter(res), + [this](Variable* var) { return var; }); + return res; + } + + std::vector MultiOutputVar(const std::string& name) const { + auto names = op_.Outputs(name); + auto it = ctx_.outputs.find(name); + if (it == ctx_.outputs.end()) { + return {}; + } + return it->second; + } + + const std::vector LegacyMultiInputVar( + const std::string& name) const { + auto names = op_.Inputs(name); + std::vector res; res.reserve(names.size()); std::transform(names.begin(), names.end(), std::back_inserter(res), [this](const std::string& name) { @@ -208,7 +231,7 @@ class ExecutionContext { return res; } - std::vector MultiOutputVar(const std::string& name) const { + std::vector LegacyMultiOutputVar(const std::string& name) const { auto names = op_.Outputs(name); std::vector res; res.reserve(names.size()); @@ -250,6 +273,38 @@ class ExecutionContext { template const std::vector MultiInput(const std::string& name) const { + auto it = ctx_.inputs.find(name); + if (it == ctx_.inputs.end()) { + return {}; + } + const std::vector& vars = it->second; + std::vector res; + res.reserve(vars.size()); + std::transform(vars.begin(), vars.end(), std::back_inserter(res), + [&](Variable* var) -> const T* { + return var == nullptr ? nullptr : &var->Get(); + }); + return res; + } + + template + std::vector MultiOutput(const std::string& name) const { + auto it = ctx_.outputs.find(name); + if (it == ctx_.outputs.end()) { + return {}; + } + const std::vector& vars = it->second; + std::vector res; + res.reserve(vars.size()); + std::transform(vars.begin(), vars.end(), std::back_inserter(res), + [&](Variable* var) -> T* { + return var == nullptr ? 
nullptr : var->GetMutable();
});
return res;
}
+
+ template
+ const std::vector LegacyMultiInput(const std::string& name) const {
auto names = op_.Inputs(name);
std::vector res;
res.reserve(names.size());
@@ -262,7 +317,7 @@ class ExecutionContext {
}
template
- std::vector MultiOutput(const std::string& name) const {
+ std::vector LegacyMultiOutput(const std::string& name) const {
auto names = op_.Outputs(name);
std::vector res;
res.reserve(names.size());
@@ -321,6 +376,10 @@
template <>
const std::vector ExecutionContext::MultiInput(
const std::string& name) const;
+template <>
+const std::vector ExecutionContext::LegacyMultiInput(
+ const std::string& name) const;
+
template <>
Tensor* ExecutionContext::Output(const std::string& name) const;
From b06ce129bcf8f0bb89fe5d1a1a13107218204d8a Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Thu, 20 Dec 2018 19:29:51 +0800
Subject: [PATCH 15/28] some minor adjustments
 test=develop

---
 paddle/fluid/platform/cpu_info.cc | 6 +-----
 paddle/fluid/platform/port.h | 4 ----
 python/setup.py.in | 23 +++++++++++------------
 3 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index 1642c17809..9d5ae813de 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -14,10 +14,6 @@ limitations under the License. */
#include "paddle/fluid/platform/cpu_info.h"
-#if defined(_WIN32)
-#define NOMINMAX // msvc max/min macro conflict with std::min/max
-#endif
-
#ifdef PADDLE_WITH_XBYAK
#include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h"
@@ -27,7 +23,7 @@ limitations under the License. */
#include
#include
#elif defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX // msvc max/min macro conflict with std::min/max
#include
#else
#include
diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
index 41388d8959..c1b81159ac 100644
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -37,10 +37,6 @@
#define GOOGLE_GLOG_DLL_DECL
#include // _popen, _pclose
#include
-#ifdef _WINSOCKAPI_
-/* Prevent inclusion of winsock.h in windows.h */
-#define WIN32_LEAN_AND_MEAN
-#endif
#include
#include // std::accumulate in msvc
#ifndef S_ISDIR // windows port for sys/stat.h
diff --git a/python/setup.py.in b/python/setup.py.in
index 0a83773c82..155dd9c580 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -174,18 +174,17 @@ else:
package_data['paddle.libs'] += ['openblas' + ext_name]
if '${WITH_MKLDNN}' == 'ON':
- if '${CMAKE_BUILD_TYPE}' == 'Release':
- if os.name != 'nt':
- # only change rpath in Release mode.
- # TODO(typhoonzero): use install_name_tool to patch mkl libs once
- # we can support mkl on mac.
- #
- # change rpath of libmkldnn.so.0, add $ORIGIN/ to it.
- # The reason is that all thirdparty libraries in the same directory,
- # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so.
- command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
- if os.system(command) != 0:
- raise Exception("patch libmkldnn.so failed, command: %s" % command)
+ if '${CMAKE_BUILD_TYPE}' == 'Release' AND os.name != 'nt':
+ # only change rpath in Release mode.
+ # TODO(typhoonzero): use install_name_tool to patch mkl libs once
+ # we can support mkl on mac.
+ #
+ # change rpath of libmkldnn.so.0, add $ORIGIN/ to it.
+ # The reason is that all thirdparty libraries in the same directory,
+ # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so.
+ command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}" + if os.system(command) != 0: + raise Exception("patch libmkldnn.so failed, command: %s" % command) package_data['paddle.libs']+=['libmkldnn.so.0' if os.name != 'nt' else ('mkldnn' + ext_name)] shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) if '${WITH_NGRAPH}' == 'ON': From d05094eec3303030908affed86e558c0afbf4546 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Thu, 20 Dec 2018 15:37:25 +0100 Subject: [PATCH 16/28] - Added transpose/transpose2 MKLDNN grad ops test=develop --- paddle/fluid/operators/transpose_mkldnn_op.cc | 60 +++++++++++++++++-- paddle/fluid/operators/transpose_op.cc | 34 +++++++++-- .../unittests/test_transpose_mkldnn_op.py | 10 ---- .../tests/unittests/test_transpose_op.py | 2 - 4 files changed, 85 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/operators/transpose_mkldnn_op.cc b/paddle/fluid/operators/transpose_mkldnn_op.cc index 2f133c9e25..e6df7028f5 100644 --- a/paddle/fluid/operators/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/transpose_mkldnn_op.cc @@ -29,10 +29,6 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { void Compute(const paddle::framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); - const bool is_test = ctx.Attr("is_test"); - PADDLE_ENFORCE( - is_test == true, - "TransposeMKLDNN works only for inference!. Set is_test = True"); auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); @@ -68,6 +64,57 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { } }; +template +class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* x_grad = ctx.Output(framework::GradVarName("X")); + if (!x_grad) return; + + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + std::vector axis = ctx.Attr>("axis"); + std::vector reversed_axis(axis); + int ndims = axis.size(); + if (ndims == 1) { + x_grad->ShareDataWith(*out_grad); + return; + } + + for (size_t i = 0; i < axis.size(); i++) { + reversed_axis[axis[i]] = i; + } + + const T* out_grad_data = out_grad->data(); + x_grad->mutable_data(ctx.GetPlace()); + + std::vector nchw_tz = + paddle::framework::vectorize2int(out_grad->dims()); + + const std::string key = platform::TransposeMKLDNNHandler::GetHash( + nchw_tz, axis, ctx.op().Output(framework::GradVarName("X"))); + + platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx, + mkldnn_engine, key); + + auto transpose_src_memory_p = handler.AcquireSrcMemory( + out_grad->format(), platform::to_void_cast(out_grad_data)); + auto transpose_dst_memory_p = + handler.AcquireDstMemory(x_grad, ctx.GetPlace()); + auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, + transpose_src_memory_p); + + std::vector pipeline; + pipeline.push_back(*transpose_p); + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } +}; + } // namespace operators } // namespace paddle @@ -77,3 +124,8 @@ REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace, ops::TransposeMKLDNNOpKernel); REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace, ops::TransposeMKLDNNOpKernel); + 
+REGISTER_OP_KERNEL(transpose_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::TransposeMKLDNNGradOpKernel); +REGISTER_OP_KERNEL(transpose2_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::TransposeMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index b3b379d16f..db14d350c7 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -79,10 +79,6 @@ class TransposeOp : public framework::OperatorWithKernel { class TransposeOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddAttr("is_test", - "(bool, default false) Set to true for inference only, false " - "for training. Some layers may run faster when this is true.") - .SetDefault(false); AddInput( "X", "(Tensor) The input tensor, tensors with rank up to 6 are supported."); @@ -147,6 +143,24 @@ class TransposeOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("X"), x_dims); } } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; + } +#endif + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace(), layout_, library_); + } }; // FIXME(zcd): transpose2 adds an intermediate output(XShape) based on @@ -237,9 +251,19 @@ class Transpose2OpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; + } +#endif return framework::OpKernelType( ctx.Input(framework::GradVarName("Out"))->type(), - ctx.device_context()); + ctx.GetPlace(), layout_, library_); } }; diff --git a/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py index 61ac879011..0c201b9e4f 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py @@ -23,16 +23,6 @@ class TestTransposeMKLDNN(TestTransposeOp): def init_op_type(self): self.op_type = "transpose2" self.use_mkldnn = True - self.is_test = True - return - - def test_check_grad(self): - return - - def test_check_grad_no_input(self): - return - - def test_check_grad_no_filter(self): return diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index 93be9d28da..a38540a724 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -27,7 +27,6 @@ class TestTransposeOp(OpTest): self.attrs = { 'axis': list(self.axis), 'use_mkldnn': 
self.use_mkldnn,
- 'is_test': self.is_test,
}
self.outputs = {
'XShape': np.random.random(self.shape).astype("float32"),
@@ -37,7 +36,6 @@ class TestTransposeOp(OpTest):
def init_op_type(self):
self.op_type = "transpose2"
self.use_mkldnn = False
- self.is_test = False
def test_check_output(self):
self.check_output(no_check_set=['XShape'])
From fb223d8037acec1eda74c59518ac581b5e569984 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Fri, 21 Dec 2018 10:25:37 +0800
Subject: [PATCH 17/28] fix the typo of the keyword 'and'
 test=develop

---
 python/setup.py.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/setup.py.in b/python/setup.py.in
index 155dd9c580..f8170b42ab 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -174,7 +174,7 @@ else:
package_data['paddle.libs'] += ['openblas' + ext_name]
if '${WITH_MKLDNN}' == 'ON':
- if '${CMAKE_BUILD_TYPE}' == 'Release' AND os.name != 'nt':
+ if '${CMAKE_BUILD_TYPE}' == 'Release' and os.name != 'nt':
# only change rpath in Release mode.
# TODO(typhoonzero): use install_name_tool to patch mkl libs once
# we can support mkl on mac.
From f31d65454cddcce3a0c015a5de8e7c7bae0d09c0 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Fri, 21 Dec 2018 10:42:34 +0800
Subject: [PATCH 18/28] use the default cdn address for mklml package on windows
 test=develop

---
 cmake/external/mklml.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 1ee553e2f5..d49839a89d 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -48,7 +48,7 @@ IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
MESSAGE(STATUS "use pre defined download url")
if(WIN32)
SET(MKLML_VER "mklml_win_2019.0.20180710" CACHE STRING "" FORCE)
- SET(MKLML_URL "https://paddle-windows.bj.bcebos.com/mklml/${MKLML_VER}.zip" CACHE STRING "" FORCE)
+ SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
else()
SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
From 3026aba7e16851a0ab11b0e126cc38f00fa1bd59 Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Fri, 21 Dec 2018 15:27:32 +0800
Subject: [PATCH 19/28] Fix net_drawer
 test=develop

---
 python/paddle/fluid/net_drawer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/net_drawer.py b/python/paddle/fluid/net_drawer.py
index 0b61c23d07..8485d7d32f 100644
--- a/python/paddle/fluid/net_drawer.py
+++ b/python/paddle/fluid/net_drawer.py
@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
try:
- from .graphviz import Digraph
+ from .graphviz import Graph
except ImportError:
logger.info(
'Cannot import graphviz, which is required for drawing a network.
This ' @@ -112,7 +112,7 @@ def draw_graph(startup_program, main_program, **kwargs): filename = kwargs.get("filename") if filename == None: filename = str(graph_id) + ".gv" - g = Digraph( + g = Graph( name=str(graph_id), filename=filename, graph_attr=GRAPH_STYLE, From 201283f95c6fb02d03706e414a46ead3aa3c528e Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 21 Dec 2018 16:24:28 +0800 Subject: [PATCH 20/28] fix code style test=develop --- paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index 17a5fe9895..74a3f64e26 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/operators/jit/kernels.h" +#include "paddle/fluid/platform/mkldnn_helper.h" #ifdef PADDLE_WITH_XBYAK #include "xbyak/xbyak.h" From e99e4e994de04aa55c4f37d08fd87cd31c634246 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 21 Dec 2018 09:50:09 +0000 Subject: [PATCH 21/28] fix deps of jit benchmark test=develop --- paddle/fluid/operators/jit/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt index ced2974125..262094f922 100644 --- a/paddle/fluid/operators/jit/CMakeLists.txt +++ b/paddle/fluid/operators/jit/CMakeLists.txt @@ -21,5 +21,5 @@ endif() cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper) if(NOT WIN32) - cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper) + cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer) endif() From 2e35290ffad672b967def578f912c4a7cf683621 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 21 Dec 2018 19:03:44 +0800 Subject: [PATCH 22/28] fix build issue test=develop --- paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index 74a3f64e26..4c73a70ed1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/jit/kernels.h" +#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/mkldnn_helper.h" #ifdef PADDLE_WITH_XBYAK From a30c5373eb86361c0c36df14f2503d6a05ec407b Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 21 Dec 2018 08:45:37 +0000 Subject: [PATCH 23/28] use std::is_sorted fix comment test=develop --- paddle/fluid/framework/lod_tensor.cc | 14 +++----------- paddle/fluid/framework/lod_tensor_test.cc | 5 +++-- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 1e8f6c42d1..8fbbc6584e 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -157,17 +157,9 @@ bool CheckLoD(const LoD &in, int tensor_height) { if (level.size() < 2) return false; // check: the first offset(the begin offset) of each level should be 0. if (level.front() != 0) return false; - // check: all the offsets in a level should be ascending(no same items - // allows). - auto beg = level.begin(); - auto end = level.end(); - // Do not use std::is_sorted, because we need strictly sorted lod - if (beg != end) { - for (auto it = beg + 1; it != end; ++it) { - if (*(it - 1) >= *it) { - return false; - } - } + // check: all the offsets in a level should be ascending(allow same items) + if (!std::is_sorted(level.begin(), level.end())) { + return false; } } // check: the lowest level's last offset should equals `tensor_height` if diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc index 838b174343..15928c18d3 100644 --- a/paddle/fluid/framework/lod_tensor_test.cc +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -218,9 +218,10 @@ TEST(LoD, CheckLoD) { ASSERT_TRUE(CheckLoD(relative_lod, 5)); ASSERT_FALSE(CheckLoD(relative_lod, 9)); - // check strictly sorted lod + // check whether lod is ascending-sorted (allow same items) ASSERT_TRUE(CheckLoD({{0, 1, 2, 3, 4, 5}}, 5)); - ASSERT_FALSE(CheckLoD({{0, 1, 3, 3, 4, 5}}, 5)); + ASSERT_TRUE(CheckLoD({{0, 1, 3, 3, 4, 5}}, 5)); + ASSERT_FALSE(CheckLoD({{0, 1, 3, 2, 5}}, 5)); } TEST(LoD, CheckAbsLoD) { From 79bd6dfa18c6eb6ca823d9b641330c8257c27fae Mon Sep 17 00:00:00 2001 From: chengduo Date: Fri, 21 Dec 2018 07:15:47 -0600 Subject: [PATCH 24/28] [Feature] Add Temporary Allocator (#14875) * Add Temporal Allocator * add Temporay Allocator to DeviceContext test=develop * code refine test=develop * fix mean_iou test=develop * Add DeviceTemporaryAllocator test=develop * fix conv_op bug test=develop * small fix test=develop * code refine test=develop * log refine test=develop * fix unit test test=develop * move double check * refine concat_and_split test=develop * add limit_of_temporary_allocation test=develop * fix name test=develop --- paddle/fluid/framework/tensor.cc | 11 ++ paddle/fluid/framework/tensor.h | 4 + paddle/fluid/operators/conv_op.h | 31 +++- .../fluid/operators/math/concat_and_split.cu | 67 ++++--- paddle/fluid/operators/mean_iou_op.cu | 16 +- paddle/fluid/platform/CMakeLists.txt | 11 +- .../create_tensor_with_allocationptr.h | 42 +++++ paddle/fluid/platform/device_context.cc | 51 +++++- paddle/fluid/platform/device_context.h | 46 +++++ paddle/fluid/platform/init.cc | 2 +- paddle/fluid/platform/temporary_allocator.cc | 95 ++++++++++ paddle/fluid/platform/temporary_allocator.h | 63 +++++++ .../platform/temporary_allocator_test.cc | 165 ++++++++++++++++++ 13 files changed, 561 
insertions(+), 43 deletions(-) create mode 100644 paddle/fluid/platform/create_tensor_with_allocationptr.h create mode 100644 paddle/fluid/platform/temporary_allocator.cc create mode 100644 paddle/fluid/platform/temporary_allocator.h create mode 100644 paddle/fluid/platform/temporary_allocator_test.cc diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 57335847a1..5b09cad06c 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/var_type.h" namespace paddle { namespace framework { @@ -27,6 +28,9 @@ void Tensor::check_memory_size() const { "or maybe the required data-type mismatches the data already stored."); } +Tensor::Tensor(std::type_index type) + : type_(framework::ToDataType(type)), offset_(0) {} + size_t Tensor::memory_size() const { return holder_ == nullptr ? 0UL : holder_->size() - offset_; } @@ -101,5 +105,12 @@ const DDim& Tensor::dims() const { return dims_; } int64_t Tensor::numel() const { return product(dims_); } +void Tensor::ResetHolder(std::shared_ptr holder) { + if (holder_) { + PADDLE_ENFORCE_EQ(numel() * SizeOfType(type()), holder->size()); + } + holder_ = holder; +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 6a1cbe5cd5..2e110133a3 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -69,6 +69,8 @@ class Tensor { public: Tensor() : type_(proto::VarType::FP32), offset_(0) {} + explicit Tensor(std::type_index type); + /*! Return a pointer to mutable memory block. */ template T* data(); @@ -162,6 +164,8 @@ class Tensor { return std::move(holder_); } + void ResetHolder(std::shared_ptr holder); + private: /*! holds the memory block if allocated. */ std::shared_ptr holder_; diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 249f308c13..4a7b31c7d4 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/fluid/platform/create_tensor_with_allocationptr.h" namespace paddle { namespace operators { @@ -123,6 +124,8 @@ class GemmConvKernel : public framework::OpKernel { std::vector paddings = context.Attr>("paddings"); std::vector dilations = context.Attr>("dilations"); + auto& dev_ctx = context.template device_context(); + const int batch_size = static_cast(input->dims()[0]); // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} @@ -155,13 +158,19 @@ class GemmConvKernel : public framework::OpKernel { // to call the matrix multiplication interface. 
Tensor col_matrix; if (is_expand) { - col.mutable_data(col_shape, context.GetPlace()); + auto tmp_allocation_ptr = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( + framework::product(col_shape) * sizeof(T)); + Tensor tep_tensor = + platform::GetTensor(std::move(tmp_allocation_ptr), col_shape); + + col.ShareDataWith(tep_tensor); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); + framework::DDim input_shape = + framework::slice_ddim(input->dims(), 1, input->dims().size()); framework::DDim filter_matrix_shape = {filter.dims()[0], filter.numel() / filter.dims()[0]}; @@ -178,7 +187,6 @@ class GemmConvKernel : public framework::OpKernel { math::Vol2ColFunctor vol2col; math::Im2ColFunctor im2col; - auto& dev_ctx = context.template device_context(); auto blas = math::GetBlas(dev_ctx); for (int i = 0; i < batch_size; i++) { Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); @@ -237,6 +245,8 @@ class GemmConvGradKernel : public framework::OpKernel { const int batch_size = static_cast(input->dims()[0]); + auto& dev_ctx = context.template device_context(); + // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} std::vector filter_shape_vec(framework::vectorize(filter.dims())); // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} @@ -262,8 +272,8 @@ class GemmConvGradKernel : public framework::OpKernel { framework::DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1); - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); + framework::DDim input_shape = + framework::slice_ddim(input->dims(), 1, input->dims().size()); framework::DDim filter_matrix_shape = {filter.dims()[0], filter.numel() / filter.dims()[0]}; @@ -286,13 +296,18 @@ class GemmConvGradKernel : public framework::OpKernel { // to call the matrix multiplication interface. 
Tensor col_matrix; if (is_expand) { - col.mutable_data(col_shape, context.GetPlace()); + auto tmp_allocation_ptr = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( + framework::product(col_shape) * sizeof(T)); + Tensor tep_tensor = + platform::GetTensor(std::move(tmp_allocation_ptr), col_shape); + + col.ShareDataWith(tep_tensor); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } math::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); auto blas = math::GetBlas(dev_ctx); if (input_grad) { diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 760a065c10..b10a19b658 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -131,9 +131,8 @@ class ConcatFunctor { int in_col = input[0].numel() / in_row; int out_row = in_row, out_col = 0; - framework::Vector inputs_data(in_num * sizeof(T*) / 2); - framework::Vector inputs_col(in_num + 1); - T** inputs_ptr = reinterpret_cast(inputs_data.data()); + std::vector inputs_data(in_num); + std::vector inputs_col(in_num + 1); inputs_col[0] = 0; bool sameShape = true; @@ -144,12 +143,9 @@ class ConcatFunctor { } out_col += t_cols; inputs_col[i + 1] = out_col; - inputs_ptr[i] = const_cast(input[i].data()); + inputs_data[i] = const_cast(input[i].data()); } - T** dev_ins_data = - reinterpret_cast(inputs_data.CUDAMutableData(context.GetPlace())); - // computation // set the thread block and grid according to CurrentDeviceId const int kThreadsPerBlock = 1024; @@ -169,18 +165,32 @@ class ConcatFunctor { std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1)); dim3 grid_size = dim3(grid_cols, grid_rows, 1); + auto tmp_dev_ins_data = + platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate( + inputs_data.size() * sizeof(T*)); + memory::Copy(boost::get(context.GetPlace()), + tmp_dev_ins_data->ptr(), platform::CPUPlace(), + static_cast(inputs_data.data()), + inputs_data.size() * sizeof(T*), context.stream()); + T** dev_ins_data = reinterpret_cast(tmp_dev_ins_data->ptr()); + if (sameShape) { ConcatKernel<<>>( dev_ins_data, in_col, out_row, out_col, output->data()); } else { - const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace()); + auto tmp_dev_ins_col_data = + platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate( + inputs_col.size() * sizeof(int)); + memory::Copy(boost::get(context.GetPlace()), + tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), + static_cast(inputs_col.data()), + inputs_col.size() * sizeof(int), context.stream()); + int* dev_ins_col_data = static_cast(tmp_dev_ins_col_data->ptr()); + ConcatKernel<<>>( dev_ins_data, dev_ins_col_data, static_cast(inputs_col.size()), out_row, out_col, output->data()); } - // Wait() must be called because `inputs_data` may be destructed before - // kernel ends - context.Wait(); } }; @@ -207,9 +217,8 @@ class SplitFunctor { int in_col = 0, in_row = out_row; bool sameShape = true; - framework::Vector outputs_data(o_num * sizeof(T*) / 2); - framework::Vector outputs_cols(o_num + 1); - T** outputs_ptr = reinterpret_cast(outputs_data.data()); + std::vector outputs_data(o_num); + std::vector outputs_cols(o_num + 1); outputs_cols[0] = 0; for (int i = 0; i < o_num; ++i) { @@ -220,15 +229,12 @@ class SplitFunctor { in_col += t_col; outputs_cols[i + 1] = in_col; if (outputs->at(i) != nullptr) { - outputs_ptr[i] = outputs->at(i)->data(); + outputs_data[i] = outputs->at(i)->data(); } else 
{ - outputs_ptr[i] = nullptr; + outputs_data[i] = nullptr; } } - T** dev_out_gpu_data = - reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace())); - // computation const int kThreadsPerBlock = 1024; int block_cols = kThreadsPerBlock; @@ -247,18 +253,33 @@ class SplitFunctor<platform::CUDADeviceContext, T> { std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1)); dim3 grid_size = dim3(grid_cols, grid_rows, 1); + auto tmp_dev_outs_data = + platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate( + outputs_data.size() * sizeof(T*)); + memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()), + tmp_dev_outs_data->ptr(), platform::CPUPlace(), + reinterpret_cast<void*>(outputs_data.data()), + outputs_data.size() * sizeof(T*), context.stream()); + T** dev_out_gpu_data = reinterpret_cast<T**>(tmp_dev_outs_data->ptr()); + if (sameShape) { SplitKernel<<<grid_size, block_size, 0, context.stream()>>>( input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data); } else { - const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace()); + auto tmp_dev_ins_col_data = + platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate( + outputs_cols.size() * sizeof(int)); + memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()), + tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), + reinterpret_cast<void*>(outputs_cols.data()), + outputs_cols.size() * sizeof(int), context.stream()); + int* dev_outs_col_data = + reinterpret_cast<int*>(tmp_dev_ins_col_data->ptr()); + SplitKernel<<<grid_size, block_size, 0, context.stream()>>>( input.data<T>(), in_row, in_col, dev_outs_col_data, static_cast<int>(outputs_cols.size()), dev_out_gpu_data); } - // Wait() must be called because `outputs_data` may be destructed before - // kernel ends - context.Wait(); } }; diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu index 83bb4dde46..08088eb873 100644 --- a/paddle/fluid/operators/mean_iou_op.cu +++ b/paddle/fluid/operators/mean_iou_op.cu @@ -92,8 +92,8 @@ template <typename T> class MeanIoUCUDAOpKernel : public framework::OpKernel<T> { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& place = *ctx.template device_context<platform::CUDADeviceContext>() - .eigen_device(); + auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); + auto& place = *dev_ctx.eigen_device(); // get input and output tensor auto* predictions = ctx.Input<Tensor>("Predictions"); auto* labels = ctx.Input<Tensor>("Labels"); @@ -115,11 +115,11 @@ class MeanIoUCUDAOpKernel : public framework::OpKernel<T> { auto out_wrong_t = EigenTensor<int, 1>::From(*out_wrong); auto out_correct_t = EigenTensor<int, 1>::From(*out_correct); - // Temporary tensor - Tensor ious; - float* ious_data = ious.mutable_data<float>( - {static_cast<int>(num_classes)}, ctx.GetPlace()); - auto ious_t = EigenTensor<float, 1>::From(ious); + // Temporary memory + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + auto tmp_ious_data = allocator.Allocate(num_classes * sizeof(float)); + float* ious_data = static_cast<float*>(tmp_ious_data->ptr()); // Init out_wrong, out_correct and out_mean_iou out_wrong_t.device(place) = out_wrong_t.constant(0); @@ -148,7 +148,7 @@ class MeanIoUCUDAOpKernel : public framework::OpKernel<T> { CountCUDAKernel<<<grid, block, 0, stream>>>( num_classes, predictions->numel(), predictions_data, labels_data, out_wrong_data, out_correct_data); - ctx.device_context().Wait(); + ComputeIoUCUDAKernel<<<1, block, 0, stream>>>(num_classes, out_wrong_data, out_correct_data, ious_data, out_mean_iou_data); diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 2f205e1d5c..d1dff16ddd 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -56,6 +56,8 @@ ELSE()
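The concat/split hunks above replace pinned `framework::Vector` buffers and the trailing blocking `context.Wait()` with stream-ordered temporary allocations. A minimal sketch of that staging pattern, assuming only the allocator API introduced later in this patch; `MyKernel` and the pointer payload are hypothetical:

```cpp
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"

namespace paddle {

// Stage an array of device pointers through a stream-ordered temporary
// buffer, then launch work on the same stream. No blocking Wait() is needed:
// when `tmp` is freed it is only queued, and the allocator's callback
// synchronizes the stream before the memory is actually deleted.
template <typename T>
void LaunchWithStagedPointers(const platform::CUDADeviceContext& context,
                              const std::vector<T*>& host_ptrs) {
  auto tmp =
      platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate(
          host_ptrs.size() * sizeof(T*));
  memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()), tmp->ptr(),
               platform::CPUPlace(),
               static_cast<const void*>(host_ptrs.data()),
               host_ptrs.size() * sizeof(T*), context.stream());
  T** dev_ptrs = reinterpret_cast<T**>(tmp->ptr());
  // MyKernel<<<grid, block, 0, context.stream()>>>(dev_ptrs, ...);
  (void)dev_ptrs;
}

}  // namespace paddle
```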
set(MKLDNN_CTX_DEPS) ENDIF() +cc_library(temp_allocator SRCS temporary_allocator.cc DEPS allocator_facade) + nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) IF(WITH_GPU) set(STREAM_CALLBACK_DEPS stream_callback_manager) @@ -66,7 +68,8 @@ ENDIF() # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS} - place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) + place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} temp_allocator) + if(WIN32) if(WITH_GPU AND NOT WITH_DSO) get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) @@ -92,3 +95,9 @@ IF(WITH_GPU) nv_test(cuda_helper_test SRCS cuda_helper_test.cu) ENDIF() nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) + +if(WITH_GPU) + nv_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor) +else() + cc_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor) +endif() diff --git a/paddle/fluid/platform/create_tensor_with_allocationptr.h b/paddle/fluid/platform/create_tensor_with_allocationptr.h new file mode 100644 index 0000000000..00fcc5f862 --- /dev/null +++ b/paddle/fluid/platform/create_tensor_with_allocationptr.h @@ -0,0 +1,42 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/temporary_allocator.h" +namespace paddle { +namespace platform { + +template <typename T> +paddle::framework::Tensor GetTensor( + memory::allocation::AllocationPtr temp_allocation_ptr, + const framework::DDim &dim) { + auto &deleter = temp_allocation_ptr.get_deleter(); + auto *allocation_ptr = temp_allocation_ptr.release(); + auto shared_allocation = + std::shared_ptr<memory::allocation::Allocation>(allocation_ptr, deleter); + + PADDLE_ENFORCE(dynamic_cast<TemporaryAllocation *>(allocation_ptr) != nullptr, + "The AllocationPtr must be TemporaryAllocation."); + PADDLE_ENFORCE_EQ(allocation_ptr->size(), + framework::product(dim) * sizeof(T)); + + paddle::framework::Tensor temp_tensor(std::type_index(typeid(T))); + temp_tensor.Resize(dim); + temp_tensor.ResetHolder(std::move(shared_allocation)); + return temp_tensor; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index d2e23d80f4..81c443d758 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -85,6 +85,49 @@ DeviceContextPool::DeviceContextPool( } } +DeviceTemporaryAllocator* DeviceTemporaryAllocator::allocators = nullptr; + +#ifdef PADDLE_WITH_CUDA +platform::TemporaryAllocator& DeviceTemporaryAllocator::Get( + const platform::Place& place, const cudaStream_t& stream) { + PADDLE_ENFORCE(platform::is_gpu_place(place)); + auto place_stream = std::make_pair(place, stream); + { + std::unique_lock<std::mutex> lock(mtx_); + if (!device_allocator_.count(place_stream)) { + device_allocator_[place_stream].reset(new TemporaryAllocator(place)); + device_allocator_[place_stream]->SetCallback([stream]() { + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE(cudaGetLastError()); + }); + } + } + return *device_allocator_.at(place_stream); +} + +template <> +platform::TemporaryAllocator& DeviceTemporaryAllocator::Get( + const platform::CUDADeviceContext& dev_ctx) { + auto place_stream = std::make_pair(dev_ctx.GetPlace(), dev_ctx.stream()); + if (device_allocator_.count(place_stream)) { + return *device_allocator_.at(place_stream); + } + return Get(dev_ctx.GetPlace(), dev_ctx.stream()); +} +#endif + +template <> +platform::TemporaryAllocator& DeviceTemporaryAllocator::Get( + const platform::CPUDeviceContext& dev_ctx) { + return cpu_allocator_; +} + +platform::TemporaryAllocator& DeviceTemporaryAllocator::Get( + const platform::Place& place) { + PADDLE_ENFORCE(platform::is_cpu_place(place), "You should pass CPUPlace"); + return cpu_allocator_; +} + CPUDeviceContext::CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } @@ -271,8 +314,12 @@ CUDADeviceContext::~CUDADeviceContext() { Place CUDADeviceContext::GetPlace() const { return place_; } void CUDADeviceContext::Wait() const { - PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); - PADDLE_ENFORCE(cudaGetLastError()); + auto& allocator = + DeviceTemporaryAllocator::Instance().Get(*this); + allocator.Release([=]() { + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); + PADDLE_ENFORCE(cudaGetLastError()); + }); } int CUDADeviceContext::GetComputeCapability() const { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 812e56f1f9..af9744dcb8 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -15,8 +15,10 @@ limitations under the License.
*/ #include // NOLINT #include #include +#include #include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/temporary_allocator.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" @@ -39,6 +41,50 @@ limitations under the License. */ namespace paddle { namespace platform { +/*! \brief device temporary allocator singleton */ +class DeviceTemporaryAllocator { + public: + static DeviceTemporaryAllocator& Instance() { + PADDLE_ENFORCE_NOT_NULL(allocators, + "Need to Create DeviceTemporaryAllocator first!"); + return *allocators; + } + + static DeviceTemporaryAllocator& Init() { + if (allocators == nullptr) { + allocators = new DeviceTemporaryAllocator(); + } + return *allocators; + } + +/*! \brief Return handle of single temporary allocator. */ +#ifdef PADDLE_WITH_CUDA + platform::TemporaryAllocator& Get(const platform::Place& place, + const cudaStream_t& stream); +#endif + template <typename DeviceContext> + platform::TemporaryAllocator& Get(const DeviceContext& dev_ctx); + + platform::TemporaryAllocator& Get(const platform::Place& place); + + private: + DeviceTemporaryAllocator() : cpu_allocator_(platform::CPUPlace()) {} + + static DeviceTemporaryAllocator* allocators; + + platform::TemporaryAllocator cpu_allocator_; + +#ifdef PADDLE_WITH_CUDA + std::map<std::pair<platform::Place, cudaStream_t>, + std::unique_ptr<TemporaryAllocator>> + device_allocator_; +#endif + + std::mutex mtx_; + + DISABLE_COPY_AND_ASSIGN(DeviceTemporaryAllocator); +}; + class DeviceContext { public: virtual ~DeviceContext() {} diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 0d10d82d74..ac86b38a61 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -110,7 +110,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) { } places.emplace_back(platform::CPUPlace()); platform::DeviceContextPool::Init(places); - + platform::DeviceTemporaryAllocator::Init(); #ifndef PADDLE_WITH_MKLDNN platform::SetNumThreads(FLAGS_paddle_num_threads); #endif diff --git a/paddle/fluid/platform/temporary_allocator.cc b/paddle/fluid/platform/temporary_allocator.cc new file mode 100644 index 0000000000..0be017f75b --- /dev/null +++ b/paddle/fluid/platform/temporary_allocator.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
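For operator code, the intended call pattern is the one in the conv hunk at the top of this patch: fetch the per-(place, stream) allocator from the singleton, allocate scratch bytes, and optionally view them as a `Tensor` through `platform::GetTensor`. A condensed sketch; the `float` element type and the `shape` argument are illustrative, not taken from the patch:

```cpp
#include "paddle/fluid/platform/create_tensor_with_allocationptr.h"
#include "paddle/fluid/platform/device_context.h"

namespace paddle {

// Borrow scratch memory tied to the context's stream and wrap it in a Tensor.
// The buffer is reclaimed through the temporary-allocation queue once the
// returned Tensor drops its holder.
framework::Tensor MakeScratch(const platform::CUDADeviceContext& dev_ctx,
                              const framework::DDim& shape) {
  auto ptr =
      platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
          framework::product(shape) * sizeof(float));
  return platform::GetTensor<float>(std::move(ptr), shape);
}

}  // namespace paddle
```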
+ +#include "paddle/fluid/platform/temporary_allocator.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" + +DEFINE_double(limit_of_temporary_allocation, -1, + "The up limit of temporary_allocation size."); + +namespace paddle { +namespace platform { +namespace alloc = memory::allocation; + +TemporaryAllocation::TemporaryAllocation( + alloc::AllocationPtr &&underlying_allocation) + : Allocation(underlying_allocation->ptr(), underlying_allocation->size(), + underlying_allocation->place()), + underlying_allocation_(std::move(underlying_allocation)) {} + +TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) { + temp_mem_queue_.reset(new std::deque()); +} + +bool TemporaryAllocator::IsAllocThreadSafe() const { return true; } + +void TemporaryAllocator::Release(const std::function &callback) { + std::shared_ptr> t_allocations; + { + std::unique_lock lock(mtx_); + callback(); + t_allocations = temp_mem_queue_; + temp_mem_queue_.reset(new std::deque()); + wait_delete_mem_ = 0; + } + for (auto tmp : *t_allocations) { + VLOG(10) << "Delete temporary allocation " << tmp->ptr() + << " size: " << tmp->size(); + delete tmp; + } +} + +void TemporaryAllocator::Free(alloc::Allocation *allocation) { + auto *temp_allocation = dynamic_cast(allocation); + PADDLE_ENFORCE_NOT_NULL(temp_allocation); + if (platform::is_gpu_place(temp_allocation->place())) { + size_t wait_delete_mem = 0; + { + std::unique_lock lock(mtx_); + temp_mem_queue_->emplace_back(temp_allocation); + wait_delete_mem_ += temp_allocation->size(); + wait_delete_mem = wait_delete_mem_; + VLOG(10) << "Move temporary allocation: " << temp_allocation->ptr() + << " to delete queue: " << temp_allocation->size() << "; " + << "wait_delete_mem: " << wait_delete_mem_; + } + if (FLAGS_limit_of_temporary_allocation > 0 && + wait_delete_mem > FLAGS_limit_of_temporary_allocation) { + Release(callback_); + } + return; + } + delete temp_allocation; +} + +size_t TemporaryAllocator::TemporaryAllocationQueueSize() { + std::unique_lock lock(mtx_); + return temp_mem_queue_ ? temp_mem_queue_->size() : 0; +} + +void TemporaryAllocator::SetCallback(const std::function &callback) { + callback_ = callback; +} + +alloc::Allocation *TemporaryAllocator::AllocateImpl( + size_t size, alloc::Allocator::Attr attr) { + auto raw_allocation = + alloc::AllocatorFacade::Instance().Alloc(place_, size, attr); + auto temp_mem = new TemporaryAllocation(std::move(raw_allocation)); + VLOG(10) << "Alloc temporary allocation: " << temp_mem->ptr() << ": " << size; + return temp_mem; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h new file mode 100644 index 0000000000..4e32d2d695 --- /dev/null +++ b/paddle/fluid/platform/temporary_allocator.h @@ -0,0 +1,63 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include // NOLINT +#include +#include // NOLINT +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/lock_guard_ptr.h" +namespace paddle { +namespace platform { + +class TemporaryAllocation : public memory::allocation::Allocation { + public: + explicit TemporaryAllocation( + memory::allocation::AllocationPtr &&underlying_allocation); + + memory::allocation::AllocationPtr underlying_allocation_; +}; + +class TemporaryAllocator : public memory::allocation::Allocator { + public: + explicit TemporaryAllocator(platform::Place place); + + void Release(const std::function<void()> &callback); + + size_t TemporaryAllocationQueueSize(); + + bool IsAllocThreadSafe() const override; + + void SetCallback(const std::function<void()> &callback); + + protected: + void Free(memory::allocation::Allocation *allocation) override; + + memory::allocation::Allocation *AllocateImpl( + size_t size, memory::allocation::Allocator::Attr attr) override; + + private: + platform::Place place_; + + // When the allocation is not held by any variable, it should be placed + // in temp_mem_queue immediately. + std::shared_ptr<std::deque<TemporaryAllocation *>> temp_mem_queue_{nullptr}; + + std::mutex mtx_; + size_t wait_delete_mem_{0}; + std::function<void()> callback_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/temporary_allocator_test.cc b/paddle/fluid/platform/temporary_allocator_test.cc new file mode 100644 index 0000000000..3b940b0e82 --- /dev/null +++ b/paddle/fluid/platform/temporary_allocator_test.cc @@ -0,0 +1,165 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "paddle/fluid/platform/temporary_allocator.h" +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/create_tensor_with_allocationptr.h" +DECLARE_double(limit_of_temporary_allocation); + +namespace paddle { +namespace platform { + +TEST(temporary_allocator, temporary_allocator) { + platform::CPUPlace cpu_place; + TemporaryAllocator alloc(cpu_place); + alloc.Allocate(100); + +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu_place(0); + TemporaryAllocator gpu_alloc(gpu_place); + + auto allocation = gpu_alloc.Allocate(101); + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); + gpu_alloc.Release([]() {}); + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); + + { + auto allocation = gpu_alloc.Allocate(102); + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); + } + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); + gpu_alloc.Release([]() {}); + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); +#endif +} + +TEST(temporary_allocator, add_callback) { +#ifdef PADDLE_WITH_CUDA + FLAGS_limit_of_temporary_allocation = 10; + platform::CUDAPlace gpu_place(0); + TemporaryAllocator gpu_alloc(gpu_place); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = + static_cast(pool.Get(gpu_place)); + auto stream = dev_ctx->stream(); + bool deleted = false; + gpu_alloc.SetCallback([stream, &deleted]() { + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE(cudaGetLastError()); + deleted = true; + }); + { gpu_alloc.Allocate(100); } + PADDLE_ENFORCE(deleted); + FLAGS_limit_of_temporary_allocation = -1; +#endif +} + +TEST(temporary_allocator, create_tensor_with_allocationptr) { + platform::CPUPlace cpu_place; + TemporaryAllocator cpu_alloc(cpu_place); + { + size_t memory_size = 200; + auto allocation = cpu_alloc.Allocate(memory_size); + void* address = allocation->ptr(); + int numel = memory_size / sizeof(float); + framework::Tensor tensor = + GetTensor(std::move(allocation), framework::make_ddim({numel})); + PADDLE_ENFORCE_EQ(address, tensor.data()); + PADDLE_ENFORCE_EQ(tensor.numel(), numel); + } + +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu_place(0); + TemporaryAllocator gpu_alloc(gpu_place); + + { + size_t memory_size = 300; + auto allocation = gpu_alloc.Allocate(memory_size); + void* address = allocation->ptr(); + int numel = memory_size / sizeof(float); + framework::Tensor tensor = + GetTensor(std::move(allocation), framework::make_ddim({numel})); + PADDLE_ENFORCE_EQ(address, tensor.data()); + PADDLE_ENFORCE_EQ(tensor.numel(), numel); + } + + // The allocation is not holded now, it should be placed to + // TemporaryAllocationQueue. 
+ PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); + gpu_alloc.Release([]() {}); + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); +#endif +} + +TEST(temporary_allocator, create_tensor_with_allocationptr2) { + platform::CPUPlace cpu_place; + TemporaryAllocator cpu_alloc(cpu_place); + { + size_t memory_size = 400; + int numel = memory_size / sizeof(float); + + framework::Tensor out_side_tensor; + void* address; + { + auto allocation = cpu_alloc.Allocate(memory_size); + address = allocation->ptr(); + framework::Tensor tensor = GetTensor<float>( + std::move(allocation), framework::make_ddim({numel})); + PADDLE_ENFORCE_EQ(address, tensor.data<float>()); + PADDLE_ENFORCE_EQ(tensor.numel(), numel); + + out_side_tensor.ShareDataWith(tensor); + } + PADDLE_ENFORCE_EQ(address, out_side_tensor.data<float>()); + PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel); + } + +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu_place(0); + TemporaryAllocator gpu_alloc(gpu_place); + { + void* address; + size_t memory_size = 500; + int numel = memory_size / sizeof(float); + framework::Tensor out_side_tensor; + { + auto allocation = gpu_alloc.Allocate(memory_size); + address = allocation->ptr(); + framework::Tensor tensor = GetTensor<float>( + std::move(allocation), framework::make_ddim({numel})); + PADDLE_ENFORCE_EQ(address, tensor.data<float>()); + PADDLE_ENFORCE_EQ(tensor.numel(), numel); + + out_side_tensor.ShareDataWith(tensor); + } + PADDLE_ENFORCE_EQ(address, out_side_tensor.data<float>()); + PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel); + // The allocation is held by out_side_tensor. + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); + gpu_alloc.Release([]() {}); + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); + } + + // The allocation is not held now; it should be placed in the + // TemporaryAllocationQueue. + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); + gpu_alloc.Release([]() {}); + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); +#endif +} + +} // namespace platform +} // namespace paddle From 68ab16444abc54db77caae99a40f841d53016a81 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 21 Dec 2018 08:04:41 +0000 Subject: [PATCH 25/28] add English doc of jit kernel and follow comments test=develop --- paddle/fluid/operators/jit/README.en.md | 76 +++++++++++++++++++++++++ paddle/fluid/operators/jit/README.md | 40 ++++++------- paddle/fluid/operators/jit/gen_base.h | 2 + 3 files changed, 98 insertions(+), 20 deletions(-) create mode 100644 paddle/fluid/operators/jit/README.en.md diff --git a/paddle/fluid/operators/jit/README.en.md b/paddle/fluid/operators/jit/README.en.md new file mode 100644 index 0000000000..8670ec2ff2 --- /dev/null +++ b/paddle/fluid/operators/jit/README.en.md @@ -0,0 +1,76 @@ +# JIT Kernel + +A JIT (Just-In-Time) kernel contains actually generated code and some other implementations with the same logic. +Each implementation has its own condition for use, defined in `UseMe`. +They are combined together to get the best performance out of one single independent function. +They could be some very simple functions, like vector multiplication, or some complicated functions, like LSTM. +And they can be composed with other existing JIT kernels to build up a complex function. +Currently it is only supported on CPU. + +## Contents + +```txt +PaddlePaddle/Paddle/paddle/fluid/ +├── ... +└── operators/ + ├── .../ + └── jit/ + ├── ... + ├── gen/ + │ └── ... + ├── more/ + │ ├── ... + │ ├── mkl/ + │ │ └── ... + │ ├── mkldnn/ + │ │ └── ...
+ │   ├── mix/ + │   │   └── ... + │   ├── intrinsic/ + │   │   └── ... + │   └── openblas/ + │   └── ... + └── refer/ + └── ... +``` + +All basic definitions of the JIT kernels are kept in `paddle/fluid/operators/jit`, including the three key folders `refer`, `gen`, `more`. Each kernel has one unique name, but it may have several implementations with the same functionality. + +- `refer`: Each kernel must have one reference implementation on CPU, and it should focus only on correctness and must not depend on any third-party library. +- `gen`: The generated code should be kept here. It should be designed focusing on the best performance, which depends on Xbyak. +- `more`: All other implementations should be kept in this folder, with one directory corresponding to one library kind or method kind, such as mkl, mkldnn, openblas or intrinsic code. Each implementation should have its own advantage. + +## How to use + +A single function, `jit::Get`, is provided to obtain a kernel; it is very easy to use. +It automatically returns the expected function with the best performance under the given attributes. +All kernels are included in `paddle/fluid/operators/jit/kernels.h`; including this one header is enough to get all the registered kernels. + +## Solid Test + +- Unit Test + All functions should be compared with the corresponding reference functions, covering the data types `float` and `double`. +- Benchmark + All functions should be tested to make sure `jit::Get` obtains the best performance with all attributes. + +# How to add new kernel + +## Required + +1. Add `your_key` to `KernelType`. +2. Add the reference function of `your_key`. +Note: + - it should be run on CPU and must not depend on any third-party library. + - Add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to make sure this code can be used. +3. Add a unit test in `test.cc`, and verify at least `float` and `double`. +Test more data types for some special functions if necessary, for example `int8`. +4. Add functions in `benchmark.cc` to test all functions of the same `KernelType`. Make sure `jit::Get` always gets the best one. + +## Optional + +Add more implementations of `your_key` for performance enhancement. + +1. Add functions based on generated code in `gen`. They should be derived from `JitCode` and should have a corresponding creator derived from `JitCodeCreator`, which will be registered under `your_key`. +Note: add new `KernelTuples` if necessary; you can refer to `XYZNTuples`. +Specialize the method `JitCodeKey` when adding a new attribute type. +2. Add more functions in `more`; you can use any third party you wish, like mkl, mkldnn or intrinsic code, to reach the best performance. diff --git a/paddle/fluid/operators/jit/README.md b/paddle/fluid/operators/jit/README.md index 89180b5900..cc19f09f56 100644 --- a/paddle/fluid/operators/jit/README.md +++ b/paddle/fluid/operators/jit/README.md @@ -10,26 +10,26 @@ ```txt PaddlePaddle/Paddle/paddle/fluid/ ├── ... -├── operator/ -│ ├── .../ -└── jit/ - ├── ... - ├── gen/ - │ └── ... - |── more/ - │ ├── ... - │ ├── mkl/ - │ │ └── ... - │ ├── mkldnn/ - │ │ └── ... - │ ├── mix/ - │ │ └── ... - │ ├── intrinsic/ - │ │ └── ... - │ └── openblas/ - │ └── ... - └── refer/ - └── ... +└── operators/ + ├── .../ + └── jit/ + ├── ... + ├── gen/ + │ └── ... + ├── more/ + │ ├── ... + │ ├── mkl/ + │ │ └── ... + │ ├── mkldnn/ + │ │ └── ... + │ ├── mix/ + │ │ └── ... + │ ├── intrinsic/ + │ │ └── ... + │ └── openblas/ + │ └── ... + └── refer/ + └── ...
``` The definitions of the basic classes are kept in the root directory, which contains the three directories gen, more and refer. Each directory holds one or more kinds of implementations; every kernel operator needs a reference implementation, which serves as the baseline for unit tests, while all other implementations are optional. diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index 48855abd26..4af01a4376 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -36,6 +36,8 @@ class GenBase : public Kernel { if (FLAGS_dump_jitcode) { this->dumpCode(code); } + // Note: reinterpret_cast alone failed on Mac clang, so it is worked + // around with const_cast. Any better idea is appreciated. return reinterpret_cast(const_cast(code)); } From 00dadb072062d79633c86894919b7de7b6245550 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Mon, 24 Dec 2018 14:40:27 +0800 Subject: [PATCH 26/28] fix Apple cuDNN compilation error test=develop (#15003) --- cmake/cudnn.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 09bec347db..fb899e3d7c 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -44,9 +44,9 @@ if(WIN32) set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll") endif(WIN32) -if(Apple) +if(APPLE) set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so") -endif(Apple) +endif(APPLE) find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist} From 938705745e4b0734db183ae2307b5557aa79dda7 Mon Sep 17 00:00:00 2001 From: whs Date: Mon, 24 Dec 2018 16:43:53 +0800 Subject: [PATCH 27/28] Init paddle slim (#14834) * Init slim. * Remove distillation demo. * Fix import errors. test=develop * Fix some issues. test=develop * Fix configs. test=develop * Modify API.spec. test=develop * Fix format. test=develop * Fix format. test=develop * Add some comments. --- paddle/fluid/API.spec | 17 +++ python/paddle/fluid/contrib/__init__.py | 3 + python/paddle/fluid/contrib/slim/__init__.py | 25 ++++ .../fluid/contrib/slim/core/__init__.py | 24 ++++ .../fluid/contrib/slim/core/compress_pass.py | 129 ++++++++++++++++++ .../paddle/fluid/contrib/slim/core/config.py | 111 +++++++++++++++ .../fluid/contrib/slim/core/pass_builder.py | 39 ++++++ .../fluid/contrib/slim/core/strategy.py | 48 +++++++ .../slim/demo/filter_prune/config.yaml | 28 ++++ .../contrib/slim/demo/filter_prune/demo.py | 69 ++++++++++ .../fluid/contrib/slim/graph/__init__.py | 23 ++++ .../fluid/contrib/slim/graph/executor.py | 62 +++++++++ .../paddle/fluid/contrib/slim/graph/graph.py | 45 ++++++ .../fluid/contrib/slim/graph/graph_pass.py | 42 ++++++ .../fluid/contrib/slim/prune/__init__.py | 21 +++ .../contrib/slim/prune/prune_strategy.py | 66 +++++++++ .../paddle/fluid/contrib/slim/prune/pruner.py | 83 +++++++++++ .../fluid/contrib/slim/unitest/__init__.py | 13 ++ .../contrib/slim/unitest/configs/config.yaml | 29 ++++ .../contrib/slim/unitest/configs/pruners.yaml | 12 ++ .../slim/unitest/configs/pruners_0.yaml | 12 ++ .../contrib/slim/unitest/test_factory.py | 41 ++++++ python/requirements.txt | 2 + python/setup.py.in | 4 + 24 files changed, 948 insertions(+) create mode 100644 python/paddle/fluid/contrib/slim/__init__.py create mode 100644 python/paddle/fluid/contrib/slim/core/__init__.py create mode 100644 python/paddle/fluid/contrib/slim/core/compress_pass.py create mode 100644 python/paddle/fluid/contrib/slim/core/config.py create mode 100644 python/paddle/fluid/contrib/slim/core/pass_builder.py create mode 100644 python/paddle/fluid/contrib/slim/core/strategy.py create mode 100644 python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml create mode
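To connect the English README above with code: a sketch of the `jit::Get` lookup it describes. The exact kernel-type enum value and tuple type names below (`kVMul`, `XYZNTuples`) are illustrative of the API shape, not taken verbatim from this patch:

```cpp
#include "paddle/fluid/operators/jit/kernels.h"

namespace jit = paddle::operators::jit;

// Fetch the best available implementation (gen > more > refer) of a
// vector-multiply kernel for problem size n on CPU, then call it like a
// plain function pointer.
void VMulExample(const float* x, const float* y, float* z, int n) {
  auto vmul =
      jit::Get<jit::kVMul, jit::XYZNTuples<float>, paddle::platform::CPUPlace>(
          n);
  vmul(x, y, z, n);
}
```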
100644 python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py create mode 100644 python/paddle/fluid/contrib/slim/graph/__init__.py create mode 100644 python/paddle/fluid/contrib/slim/graph/executor.py create mode 100644 python/paddle/fluid/contrib/slim/graph/graph.py create mode 100644 python/paddle/fluid/contrib/slim/graph/graph_pass.py create mode 100644 python/paddle/fluid/contrib/slim/prune/__init__.py create mode 100644 python/paddle/fluid/contrib/slim/prune/prune_strategy.py create mode 100644 python/paddle/fluid/contrib/slim/prune/pruner.py create mode 100644 python/paddle/fluid/contrib/slim/unitest/__init__.py create mode 100644 python/paddle/fluid/contrib/slim/unitest/configs/config.yaml create mode 100644 python/paddle/fluid/contrib/slim/unitest/configs/pruners.yaml create mode 100644 python/paddle/fluid/contrib/slim/unitest/configs/pruners_0.yaml create mode 100644 python/paddle/fluid/contrib/slim/unitest/test_factory.py diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index e3b4449925..b6974c6af2 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -351,6 +351,23 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)) paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) +paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) +paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.CompressPass.apply ArgSpec(args=['self', 'graph'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.ImitationGraph.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.ImitationGraph.all_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.SensitivePruneStrategy.__init__ ArgSpec(args=['self', 'pruner', 'start_epoch', 'end_epoch', 'delta_rate', 'acc_loss_threshold', 'sensitivities'], varargs=None, keywords=None, defaults=(None, 0, 10, 0.2, 0.2, None)) +paddle.fluid.contrib.SensitivePruneStrategy.on_batch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.SensitivePruneStrategy.on_batch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.SensitivePruneStrategy.on_compress_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.SensitivePruneStrategy.on_compress_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, 
defaults=None) +paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.MagnitudePruner.__init__ ArgSpec(args=['self', 'threshold'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.MagnitudePruner.prune ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.RatioPruner.__init__ ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.RatioPruner.prune ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.load_persistables_for_increment ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.load_persistables_for_inference ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.convert_dist_to_sparse_program ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None) diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py index ece97b661f..24621110b1 100644 --- a/python/paddle/fluid/contrib/__init__.py +++ b/python/paddle/fluid/contrib/__init__.py @@ -22,6 +22,8 @@ from . import op_frequence from .op_frequence import * from . import quantize from .quantize import * +from . import slim +from .slim import * from . import utils from .utils import * @@ -30,4 +32,5 @@ __all__ += decoder.__all__ __all__ += memory_usage_calc.__all__ __all__ += op_frequence.__all__ __all__ += quantize.__all__ +__all__ += slim.__all__ __all__ += utils.__all__ diff --git a/python/paddle/fluid/contrib/slim/__init__.py b/python/paddle/fluid/contrib/slim/__init__.py new file mode 100644 index 0000000000..22dbf7c8b6 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import * +from .graph import * +from .prune import * +__all__ = [ + 'build_compressor', + 'CompressPass', + 'ImitationGraph', + 'SensitivePruneStrategy', + 'MagnitudePruner', + 'RatioPruner', +] diff --git a/python/paddle/fluid/contrib/slim/core/__init__.py b/python/paddle/fluid/contrib/slim/core/__init__.py new file mode 100644 index 0000000000..7826d5830a --- /dev/null +++ b/python/paddle/fluid/contrib/slim/core/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import config +from .config import * +from . import compress_pass +from .compress_pass import * +from . import strategy +from .strategy import * +from . import pass_builder +from .pass_builder import * + +__all__ = config.__all__ + compress_pass.__all__ + strategy.__all__ + pass_builder.__all__ diff --git a/python/paddle/fluid/contrib/slim/core/compress_pass.py b/python/paddle/fluid/contrib/slim/core/compress_pass.py new file mode 100644 index 0000000000..c4c348b878 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/core/compress_pass.py @@ -0,0 +1,129 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ....core import CPUPlace +from ..graph import get_executor + +__all__ = ['Context', 'CompressPass'] + + +class Context(object): + """ + The context in the process of compression. + Args: + exe: The executor used to execute graph. + graph: The graph to be compressed. + scope: The scope used to execute graph. + program_exe: The program_exe is used to execute the program + created for modifying the variables in scope. + """ + + def __init__(self, exe, graph, scope, program_exe=None): + # The total number of epochs to be trained. + self.epoch = 0 + # Current epoch + self.epoch_id = 0 + # Current batch + self.batch_id = 0 + self.exe = exe + self.graph = graph + self.scope = scope + self.program_exe = program_exe + + +class CompressPass(object): + """ + The pass used to compress a model. + Args: + place: The device used in compression. + data_reader: The data_reader used to run graph. + data_feeder: The data_feeder used to run graph. + scope: The scope used to run graph. + metrics: The metrics for evaluating the model. + epoch: The total number of epochs of training in compression. + program_exe: The program_exe is used to execute the program + created for modifying the variables in scope. + """ + + def __init__(self, + place=None, + data_reader=None, + data_feeder=None, + scope=None, + metrics=None, + epoch=None, + program_exe=None): + self.strategies = [] + self.place = CPUPlace() if place is None else place + self.data_reader = data_reader + self.data_feeder = data_feeder + self.scope = scope + self.metrics = metrics + self.epoch = epoch + self.program_exe = program_exe + + def add_strategy(self, strategy): + """ + Add a strategy to the current compress pass. + Args: + strategy: The strategy to be added to the current compress pass. + """ + self.strategies.append(strategy) + self.epoch = max(strategy.end_epoch, self.epoch) + + def apply(self, graph): + """ + Compress a model. + Args: + graph: The target graph to be compressed.
+ """ + self.executor = get_executor(graph, self.place) + context = Context( + self.executor, graph, self.scope, program_exe=self.program_exe) + + for strategy in self.strategies: + strategy.on_compress_begin(context) + + for epoch in range(self.epoch): + + for strategy in self.strategies: + strategy.on_epoch_begin(context) + + for data in self.data_reader(): + + for strategy in self.strategies: + strategy.on_batch_begin(context) + fetches = None + if self.metrics: + fetches = self.metrics.values() + feed = None + if self.data_feeder: + feed = self.data_feeder.feed(data) + results = self.executor.run(graph, + fetches=fetches, + scope=self.scope, + feed=feed) + if results: + print("results: {}".format( + zip(self.metrics.keys(), results))) + for strategy in self.strategies: + strategy.on_batch_end(context) + context.batch_id += 1 + + for strategy in self.strategies: + strategy.on_epoch_end(context) + context.epoch_id += 1 + + for strategy in self.strategies: + strategy.on_compress_end(context) diff --git a/python/paddle/fluid/contrib/slim/core/config.py b/python/paddle/fluid/contrib/slim/core/config.py new file mode 100644 index 0000000000..811c457003 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/core/config.py @@ -0,0 +1,111 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import funcsigs +import yaml +from collections import OrderedDict +from ..prune import * +from .compress_pass import * +from .strategy import * + +__all__ = ['ConfigFactory'] +"""This factory is used to create instances by loading and parsing configure file with yaml format. +""" + + +class ConfigFactory(object): + def __init__(self, config): + """Init a factory from configure file.""" + self.instances = {} + self.version = None + self._parse_config(config) + + def get_compress_pass(self): + """ + Get compress pass from factory. + """ + return self.instance('compress_pass') + + def instance(self, name): + """ + Get instance from factory. 
+ """ + if name in self.instances: + return self.instances[name] + else: + return None + + def _new_instance(self, name, attrs): + if name not in self.instances: + class_ = globals()[attrs['class']] + sig = funcsigs.signature(class_.__init__) + keys = [ + param.name for param in sig.parameters.values() + if (param.kind == param.POSITIONAL_OR_KEYWORD) + ][1:] + keys = set(attrs.keys()).intersection(set(keys)) + args = {} + for key in keys: + value = attrs[key] + if isinstance(value, str) and value in self.instances: + value = self.instances[value] + args[key] = value + self.instances[name] = class_(**args) + return self.instances.get(name) + + def _parse_config(self, config): + assert config + with open(config, 'r') as config_file: + key_values = self._ordered_load(config_file) + for key in key_values: + # parse version + if key == 'version' and self.version is None: + self.version = int(key_values['version']) + assert self.version == int(key_values['version']) + + # parse pruners + if key == 'pruners' or key == 'strategies': + instances = key_values[key] + for name in instances: + self._new_instance(name, instances[name]) + + if key == 'compress_pass': + compress_pass = self._new_instance(key, key_values[key]) + for name in key_values[key]['strategies']: + strategy = self.instance(name) + compress_pass.add_strategy(strategy) + + if key == 'include': + for config_file in key_values[key]: + self._parse_config(config_file.strip()) + + def _ordered_load(self, + stream, + Loader=yaml.Loader, + object_pairs_hook=OrderedDict): + """ + See: https://stackoverflow.com/questions/5121931/in-python-how-can-you-load-yaml-mappings-as-ordereddicts + """ + + class OrderedLoader(Loader): + pass + + def construct_mapping(loader, node): + loader.flatten_mapping(node) + return object_pairs_hook(loader.construct_pairs(node)) + + OrderedLoader.add_constructor( + yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, construct_mapping) + return yaml.load(stream, OrderedLoader) diff --git a/python/paddle/fluid/contrib/slim/core/pass_builder.py b/python/paddle/fluid/contrib/slim/core/pass_builder.py new file mode 100644 index 0000000000..fc1ddc94e0 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/core/pass_builder.py @@ -0,0 +1,39 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .compress_pass import CompressPass +from .config import ConfigFactory + +__all__ = ['build_compressor'] + + +def build_compressor(place=None, + data_reader=None, + data_feeder=None, + scope=None, + metrics=None, + epoch=None, + config=None): + if config is not None: + factory = ConfigFactory(config) + comp_pass = factory.get_compress_pass() + else: + comp_pass = CompressPass() + comp_pass.place = place + comp_pass.data_reader = data_reader + comp_pass.data_feeder = data_feeder + comp_pass.scope = scope + comp_pass.metrics = metrics + comp_pass.epoch = epoch + return comp_pass diff --git a/python/paddle/fluid/contrib/slim/core/strategy.py b/python/paddle/fluid/contrib/slim/core/strategy.py new file mode 100644 index 0000000000..74d98e98b0 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/core/strategy.py @@ -0,0 +1,48 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['Strategy'] + + +class Strategy(object): + """ + Base class for all strategies. + """ + + def __init__(self, start_epoch=0, end_epoch=10): + """ + Args: + start_epoch: The first epoch to apply the strategy. + end_epoch: The last epoch to apply the strategy. + """ + self.start_epoch = start_epoch + self.end_epoch = end_epoch + + def on_compress_begin(self, context): + pass + + def on_epoch_begin(self, context): + pass + + def on_epoch_end(self, context): + pass + + def on_batch_begin(self, context): + pass + + def on_batch_end(self, context): + pass + + def on_compress_end(self, context): + pass diff --git a/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml b/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml new file mode 100644 index 0000000000..ea888fa2c7 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml @@ -0,0 +1,28 @@ +version: 1.0 +pruners: + pruner_1: + class: 'RatioPruner' + ratios: + 'conv1_1.w': 0.3 + 'conv1_2.w': 0.4 + '*': 0.9 + group_dims: + '*': [1, 2, 3] + criterions: + '*': 'l1-norm' +strategies: + strategy_1: + class: 'SensitivePruneStrategy' + pruner: 'pruner_1' + start_epoch: 0 + end_epoch: 10 + delta_rate: 0.20 + acc_loss_threshold: 0.2 + sensitivities: + 'conv1_1.w': 0.4 + +compress_pass: + class: 'CompressPass' + epoch: 100 + strategies: + - strategy_1 diff --git a/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py b/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py new file mode 100644 index 0000000000..21c59c0c9d --- /dev/null +++ b/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py @@ -0,0 +1,69 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +import paddle +import os +import sys +from paddle.fluid.contrib.slim import CompressPass +from paddle.fluid.contrib.slim import build_compressor +from paddle.fluid.contrib.slim import ImitationGraph + + +class LinearModel(object): + def __init__(self): + pass + + def train(self): + train_program = fluid.Program() + startup_program = fluid.Program() + startup_program.random_seed = 10 + with fluid.program_guard(train_program, startup_program): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=predict, label=y) + avg_cost = fluid.layers.mean(cost) + eval_program = train_program.clone() + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(avg_cost) + + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + eval_reader = paddle.batch( + paddle.dataset.uci_housing.test(), batch_size=1) + place = fluid.CPUPlace() + train_feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + eval_feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(startup_program) + train_metrics = {"loss": avg_cost.name} + eval_metrics = {"loss": avg_cost.name} + + graph = ImitationGraph(train_program) + config = './config.yaml' + comp_pass = build_compressor( + place, + data_reader=train_reader, + data_feeder=train_feeder, + scope=fluid.global_scope(), + metrics=train_metrics, + epoch=1, + config=config) + comp_pass.apply(graph) + + +if __name__ == "__main__": + model = LinearModel() + model.train() diff --git a/python/paddle/fluid/contrib/slim/graph/__init__.py b/python/paddle/fluid/contrib/slim/graph/__init__.py new file mode 100644 index 0000000000..d65472d193 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/graph/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import executor +from .executor import * +from . import graph +from .graph import * +from . import graph_pass +from .graph_pass import * +__all__ = executor.__all__ +__all__ += graph.__all__ +__all__ += graph_pass.__all__ diff --git a/python/paddle/fluid/contrib/slim/graph/executor.py b/python/paddle/fluid/contrib/slim/graph/executor.py new file mode 100644 index 0000000000..c02c3af820 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/graph/executor.py @@ -0,0 +1,62 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import abc +from abc import abstractmethod +from .... import executor +from .graph import IRGraph, ImitationGraph + +__all__ = ['get_executor'] + + +class GraphExecutor(object): + __metaclass__ = abc.ABCMeta + + def __init__(self, place): + self.place = place + + @abstractmethod + def run(self, graph, fetches=None, feed=None): + pass + + +class IRGraphExecutor(GraphExecutor): + def run(self, graph, fetches, feed=None): + pass + + +class ImitationGraphExecutor(GraphExecutor): + def __init__(self, place): + super(ImitationGraphExecutor, self).__init__(place) + self.exe = executor.Executor(place) + + def run(self, graph, scope=None, fetches=None, feed=None): + assert isinstance(graph, ImitationGraph) + fetch_list = None + if fetches: + fetch_list = [ + graph.program.global_block().var(name) for name in fetches + ] + results = self.exe.run(graph.program, + scope=scope, + fetch_list=fetch_list, + feed=feed) + return results + + +def get_executor(graph, place): + if isinstance(graph, ImitationGraph): + return ImitationGraphExecutor(place) + if isinstance(graph, IRGraph): + return IRGraphExecutor(place) diff --git a/python/paddle/fluid/contrib/slim/graph/graph.py b/python/paddle/fluid/contrib/slim/graph/graph.py new file mode 100644 index 0000000000..7d6b070203 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/graph/graph.py @@ -0,0 +1,45 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ....framework import Program + +__all__ = ['Graph', 'ImitationGraph', 'IRGraph'] + + +class Graph(object): + """ + Base class for all graphs. + """ + + def __init__(self): + pass + + def all_parameters(self): + """ + Return all the parameters in the current graph. + """ + pass + + +class ImitationGraph(Graph): + def __init__(self, program=None): + super(ImitationGraph, self).__init__() + self.program = Program() if program is None else program + + def all_parameters(self): + return self.program.global_block().all_parameters() + + +class IRGraph(Graph): + pass diff --git a/python/paddle/fluid/contrib/slim/graph/graph_pass.py b/python/paddle/fluid/contrib/slim/graph/graph_pass.py new file mode 100644 index 0000000000..1db6c4f110 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/graph/graph_pass.py @@ -0,0 +1,42 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
diff --git a/python/paddle/fluid/contrib/slim/graph/graph_pass.py b/python/paddle/fluid/contrib/slim/graph/graph_pass.py
new file mode 100644
index 0000000000..1db6c4f110
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/graph/graph_pass.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['GraphPass', 'PruneParameterPass']
+
+
+class GraphPass(object):
+    """
+    Base class for all graph passes.
+    """
+
+    def __init__(self):
+        pass
+
+    def apply(self, graph):
+        pass
+
+
+class PruneParameterPass(GraphPass):
+    """
+    Generate a graph for pruning parameters from the target graph.
+    """
+
+    def __init__(self, pruned_params, thresholds):
+        super(PruneParameterPass, self).__init__()
+        self.pruned_params = pruned_params
+        self.thresholds = thresholds
+        self.default_threshold = thresholds['*']
+
+    def apply(self, graph):
+        pass
diff --git a/python/paddle/fluid/contrib/slim/prune/__init__.py b/python/paddle/fluid/contrib/slim/prune/__init__.py
new file mode 100644
index 0000000000..764a45bb13
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/prune/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import pruner
+from .pruner import *
+from . import prune_strategy
+from .prune_strategy import *
+
+__all__ = pruner.__all__
+__all__ += prune_strategy.__all__
diff --git a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
new file mode 100644
index 0000000000..34c5107daa
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..core.strategy import Strategy
+from ....framework import Program, program_guard
+from .... import layers
+import numpy as np
+
+__all__ = ['SensitivePruneStrategy', 'PruneStrategy']
+
+
+class SensitivePruneStrategy(Strategy):
+    def __init__(self,
+                 pruner=None,
+                 start_epoch=0,
+                 end_epoch=10,
+                 delta_rate=0.20,
+                 acc_loss_threshold=0.2,
+                 sensitivities=None):
+        super(SensitivePruneStrategy, self).__init__(start_epoch, end_epoch)
+        self.pruner = pruner
+        self.delta_rate = delta_rate
+        self.acc_loss_threshold = acc_loss_threshold
+        self.sensitivities = sensitivities
+
+
+class PruneStrategy(Strategy):
+    """
+    A strategy that prunes weights by threshold or ratio iteratively.
+    """
+
+    def __init__(self,
+                 pruner,
+                 mini_batch_pruning_frequency=1,
+                 start_epoch=0,
+                 end_epoch=10):
+        super(PruneStrategy, self).__init__(start_epoch, end_epoch)
+        self.pruner = pruner
+        self.mini_batch_pruning_frequency = mini_batch_pruning_frequency
+
+    def _trigger(self, context):
+        return (context.batch_id % self.mini_batch_pruning_frequency == 0 and
+                self.start_epoch <= context.epoch_id < self.end_epoch)
+
+    def on_batch_end(self, context):
+        if self._trigger(context):
+            prune_program = Program()
+            with program_guard(prune_program):
+                for param in context.graph.all_parameters():
+                    prune_program.global_block().clone_variable(param)
+                    p = prune_program.global_block().var(param.name)
+                    zeros_mask = self.pruner.prune(p)
+                    pruned_param = p * zeros_mask
+                    layers.assign(input=pruned_param, output=param)
+            context.program_exe.run(prune_program, scope=context.scope)
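
For intuition, here is the mask arithmetic that the pruners in the next file build out of fluid layers, written in plain numpy. This is an illustrative sketch, not part of the patch: for a ratio `rat`, the threshold is the k-th largest entry with k = rat * size, and the mask flags the entries below it.

import numpy as np

def ratio_zeros_mask(param, rat):
    # Mirrors RatioPruner.prune below: find the k-th largest entry
    # (k = rat * size) and flag everything smaller than it.
    if rat >= 1.0:
        return np.ones(param.shape, dtype=param.dtype)
    k = max(int(rat * np.prod(param.shape)), 1)
    topk = np.sort(param.reshape(-1))[::-1][:k]  # the k largest entries
    threshold = topk[-1]
    return (param < threshold).astype(param.dtype)

w = np.array([[0.9, 0.1], [0.4, 0.8]], dtype='float32')
print(ratio_zeros_mask(w, 0.5))  # flags 0.1 and 0.4, the entries below the 2nd-largest
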
+ """ + if ratio is None: + rat = self.ratios[ + param.name] if param.name in self.ratios else self.ratios['*'] + else: + rat = ratio + if rat < 1.0: + k = max(int(rat * np.prod(param.shape)), 1) + param_vec = layers.reshape(x=param, shape=[1, -1]) + param_topk, _ = layers.topk(param_vec, k=k) + threshold = layers.slice( + param_topk, axes=[1], starts=[-1], ends=[k]) + threshold = layers.reshape(x=threshold, shape=[1]) + zeros_mask = layers.less_than(x=param, y=threshold) + else: + zeros_mask = layers.ones(param.shape) + return zeros_mask diff --git a/python/paddle/fluid/contrib/slim/unitest/__init__.py b/python/paddle/fluid/contrib/slim/unitest/__init__.py new file mode 100644 index 0000000000..6d41233e22 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/unitest/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/fluid/contrib/slim/unitest/configs/config.yaml b/python/paddle/fluid/contrib/slim/unitest/configs/config.yaml new file mode 100644 index 0000000000..db488b9633 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/unitest/configs/config.yaml @@ -0,0 +1,29 @@ +version: 1.0 +include: ["./unitest/configs/pruners.yaml", "./unitest/configs/pruners_0.yaml"] +pruners: + pruner_1: + class: 'RatioPruner' + ratios: + 'conv1_1.w': 0.3 + 'conv1_2.w': 0.4 + '*': 0.9 + group_dims: + '*': [1, 2, 3] + criterions: + '*': 'l1-norm' +strategies: + strategy_1: + class: 'SensitivePruneStrategy' + pruner: 'pruner_2' + start_epoch: 0 + end_epoch: 10 + delta_rate: 0.20 + acc_loss_threshold: 0.2 + sensitivities: + 'conv1_1.w': 0.4 + +compress_pass: + class: 'CompressPass' + epoch: 100 + strategies: + - strategy_1 diff --git a/python/paddle/fluid/contrib/slim/unitest/configs/pruners.yaml b/python/paddle/fluid/contrib/slim/unitest/configs/pruners.yaml new file mode 100644 index 0000000000..235092c595 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/unitest/configs/pruners.yaml @@ -0,0 +1,12 @@ +version: 1.0 +pruners: + pruner_2: + class: 'RatioPruner' + ratios: + 'conv1_1.w': 0.5 + 'conv1_2.w': 0.2 + '*': 0.7 + group_dims: + '*': [1, 2, 3] + criterions: + '*': 'l1-norm' diff --git a/python/paddle/fluid/contrib/slim/unitest/configs/pruners_0.yaml b/python/paddle/fluid/contrib/slim/unitest/configs/pruners_0.yaml new file mode 100644 index 0000000000..cd2ef9eb56 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/unitest/configs/pruners_0.yaml @@ -0,0 +1,12 @@ +version: 1.0 +pruners: + pruner_3: + class: 'RatioPruner' + ratios: + 'conv1_1.w': 0.5 + 'conv1_2.w': 0.2 + '*': 0.7 + group_dims: + '*': [1, 2, 3] + criterions: + '*': 'l1-norm' diff --git a/python/paddle/fluid/contrib/slim/unitest/test_factory.py b/python/paddle/fluid/contrib/slim/unitest/test_factory.py new file mode 100644 index 0000000000..07f28aac90 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/unitest/test_factory.py @@ -0,0 +1,41 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
diff --git a/python/paddle/fluid/contrib/slim/unitest/test_factory.py b/python/paddle/fluid/contrib/slim/unitest/test_factory.py
new file mode 100644
index 0000000000..07f28aac90
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/unitest/test_factory.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.fluid.contrib.slim import ConfigFactory
+import unittest
+
+
+class TestFactory(unittest.TestCase):
+    def test_parse(self):
+        factory = ConfigFactory('./unitest/configs/config.yaml')
+
+        pruner = factory.instance('pruner_1')
+        self.assertEquals(pruner.ratios['conv1_1.w'], 0.3)
+
+        pruner = factory.instance('pruner_2')
+        self.assertEquals(pruner.ratios['*'], 0.7)
+
+        strategy = factory.instance('strategy_1')
+        pruner = strategy.pruner
+        self.assertEquals(pruner.ratios['*'], 0.7)
+
+        compress_pass = factory.get_compress_pass()
+        self.assertEquals(compress_pass.epoch, 100)
+
+        strategy = compress_pass.strategies[0]
+        self.assertEquals(strategy.delta_rate, 0.2)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/requirements.txt b/python/requirements.txt
index 2f81d85df0..03d5e33e88 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -9,3 +9,5 @@ Pillow
 nltk>=3.2.2
 graphviz
 six
+funcsigs
+pyyaml
diff --git a/python/setup.py.in b/python/setup.py.in
index bfcd47f5b0..c9afe6c885 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -109,6 +109,10 @@ packages=['paddle',
           'paddle.fluid.contrib',
           'paddle.fluid.contrib.decoder',
           'paddle.fluid.contrib.quantize',
+          'paddle.fluid.contrib.slim',
+          'paddle.fluid.contrib.slim.core',
+          'paddle.fluid.contrib.slim.graph',
+          'paddle.fluid.contrib.slim.prune',
           'paddle.fluid.contrib.utils',
           'paddle.fluid.transpiler',
           'paddle.fluid.transpiler.details']

From 51a9fca3239ac7578eb739e0d44136ebeaec969d Mon Sep 17 00:00:00 2001
From: qingqing01
Date: Mon, 24 Dec 2018 18:52:58 +0800
Subject: [PATCH 28/28] Async memory copy (#15013)

---
 paddle/fluid/inference/api/analysis_predictor.cc          | 7 +++++--
 paddle/fluid/inference/api/api_impl.cc                    | 7 +++++--
 paddle/fluid/operators/detection/density_prior_box_op.cu  | 5 +++--
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index c751e85158..3937884ce4 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -231,11 +231,14 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                    inputs[i].data.length());
     } else {
 #ifdef PADDLE_WITH_CUDA
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto *dev_ctx =
+          static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
       auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
       memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
                    platform::CPUPlace(), inputs[i].data.data(),
-                   inputs[i].data.length(),
-                   0);  // stream 0 for sync copy
+                   inputs[i].data.length(), dev_ctx->stream());
 #else
       PADDLE_THROW("Not compile with CUDA, should not reach here.");
 #endif
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 3d121e0460..102147a493 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -208,11 +208,14 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                    inputs[i].data.length());
     } else {
 #ifdef PADDLE_WITH_CUDA
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto *dev_ctx =
+          static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
       auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
       memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
                    platform::CPUPlace(), inputs[i].data.data(),
-                   inputs[i].data.length(),
-                   0);  // stream 0 for sync copy
+                   inputs[i].data.length(), dev_ctx->stream());
 #else
       PADDLE_THROW("Not compile with CUDA, should not reach here.");
 #endif
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu
index 6a92762896..acd5993154 100644
--- a/paddle/fluid/operators/detection/density_prior_box_op.cu
+++ b/paddle/fluid/operators/detection/density_prior_box_op.cu
@@ -142,12 +142,13 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> {
     vars->mutable_data<T>(ctx.GetPlace());
 
     framework::Tensor d_temp;
-    framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp);
+    framework::TensorCopy(h_temp, ctx.GetPlace(), &d_temp);
 
     // At least use 32 threads, at most 512 threads.
     // blockx is multiple of 32.
     int blockx = std::min(
-        static_cast<int64_t>(((feature_width * num_priors + 31) >> 5) << 5), 512L);
+        static_cast<int64_t>(((feature_width * num_priors + 31) >> 5) << 5),
+        512L);
     int gridx = (feature_width * num_priors + blockx - 1) / blockx;
     dim3 threads(blockx, 1);
     dim3 grids(gridx, feature_height);