Merge branch 'develop' into sequence_enumerate_op

7 years ago · 4ec12496dd
parent 733ea0d29b 4fcc293617
commit 4ec12496dd
75 changed files with 1473 additions and 343 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -24,6 +24,9 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
        "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
 message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
        "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
 # Windows builds: static library file names get a "lib" prefix.
 if(WIN32)
     set(CMAKE_STATIC_LIBRARY_PREFIX "lib")
 endif()
 if(NOT CMAKE_CROSSCOMPILING)
    find_package(CUDA QUIET)
@ -165,7 +168,6 @@ include(external/python)    # download, build, install python
 include(external/openblas)  # download, build, install openblas
 include(external/mkldnn)    # download, build, install mkldnn
 include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
 include(external/boost)     # download boost
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
@ -173,6 +175,14 @@ include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/cub)
 # The modules below are only included off Windows. Per the original note,
 # snappystream, warpctc, nccl and cupti have no official Windows support.
 if(NOT WIN32)
     include(external/snappy)        # download snappy
     include(external/snappystream)  # download snappystream
     include(external/warpctc)       # download, build, install warpctc
     include(cupti)
 endif()
 if(WITH_DISTRIBUTE)
    if(WITH_GRPC)
        include(external/grpc)
@ -194,13 +204,10 @@ if(WITH_BRPC_RDMA)
    endif()
 endif()
 include(external/snappy)    # download snappy
 include(external/snappystream)
 include(external/threadpool)
 include(external/threadpool)
 include(flags)              # set paddle compile flags
 include(cudnn)              # set cudnn libraries, must before configure
 include(cupti)
 include(configure)          # add paddle env configuration
 if(WITH_GPU)
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@ -61,6 +61,11 @@ if(NOT CMAKE_CROSSCOMPILING)
    endif()
 endif()
 # Windows builds: define _XKEYCHECK_H for every target in scope.
 # NOTE(review): presumably this pre-defines a guard of an MSVC header so its
 # keyword check is skipped - confirm.
 if(WIN32)
   add_definitions(-D_XKEYCHECK_H)
 endif()
 if(NOT WITH_GOLANG)
    add_definitions(-DPADDLE_WITHOUT_GOLANG)
 endif(NOT WITH_GOLANG)
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@ -28,7 +28,12 @@ if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL))
    set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
    set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
 endif()
-MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
+# Report where Boost comes from. On Windows the download step further down is
 # guarded by NOT WIN32, so the user is warned to provide Boost manually.
 # NOTE(review): the path printed here is not the same as BOOST_INCLUDE_DIR set
 # below - confirm which location the Windows build actually expects.
 if(WIN32)
     # The mode keyword must be a bare WARNING. "WARNING," (with a comma) is not
     # a keyword, so it would be printed as part of an ordinary notice.
     message(WARNING "In windows, boost can not be downloaded automatically, please build it manually and put it at ${THIRD_PARTY_PATH}/install/boost")
 else()
     message(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
 endif()
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
 set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
 set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
@ -36,12 +41,13 @@ set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
 include_directories(${BOOST_INCLUDE_DIR})
 if (NOT WIN32)
 ExternalProject_Add(
    ${BOOST_PROJECT}
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DOWNLOAD_DIR          ${BOOST_DOWNLOAD_DIR}
    DOWNLOAD_COMMAND      wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz
-                          && tar zxf ${BOOST_TAR}.tar.gz
+    && tar zxf ${BOOST_TAR}.tar.gz
    DOWNLOAD_NO_PROGRESS  1
    PREFIX                ${BOOST_SOURCES_DIR}
    CONFIGURE_COMMAND     ""
@ -49,8 +55,9 @@ ExternalProject_Add(
    INSTALL_COMMAND       ""
    UPDATE_COMMAND        ""
 )
 endif(NOT WIN32)
-if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32)
    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
    file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
    add_library(boost STATIC ${dummyfile})
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@ -18,7 +18,7 @@ SET(GFLAGS_SOURCES_DIR ${THIRD_PARTY_PATH}/gflags)
 SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags)
 SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE)
 IF(WIN32)
-  set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
+  set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
 ELSE(WIN32)
  set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
 ENDIF(WIN32)
@ -45,7 +45,13 @@ ExternalProject_Add(
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
-
+# Windows: after extern_gflags builds, rename gflags_static.lib to
 # libgflags.lib, the file name GFLAGS_LIBRARIES points at on WIN32.
 if(WIN32)
   # NOTE(review): this existence check runs at configure time, so the rename
   # step is only registered when libgflags.lib is not present yet.
   if(NOT EXISTS "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib")
     # Run the same CMake that performs this configure instead of whatever
     # "cmake" happens to be first on PATH; quote paths so spaces survive.
     add_custom_command(TARGET extern_gflags POST_BUILD
       COMMAND ${CMAKE_COMMAND} -E rename
               "${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib"
               "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib"
       VERBATIM
     )
   endif()
 endif()
 ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
 ADD_DEPENDENCIES(gflags extern_gflags)
@ -60,3 +66,4 @@ IF(WITH_C_API)
    INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib)
  ENDIF()
 ENDIF()
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@ -60,6 +60,13 @@ ExternalProject_Add(
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
 # Windows: after extern_glog builds, rename glog.lib to libglog.lib.
 if(WIN32)
   # NOTE(review): this existence check runs at configure time, so the rename
   # step is only registered when libglog.lib is not present yet.
   if(NOT EXISTS "${GLOG_INSTALL_DIR}/lib/libglog.lib")
     # Run the same CMake that performs this configure instead of whatever
     # "cmake" happens to be first on PATH; quote paths so spaces survive.
     add_custom_command(TARGET extern_glog POST_BUILD
       COMMAND ${CMAKE_COMMAND} -E rename
               "${GLOG_INSTALL_DIR}/lib/glog.lib"
               "${GLOG_INSTALL_DIR}/lib/libglog.lib"
       VERBATIM
     )
   endif()
 endif()
 ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@ -54,7 +54,7 @@ ExternalProject_Add(
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS             ${MKLDNN_DEPENDS}
    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "a29d8487a63afca3d5b8c5bbdbb473cf8ccc6e51"
+    GIT_TAG             "64e03a1939e0d526aa8e9f2e3f7dc0ad8d372944"
    PREFIX              ${MKLDNN_SOURCES_DIR}
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@ -17,20 +17,29 @@ IF(USE_EIGEN_FOR_BLAS)
 ENDIF(USE_EIGEN_FOR_BLAS)
 INCLUDE(cblas)
 # IF(WIN32 AND NOT ${CBLAS_FOUND})
 IF(NOT ${CBLAS_FOUND})
    INCLUDE(ExternalProject)
    SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
    SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
-    SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
+    SET(CBLAS_INCLUDE_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
    SET(CBLAS_LIBRARIES
        "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
        CACHE FILEPATH "openblas library." FORCE)
    ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
    # Windows: the ExternalProject build below is guarded by NOT WIN32, so mark
    # CBLAS as found and ask the user to supply a prebuilt OpenBLAS.
    if(WIN32)
        set(CBLAS_FOUND true)
        # The mode keyword must be a bare WARNING (no trailing comma), otherwise
        # it is treated as message text and no CMake warning is raised.
        message(WARNING "In windows, openblas only support msvc build, please build it manually and put it at ${CBLAS_INSTALL_DIR}")
    endif()
    IF (NOT WIN32)
    SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
    SET(OPENBLAS_COMMIT "v0.2.20")
@ -69,7 +78,6 @@ IF(NOT ${CBLAS_FOUND})
    ENDIF()
    SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
    ExternalProject_Add(
        extern_openblas
        ${EXTERNAL_PROJECT_LOG_ARGS}
@ -84,9 +92,11 @@ IF(NOT ${CBLAS_FOUND})
        UPDATE_COMMAND      ""
        CONFIGURE_COMMAND   ""
    )
    ELSE()
    ENDIF(NOT WIN32)
    SET(CBLAS_PROVIDER openblas)
    IF(WITH_C_API)
-        INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas)
+        INSTALL(DIRECTORY ${CBLAS_INCLUDE_DIR} DESTINATION third_party/openblas)
        # Because libopenblas.a is a symbolic link of another library, thus need to
        # install the whole directory.
        IF(ANDROID)
@ -107,7 +117,8 @@ IF(NOT ${CBLAS_FOUND})
 ENDIF(NOT ${CBLAS_FOUND})
 MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
-INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
+MESSAGE(STATUS "BLAS Include: ${CBLAS_INCLUDE_DIR}")
 INCLUDE_DIRECTORIES(${CBLAS_INCLUDE_DIR})
 # FIXME(gangliao): generate cblas target to track all high performance
 # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@ -14,11 +14,14 @@
 INCLUDE(ExternalProject)
 # Invoke `FIND_PACKAGE(Protobuf)` to import the function protobuf_generate_cpp.
 # The call is skipped on Windows.
 IF(NOT WIN32)
 FIND_PACKAGE(Protobuf QUIET)
 ENDIF(NOT WIN32)
 # Drop the variable named by VAR_NAME from the cache and then from the
 # current scope.
 macro(UNSET_VAR VAR_NAME)
     unset(${VAR_NAME} CACHE)
     unset(${VAR_NAME})
 endmacro()
 UNSET_VAR(PROTOBUF_INCLUDE_DIR)
 UNSET_VAR(PROTOBUF_FOUND)
 UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
@ -94,12 +97,14 @@ macro(PROMPT_PROTOBUF_LIB)
    SET(protobuf_DEPS ${ARGN})
    MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}")
    MESSAGE(STATUS "Protobuf-lite library: ${PROTOBUF_LITE_LIBRARY}")
    MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}")
    MESSAGE(STATUS "Protoc library: ${PROTOBUF_PROTOC_LIBRARY}")
    MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}")
    INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
    # Assuming that all the protobuf libraries are of the same type.
-    IF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_STATIC_LIBRARY_SUFFIX}$")
+    IF(${PROTOBUF_LIBRARY} MATCHES ${CMAKE_STATIC_LIBRARY_SUFFIX})
        SET(protobuf_LIBTYPE STATIC)
    ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$")
        SET(protobuf_LIBTYPE SHARED)
@ -137,18 +142,25 @@ macro(SET_PROTOBUF_VERSION)
 endmacro()
 set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
 # Windows: point PROTOBUF_ROOT at the third-party install tree and ask the
 # user to place a prebuilt protobuf there.
 if(WIN32)
     set(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf)
     # The mode keyword must be a bare WARNING (no trailing comma), otherwise
     # it is treated as message text and no CMake warning is raised.
     message(WARNING "In windows, protobuf only support msvc build, please build it manually and put it at ${PROTOBUF_ROOT}")
 endif()
 if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
-    find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_LIBRARY protobuf libprotobuf.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite libprotobuf-lite.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-    find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_PROTOC_LIBRARY protoc libprotoc.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
    find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
    if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
        message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
        SET(PROTOBUF_FOUND true)
        SET_PROTOBUF_VERSION()
        PROMPT_PROTOBUF_LIB()
    else()
-        message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}.")
+        message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}")
    endif()
 endif()
@ -239,6 +251,7 @@ IF(CMAKE_CROSSCOMPILING)
        CACHE FILEPATH "protobuf executable." FORCE)
 ENDIF()
 IF(NOT PROTOBUF_FOUND)
    build_protobuf(extern_protobuf FALSE)
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@ -148,7 +148,8 @@ function(merge_static_libs TARGET_NAME)
      COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
      )
-  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
+  endif(APPLE)
  if(LINUX) # general UNIX: use "ar" to extract objects and re-add to a common lib
    set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir)
    foreach(lib ${libs})
@ -187,7 +188,36 @@ function(merge_static_libs TARGET_NAME)
        COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'`
        COMMAND ${CMAKE_RANLIB} ${target_LIBNAME}
        WORKING_DIRECTORY ${target_DIR})
-  endif()
+  endif(LINUX)
  if(WIN32)
    # Per the original note, gcc/nvcc combined compiling is not supported on
    # Windows, so the input libraries are merged with the MSVC "lib" tool.
    # Write the dummy source that backs the placeholder static library.
    file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
    # Touch the dummy source whenever an input library changes, which forces
    # the merged target to relink.
    add_custom_command(OUTPUT ${target_SRCS}
      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
      DEPENDS ${libs})
    add_library(${TARGET_NAME} STATIC ${target_SRCS})
    target_link_libraries(${TARGET_NAME} ${libs_deps})
    # Collect the built file of every library that is to be merged.
    foreach(merged_lib ${libs})
      list(APPEND libfiles $<TARGET_FILE:${merged_lib}>)
    endforeach()
    # The previous .lib is not deleted first: per the original note, the
    # Windows "del" command returns an error in a clean environment.
    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
      COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.lib ${libfiles}
      )
  endif()
 endfunction(merge_static_libs)
 function(cc_library TARGET_NAME)
@ -195,6 +225,10 @@ function(cc_library TARGET_NAME)
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS)
  cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  # Windows only: cache <TARGET_NAME>_LIB_NAME, the library file name built
  # from the static-library prefix, the target name and the static suffix.
  if(WIN32)
    set(${TARGET_NAME}_LIB_NAME
        "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}"
        CACHE STRING "output library name for target ${TARGET_NAME}")
  endif()
  if(cc_library_SRCS)
    if(cc_library_SHARED OR cc_library_shared) # build *.so
      add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@ -101,6 +101,7 @@ if(WITH_MKLDNN)
  )
 endif()
 if (NOT WIN32)
 if(NOT MOBILE_INFERENCE AND NOT RPI)
  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
  copy(snappy_lib
@ -120,15 +121,23 @@ if(NOT MOBILE_INFERENCE AND NOT RPI)
    DSTS ${dst_dir} ${dst_dir}/lib
    DEPS zlib)
 endif()
 endif(NOT WIN32)
 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
 set(module "framework")
 # Copy the framework headers into the install tree. Off Windows the copy also
 # depends on framework_py_proto; on Windows it has no extra dependency.
 if(WIN32)
   copy(framework_lib
     SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
     DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
   )
 else()
   copy(framework_lib DEPS framework_py_proto
     SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
     DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
   )
 endif()
 set(module "memory")
 copy(memory_lib
--- a/doc/fluid/dev/releasing_process_en.md
+++ b/doc/fluid/dev/releasing_process_en.md
@ -50,6 +50,33 @@ pop-up box, choose the current release branch and click "Run Build" button. You
 * PyPI does not allow overwriting an already uploaded version of a wheel package, even if you delete the
   old version. You must change the version number before uploading a new one.
 ### Publish wheel Packages for MacOS
 You need to build the binary wheel package for MacOS before publishing. To
 make sure that the package can be used by many versions of MacOS
 (10.11, 10.12, 10.13) and different python installs (python.org, homebrew, etc.),
 you must build the package by following ***exactly*** the steps below:
 Build steps:
 1. install python from python.org downloads, and make sure it's currently in use
   in your system.
 1. `export MACOSX_DEPLOYMENT_TARGET=10.11`; targeting `10.11` is enough for recent versions.
 1. `git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle && mkdir build && cd build`
 1. `cmake -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_SYSTEM_BLAS=OFF  ..`, make sure the output of `cmake` command is using the correct python interpreter installed from python.org
 1. `make -j`
 1. `pip install delocate`
 1. `mkdir fixed_wheel && delocate-wheel -w fixed_wheel python/dist/*.whl`
 Then the whl under `fixed_wheel` is ready to upload.
 Install steps:
 1. run `pip install paddlepaddle...whl`
 1. find the `libpython.dylib` that is currently in use:
    - for python.org package installs, do nothing.
    - for other python installs, find the path of `libpython*.dylib` and `export LD_LIBRARY_PATH=your path && export DYLD_LIBRARY_PATH=your path`
 ## Publish Docker Images
 Our CI tool will push latest images to DockerHub, so we only need to push a version tag like:
--- a/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
+++ b/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
@ -9,8 +9,6 @@ Paddle 预测 API
 -  头文件 ``paddle_inference_api.h`` 定义了所有的接口
 -  库文件\ ``libpaddle_fluid.so`` 或 ``libpaddle_fluid.a``
 -  库文件 ``libpaddle_inference_api.so`` 或
   ``libpaddle_inference_api.a``
 编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib` 。
@ -97,8 +95,7 @@ engine
    CHECK(predictor->Run(slots, &outputs));
    // 获取 outputs ...
-编译时，联编 ``libpaddle_fluid.a/.so`` 和
+编译时，联编 ``libpaddle_fluid.a/.so`` 即可。
 ``libpaddle_inference_api.a/.so`` 便可。
 详细代码参考
 ------------
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -163,6 +163,7 @@ paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], v
 paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
 paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None))
 paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@ -2,9 +2,13 @@ add_subdirectory(memory)
 add_subdirectory(platform)
 add_subdirectory(framework)
 add_subdirectory(operators)
 add_subdirectory(pybind)
 add_subdirectory(string)
 # pybind and recordio are only added off Windows.
 if(NOT WIN32)
   add_subdirectory(pybind)
   add_subdirectory(recordio)
 endif()
 if(WITH_INFERENCE)
  # NOTE: please add subdirectory inference at last.
  add_subdirectory(inference)
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -1,5 +1,7 @@
 add_subdirectory(details)
 add_subdirectory(ir)
 # The details subdirectory is only added off Windows.
 if(NOT WIN32)
   add_subdirectory(details)
 endif()
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)
@ -28,8 +30,12 @@ if(WITH_GPU)
 else()
  cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
 endif()
-
+# lod_tensor depends on recordio everywhere except on Windows.
 if(WIN32)
   cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
 else()
   cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
 endif()
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
@ -69,14 +75,22 @@ cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
 # operator depends on profiler everywhere except on Windows.
 if(WIN32)
   cc_library(operator SRCS operator.cc
     DEPS op_info device_context tensor scope glog shape_inference data_transform lod_tensor)
 else()
   cc_library(operator SRCS operator.cc
     DEPS op_info device_context tensor scope glog shape_inference data_transform lod_tensor profiler)
 endif()
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 if (NOT WIN32)
 py_proto_compile(framework_py_proto SRCS framework.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
@ -86,6 +100,7 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
    COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
    COMMENT "Copy generated python proto into directory paddle/fluid/proto."
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 endif(NOT WIN32)
 cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
@ -120,7 +135,9 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
 # cc_test(channel_test SRCS channel_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )
 # rw_lock_test is only registered off Windows.
 if(NOT WIN32)
   cc_test(rw_lock_test SRCS rw_lock_test.cc)
 endif()
 # disable test temporarily.
 # TODO https://github.com/PaddlePaddle/Paddle/issues/11971
--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@ -26,6 +26,7 @@ namespace framework {
 extern proto::VarType::Type ToDataType(std::type_index type);
 extern std::type_index ToTypeIndex(proto::VarType::Type type);
 #if !defined(_WIN32)
 template <typename Visitor>
 inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
  switch (type) {
@ -57,6 +58,40 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
      PADDLE_THROW("Not supported %d", type);
  }
 }
 #else
 // Windows (_WIN32) variant of VisitDataType. Per the original note, the MSVC
 // compiler does not implement two-stage name lookup correctly.
 // NOTE(review): presumably that is why the calls below are spelled
 // `visitor.operator()<T>()` - confirm against the non-Windows branch.
 //
 // Maps the runtime tag `type` to a C++ type T and invokes the visitor's
 // templated call operator as `visitor.operator()<T>()`. `visitor` is taken by
 // value. Tags without a case below reach `default`, which raises through
 // PADDLE_THROW.
 template <typename Visitor>
 inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
   switch (type) {
     case proto::VarType::FP16:
       visitor.operator()<platform::float16>();
       break;
     case proto::VarType::FP32:
       visitor.operator()<float>();
       break;
     case proto::VarType::FP64:
       visitor.operator()<double>();
       break;
     case proto::VarType::INT32:
       visitor.operator()<int>();
       break;
     case proto::VarType::INT64:
       visitor.operator()<int64_t>();
       break;
     case proto::VarType::BOOL:
       visitor.operator()<bool>();
       break;
     case proto::VarType::UINT8:
       visitor.operator()<uint8_t>();
       break;
     case proto::VarType::INT16:
       visitor.operator()<int16_t>();
       break;
     default:
       // Any tag not handled above is rejected.
       PADDLE_THROW("Not supported %d", type);
   }
 }
 #endif  // _WIN32
 extern std::string DataTypeToString(const proto::VarType::Type type);
 extern size_t SizeOfType(std::type_index type);
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@ -25,8 +25,10 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"
 #if !defined(_WIN32)
 #include "paddle/fluid/recordio/scanner.h"
 #include "paddle/fluid/recordio/writer.h"
 #endif  // _WIN32
 namespace paddle {
 namespace framework {
@ -300,6 +302,7 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
  TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
 }
 #if !defined(_WIN32)
 void WriteToRecordIO(recordio::Writer *writer,
                     const std::vector<LoDTensor> &tensor,
                     const platform::DeviceContext &dev_ctx) {
@ -329,7 +332,19 @@ bool ReadFromRecordIO(recordio::Scanner *scanner,
  return true;
 }
-
+#else
 // Windows build: the recordio headers are not included in this file, so the
 // functions below are stand-ins for the real implementations.
 // NOTE(review): these two empty classes are declared in the enclosing
 // namespace, not in `recordio`; presumably placeholders - confirm they are
 // still needed.
 class Writer {};
 class Scanner {};
 // Windows stand-in: does nothing with its arguments.
 // NOTE(review): callers get no signal that nothing was written - confirm
 // that a silent no-op is intended here.
 void WriteToRecordIO(recordio::Writer *writer,
                      const std::vector<LoDTensor> &tensor,
                      const platform::DeviceContext &dev_ctx) {}
 // Windows stand-in: always raises, because recordio is unavailable.
 // The previous code passed the message to PADDLE_ENFORCE as its condition; a
 // string literal is a non-null pointer, so the check could never fail and the
 // function returned true without reading anything.
 bool ReadFromRecordIO(recordio::Scanner *scanner,
                       const platform::DeviceContext &dev_ctx,
                       std::vector<LoDTensor> *result_ptr) {
   PADDLE_THROW("windows didn't supported recordio!.");
   return false;  // not reached; kept so every path visibly returns a value
 }
 #endif  // _WIN32
 std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
    const std::vector<platform::Place> places) const {
  check_memory_size();
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@ -274,6 +274,7 @@ TEST(LoD, ConvertToOffsetBasedLoD) {
  EXPECT_EQ(offset_lod, expected);
 }
 #if !defined(_WIN32)
 template <typename T>
 static void TestRecordIO() {
  LoDTensor tensor;
@ -320,6 +321,7 @@ TEST(LoDTensor, RecordIO) {
  TestRecordIO<float>();
  TestRecordIO<double>();
 }
 #endif  // !defined(_WIN32)
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@ -129,10 +129,6 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
                                    "Optimized for variable")
      .SetDefault({});
  AddAttr<std::vector<std::string>>(OpCreationCallstackAttrName(),
                                    "Callstack for Op Creatation.")
      .SetDefault({});
  Validate();
 }
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@ -39,7 +39,6 @@ class OpProtoAndCheckerMaker {
 public:
  static const char *OpRoleAttrName() { return "op_role"; }
  static const char *OpRoleVarAttrName() { return "op_role_var"; }
  static const char *OpCreationCallstackAttrName() { return "op_callstack"; }
  void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@ -11,17 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/framework/operator.h"
+#include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <algorithm>
-#include <sstream>
+
 #include <string>
 #include <vector>
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/platform/profiler.h"
@ -76,6 +74,12 @@ static DDim GetDims(const Scope& scope, const std::string& name,
  }
 }
 static bool VarInited(const Scope& scope, const std::string& name) {
  Variable* var = scope.FindVar(name);
  if (var == nullptr) return false;
  return var->IsInitialized();
 }
 static std::string GetDtype(const Scope& scope, const std::string& name) {
  Variable* var = scope.FindVar(name);
  if (var == nullptr) {
@ -89,8 +93,12 @@ static std::string GetDtype(const Scope& scope, const std::string& name) {
    }
    return DataTypeToString(ToDataType(tensor.type()));
  } else if (var->IsType<SelectedRows>()) {
-    return DataTypeToString(
+    auto tensor = var->Get<SelectedRows>().value();
-        ToDataType(var->Get<SelectedRows>().value().type()));
+    if (UNLIKELY(!tensor.IsInitialized())) {
      return "uninited";
    } else {
      return DataTypeToString(ToDataType(tensor.type()));
    }
  } else {
    return "";
  }
@ -129,48 +137,19 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
 }
 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
-  try {
+  VLOG(4) << place << " " << DebugStringEx(&scope);
-    if (VLOG_IS_ON(4)) {
+  if (platform::is_gpu_place(place)) {
      VLOG(4) << place << " " << DebugStringEx(&scope);
    }
    if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
-      PADDLE_THROW("Cannot run operator on place %s", place);
+    PADDLE_THROW("Cannot run operator on place %s", place);
 #else
-      auto dev_id = boost::get<platform::CUDAPlace>(place).device;
+    auto dev_id = boost::get<platform::CUDAPlace>(place).device;
-      platform::SetDeviceId(dev_id);
+    platform::SetDeviceId(dev_id);
 #endif
    }
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    platform::RecordEvent record_event(Type(), pool.Get(place));
    RunImpl(scope, place);
    if (VLOG_IS_ON(3)) {
      VLOG(3) << place << " " << DebugStringEx(&scope);
    }
  } catch (platform::EnforceNotMet exception) {
    if (Attrs().count("sub_block") != 0) {
      throw exception;
    }
    auto& callstack = Attr<std::vector<std::string>>(
        OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
    if (callstack.empty()) {
      throw exception;
    }
    std::ostringstream sout;
    sout << "Invoke operator " << Type() << " error.\n";
    sout << "Python Callstacks: \n";
    for (auto& line : callstack) {
      sout << line;
    }
    sout << "C++ Callstacks: \n";
    sout << exception.err_str_;
    exception.err_str_ = sout.str();
    throw exception;
  } catch (...) {
    std::rethrow_exception(std::current_exception());
  }
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  platform::RecordEvent record_event(Type(), pool.Get(place));
  RunImpl(scope, place);
  VLOG(3) << place << " " << DebugStringEx(&scope);
 }
 bool OperatorBase::HasInputs(const std::string& name) const {
@ -198,7 +177,7 @@ const std::vector<std::string>& OperatorBase::Inputs(
 }
 bool OperatorBase::HasOutputs(const std::string& name) const {
-  if (outputs_.end() != outputs_.find(name)) {
+  if (outputs_.find(name) != outputs_.end()) {
    return true;
  } else {
    return false;
@ -228,16 +207,21 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
    auto& input = *it;
    ss << input.first << "[";
    for (size_t i = 0; i < input.second.size(); ++i) {
-      ss << input.second[i];
+      auto var_name = input.second[i];
      ss << var_name;
      if (scope) {
-        int row_size = GetRowSize(*scope, input.second[i]);
+        if (!VarInited(*scope, var_name)) {
-        if (row_size >= 0) {
+          ss << "[uninited]";
-          ss << "[row_size=" << row_size << "]";
+        } else {
          int row_size = GetRowSize(*scope, var_name);
          if (row_size >= 0) {
            ss << "[row_size=" << row_size << "]";
          }
          std::string dtype = GetDtype(*scope, var_name);
          ss << ":" << dtype;
          ss << "[" << GetDims(*scope, var_name, true) << "]";
          ss << "(" << GetLoD(*scope, var_name) << ")";
        }
        std::string dtype = GetDtype(*scope, input.second[i]);
        ss << ":" << dtype;
        ss << "[" << GetDims(*scope, input.second[i], true) << "]";
        ss << "(" << GetLoD(*scope, input.second[i]) << ")";
      }
      if (i != input.second.size() - 1) {
        ss << ", ";
@ -254,14 +238,19 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
    auto& output = *it;
    ss << output.first << "[";
    for (size_t i = 0; i < output.second.size(); ++i) {
-      ss << output.second[i];
+      auto var_name = output.second[i];
      ss << var_name;
      if (scope) {
-        int row_size = GetRowSize(*scope, output.second[i]);
+        if (!VarInited(*scope, var_name)) {
-        if (row_size >= 0) {
+          ss << "[uninited]";
-          ss << "[row_size=" << row_size << "]";
+        } else {
          int row_size = GetRowSize(*scope, output.second[i]);
          if (row_size >= 0) {
            ss << "[row_size=" << row_size << "]";
          }
          ss << "[" << GetDims(*scope, var_name, true) << "]";
          ss << "(" << GetLoD(*scope, var_name) << ")";
        }
        ss << "[" << GetDims(*scope, output.second[i], true) << "]";
        ss << "(" << GetLoD(*scope, output.second[i]) << ")";
      }
      if (i != output.second.size() - 1) {
        ss << ", ";
--- a/paddle/fluid/framework/rw_lock.h
+++ b/paddle/fluid/framework/rw_lock.h
@ -14,13 +14,16 @@ limitations under the License. */
 #pragma once
 #if !defined(_WIN32)
 #include <pthread.h>
 #endif  // !_WIN32
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
 namespace framework {
 #if !defined(_WIN32)
 struct RWLock {
  RWLock() { pthread_rwlock_init(&lock_, nullptr); }
@ -43,6 +46,15 @@ struct RWLock {
 private:
  pthread_rwlock_t lock_;
 };
 #else
 // Fallback RWLock for builds where _WIN32 is defined (the #else branch of
 // the !defined(_WIN32) guard), where the pthread-based implementation is
 // not compiled. Background on pthread rwlock recursion:
 // https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive
 // On Windows this type is only a placeholder: an empty object whose methods
 // do nothing.
 // NOTE(review): since every method below has an empty body, this variant
 // provides no mutual exclusion at all -- confirm that Windows callers do not
 // depend on it for thread safety.
 struct RWLock {
  // No-op placeholder for acquiring the lock for reading.
  void RDLock() {}
  // No-op placeholder for acquiring the lock for writing.
  void WRLock() {}
  // No-op placeholder for releasing the lock.
  void UNLock() {}
 };
 #endif
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@ -31,7 +31,8 @@ size_t Tensor::memory_size() const {
  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
 }
-void* Tensor::mutable_data(platform::Place place, std::type_index type) {
+void* Tensor::mutable_data(platform::Place place, std::type_index type,
                           size_t requested_size) {
  if (holder_ != nullptr) {
    holder_->set_type(type);
  }
@ -39,7 +40,7 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type) {
                    "When calling this method, the Tensor's numel must be "
                    "equal or larger than zero. "
                    "Please check Tensor::Resize has been called first.");
-  int64_t size = numel() * SizeOfType(type);
+  size_t size = requested_size ? requested_size : numel() * SizeOfType(type);
  /* some versions of boost::variant don't have operator!= */
  if (holder_ == nullptr || !(holder_->place() == place) ||
      holder_->size() < size + offset_) {
@ -68,10 +69,10 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type) {
                                 offset_);
 }
-void* Tensor::mutable_data(platform::Place place) {
+void* Tensor::mutable_data(platform::Place place, size_t requested_size) {
  // Overload without an explicit element type: the type is taken from the
  // existing holder, so a holder must already be present (enforced below).
  PADDLE_ENFORCE(this->holder_ != nullptr,
                 "Cannot invoke mutable data if current hold nothing.");
  // Forward to the (place, type, requested_size) overload, passing
  // requested_size through unchanged.
-  return mutable_data(place, holder_->type());
+  return mutable_data(place, holder_->type(), requested_size);
 }
 Tensor& Tensor::ShareDataWith(const Tensor& src) {
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@ -89,22 +89,24 @@ class Tensor {
   * @note    If the memory block does not exist, it is allocated.
   */
  template <typename T>
-  T* mutable_data(platform::Place place);
+  T* mutable_data(platform::Place place, size_t requested_size = 0);
-  void* mutable_data(platform::Place place, std::type_index type);
+  void* mutable_data(platform::Place place, std::type_index type,
                     size_t requested_size = 0);
-  void* mutable_data(platform::Place place);
+  void* mutable_data(platform::Place place, size_t requested_size = 0);
  /**
   * @brief     Return a pointer to mutable memory block.
   *
-   * @param[in] dims    The dimensions of the memory block.
+   * @param[in] dims           The dimensions of the memory block.
-   * @param[in] place   The place of the memory block.
+   * @param[in] place          The place of the memory block.
   * @param[in] requested_size The size of the block in bytes.
   *
   * @note      If the memory block does not exist, it is allocated.
   */
  template <typename T>
-  T* mutable_data(DDim dims, platform::Place place);
+  T* mutable_data(DDim dims, platform::Place place, size_t requested_size = 0);
  /*! Return the dimensions of the memory block. */
  const DDim& dims() const;
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@ -46,16 +46,17 @@ inline T* Tensor::data() {
 }
 // Resizes this tensor to `dims` and then returns a typed mutable pointer by
 // delegating to mutable_data<T>(place, requested_size).
 // T must be a POD type; this is checked at compile time by the static_assert.
 template <typename T>
-inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
+inline T* Tensor::mutable_data(DDim dims, platform::Place place,
                                size_t requested_size) {
   static_assert(std::is_pod<T>::value, "T must be POD");
   // Resize first; the delegated call below runs after the tensor has `dims`.
   Resize(dims);
-  return mutable_data<T>(place);
+  return mutable_data<T>(place, requested_size);
  }
 // Typed convenience wrapper: calls the type-erased overload
 // mutable_data(place, typeid(T), requested_size) and reinterprets the
 // returned void* as T*.
 // T must be a POD type; this is checked at compile time by the static_assert.
 template <typename T>
-inline T* Tensor::mutable_data(platform::Place place) {
+inline T* Tensor::mutable_data(platform::Place place, size_t requested_size) {
   static_assert(std::is_pod<T>::value, "T must be POD");
-  return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
+  return reinterpret_cast<T*>(mutable_data(place, typeid(T), requested_size));
  }
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
--- a/Show More
+++ b/Show More