Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix-dist-sparse-decay

revert-15207-remove_op_handle_lock_and_fix_var
Qiao Longfei 6 years ago
commit b16e832d4d

@@ -55,6 +55,7 @@ option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF)
 option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF)
 option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF)
 option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF)
+option(WITH_JEMALLOC "Compile PaddlePaddle with jemalloc" OFF)
 option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
 option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
@@ -261,6 +262,12 @@ if (WITH_PROFILER)
     add_definitions(-DWITH_GPERFTOOLS)
 endif()
 
+if (WITH_JEMALLOC)
+    find_package(JeMalloc REQUIRED)
+    include_directories(${JEMALLOC_INCLUDE_DIR})
+    add_definitions(-DWITH_JEMALLOC)
+endif()
+
 include(generic)  # simplify cmake module
 include(package)  # set paddle packages
 include(ccache)   # set ccache for compilation
@@ -290,7 +297,7 @@ if(WITH_PSLIB)
     list(APPEND EXTERNAL_LIBS pslib_brpc)
     list(APPEND EXTERNAL_LIBS libmct)
 endif(WITH_PSLIB)
 
 if(WITH_AMD_GPU)
     find_package(HIP)
     include(hip)

@@ -94,52 +94,52 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
 # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
 # version(1.7.1 for now), which causes building documentation failed.
-RUN pip3 install -U wheel && \
-    pip3 install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
-    pip3.6 install -U wheel && \
-    pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \
-    pip3.7 install -U wheel && \
-    pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \
+RUN pip3 --no-cache-dir install -U wheel && \
+    pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
+    pip3.6 --no-cache-dir install -U wheel && \
+    pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
+    pip3.7 --no-cache-dir install -U wheel && \
+    pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
     easy_install -U pip && \
-    pip install -U pip setuptools wheel && \
-    pip install -U docopt PyYAML sphinx==1.5.6 && \
-    pip install sphinx-rtd-theme==0.1.9 recommonmark
+    pip --no-cache-dir install -U pip setuptools wheel && \
+    pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
+    pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark
 
-RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3 install opencv-python && \
-    pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.6 install opencv-python && \
-    pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.7 install opencv-python && \
-    pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip install opencv-python
+RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3 --no-cache-dir install opencv-python && \
+    pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3.6 --no-cache-dir install opencv-python && \
+    pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3.7 --no-cache-dir install opencv-python && \
+    pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip --no-cache-dir install opencv-python
 
 #For docstring checker
-RUN pip3 install pylint pytest astroid isort
-RUN pip3.6 install pylint pytest astroid isort
-RUN pip3.7 install pylint pytest astroid isort
-RUN pip install pylint pytest astroid isort LinkChecker
+RUN pip3 --no-cache-dir install pylint pytest astroid isort
+RUN pip3.6 --no-cache-dir install pylint pytest astroid isort
+RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
+RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker
 
 COPY ./python/requirements.txt /root/
-RUN pip3 install -r /root/requirements.txt
-RUN pip3.6 install -r /root/requirements.txt
-RUN pip3.7 install -r /root/requirements.txt
-RUN pip install -r /root/requirements.txt
+RUN pip3 --no-cache-dir install -r /root/requirements.txt
+RUN pip3.6 --no-cache-dir install -r /root/requirements.txt
+RUN pip3.7 --no-cache-dir install -r /root/requirements.txt
+RUN pip --no-cache-dir install -r /root/requirements.txt
 
 # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
-RUN apt-get install -y libssl-dev libffi-dev
-RUN pip3 install certifi urllib3[secure]
-RUN pip3.6 install certifi urllib3[secure]
-RUN pip3.7 install certifi urllib3[secure]
-RUN pip install certifi urllib3[secure]
+RUN apt-get install -y libssl-dev libffi-dev && apt-get clean -y
+RUN pip3 --no-cache-dir install certifi urllib3[secure]
+RUN pip3.6 --no-cache-dir install certifi urllib3[secure]
+RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
+RUN pip --no-cache-dir install certifi urllib3[secure]
 
 # Install woboq_codebrowser to /woboq
@@ -149,6 +149,14 @@ RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
     -DCMAKE_BUILD_TYPE=Release . \
     make)
 
+# ar mishandles 4GB files
+# https://sourceware.org/bugzilla/show_bug.cgi?id=14625
+# remove them when apt-get support 2.27 and higher version
+RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/2.27-9ubuntu1/binutils_2.27.orig.tar.gz && \
+    tar -xzf binutils_2.27.orig.tar.gz && \
+    cd binutils-2.27 && \
+    ./configure && make -j && make install && cd .. && rm -rf binutils-2.27 binutils_2.27.orig.tar.gz
+
 # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
 RUN mkdir /var/run/sshd
 RUN echo 'root:root' | chpasswd

@ -0,0 +1,21 @@
# - Find JeMalloc library
# Find the native JeMalloc includes and library
#
# JEMALLOC_INCLUDE_DIR - where to find jemalloc.h, etc.
# JEMALLOC_LIBRARIES - List of libraries when using jemalloc.
# JEMALLOC_FOUND - True if jemalloc found.
find_path(JEMALLOC_INCLUDE_DIR
NAMES jemalloc/jemalloc.h
HINTS ${JEMALLOC_ROOT_DIR}/include)
find_library(JEMALLOC_LIBRARIES
NAMES jemalloc
HINTS ${JEMALLOC_ROOT_DIR}/lib)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALLOC_INCLUDE_DIR)
mark_as_advanced(
JEMALLOC_LIBRARIES
JEMALLOC_INCLUDE_DIR)
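
For context, a minimal consumer of the WITH_JEMALLOC define added above might look like the following sketch. It is illustrative only, not part of this patch; it assumes the build adds -DWITH_JEMALLOC and links ${JEMALLOC_LIBRARIES} as the CMake blocks do, and it uses jemalloc's mallctl introspection call to confirm the allocator is active.

// jemalloc_check.cc -- illustrative sketch, not from the patch.
#include <cstdio>
#ifdef WITH_JEMALLOC
#include <jemalloc/jemalloc.h>
#endif

int main() {
#ifdef WITH_JEMALLOC
  const char* version = nullptr;
  size_t len = sizeof(version);
  // mallctl("version", ...) is jemalloc's introspection entry point.
  if (mallctl("version", &version, &len, nullptr, 0) == 0) {
    std::printf("jemalloc %s is active\n", version);
  }
#else
  std::printf("built without jemalloc\n");
#endif
  return 0;
}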

@@ -134,6 +134,7 @@ if(WITH_GPU)
     message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF")
     set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE)
   endif()
+  add_definitions(-DWITH_ANAKIN)
 endif()
 
 if(WITH_ANAKIN)
   # NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR

@@ -2,7 +2,7 @@ if(NOT WITH_GPU)
     return()
 endif()
 
-set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
+set(paddle_known_gpu_archs "30 35 50 52 60 61 70 75")
 set(paddle_known_gpu_archs7 "30 35 50 52")
 set(paddle_known_gpu_archs8 "30 35 50 52 60 61")

@@ -59,7 +59,7 @@ endfunction()
 # select_nvcc_arch_flags(out_variable)
 function(select_nvcc_arch_flags out_variable)
   # List of arch names
-  set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual")
+  set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual")
   set(archs_name_default "All")
   if(NOT CMAKE_CROSSCOMPILING)
     list(APPEND archs_names "Auto")

@@ -93,6 +93,8 @@ function(select_nvcc_arch_flags out_variable)
     set(cuda_arch_bin "60 61")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
     set(cuda_arch_bin "70")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
+    set(cuda_arch_bin "75")
   elseif(${CUDA_ARCH_NAME} STREQUAL "All")
     set(cuda_arch_bin ${paddle_known_gpu_archs})
   elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")

@@ -139,10 +141,12 @@ endfunction()
 message(STATUS "CUDA detected: " ${CUDA_VERSION})
 if (${CUDA_VERSION} LESS 7.0)
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
+  add_definitions("-DPADDLE_CUDA_BINVER=\"60\"")
 elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
   list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
   list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+  add_definitions("-DPADDLE_CUDA_BINVER=\"70\"")
 elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
   list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")

@@ -150,6 +154,7 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
   # CUDA 8 may complain that sm_20 is no longer supported. Suppress the
   # warning for now.
   list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
+  add_definitions("-DPADDLE_CUDA_BINVER=\"80\"")
 endif()
 
 include_directories(${CUDA_INCLUDE_DIRS})
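
A binary built with these definitions can report the CUDA toolkit it was compiled against. A minimal sketch of consuming the string define (the fallback macro here is an assumption, not part of the patch):

// binver_report.cc -- illustrative sketch, not from the patch.
#include <cstdio>

#ifndef PADDLE_CUDA_BINVER
#define PADDLE_CUDA_BINVER "0"  // assumed fallback for non-CUDA builds
#endif

int main() {
  // add_definitions above injects e.g. "70" or "80" as a quoted string.
  std::printf("compiled against CUDA binary version %s\n", PADDLE_CUDA_BINVER);
  return 0;
}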

@@ -89,6 +89,7 @@ if(CUDNN_FOUND)
   if(NOT CUDNN_MAJOR_VERSION)
     set(CUDNN_VERSION "???")
   else()
+    add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"")
     math(EXPR CUDNN_VERSION
          "${CUDNN_MAJOR_VERSION} * 1000 +
          ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")

@@ -32,4 +32,4 @@ endif()
 add_dependencies(cub extern_cub)
 
-LIST(APPEND externl_project_dependencies cub)
+LIST(APPEND external_project_dependencies cub)

@@ -28,4 +28,4 @@ endif()
 add_dependencies(dlpack extern_dlpack)
 
-LIST(APPEND externl_project_dependencies dlpack)
+LIST(APPEND external_project_dependencies dlpack)

@@ -106,10 +106,10 @@ else(WIN32)
     SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
     ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
             COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
-            DEPENDS mkldnn)
+            DEPENDS mkldnn shared_mkldnn)
 endif(WIN32)
 ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})
+ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn)
 IF(WITH_C_API)
   INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib)
 ENDIF()

@@ -16,14 +16,6 @@ IF(NOT ${WITH_MKLML})
   return()
 ENDIF(NOT ${WITH_MKLML})
 
-IF(APPLE)
-  MESSAGE(WARNING
-    "Mac is not supported with MKLML in Paddle yet."
-    "Force WITH_MKLML=OFF")
-  SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE)
-  return()
-ENDIF()
-
 INCLUDE(ExternalProject)
 SET(MKLML_DST_DIR       "mklml")
 SET(MKLML_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")

@@ -47,10 +39,13 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
 IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
   MESSAGE(STATUS "use pre defined download url")
   if(WIN32)
-    SET(MKLML_VER "mklml_win_2019.0.20180710" CACHE STRING "" FORCE)
+    SET(MKLML_VER "mklml_win_2019.0.1.20180928" CACHE STRING "" FORCE)
     SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
+  elseif(APPLE)
+    SET(MKLML_VER "mklml_mac_2019.0.1.20180928" CACHE STRING "" FORCE)
+    SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
   else()
-    SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE)
+    SET(MKLML_VER "mklml_lnx_2019.0.1.20180928" CACHE STRING "" FORCE)
     SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
   ENDIF()
 endif()

@@ -37,13 +37,12 @@ INCLUDE(GNUInstallDirs)
 INCLUDE(ExternalProject)
 
 SET(NGRAPH_PROJECT         "extern_ngraph")
-SET(NGRAPH_VERSION         "0.9")
-SET(NGRAPH_GIT_TAG         "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0")
+SET(NGRAPH_GIT_TAG         "08851c2c45fcf9fa9c74871dd3dbc3fe38f37cc9")
 SET(NGRAPH_SOURCES_DIR     ${THIRD_PARTY_PATH}/ngraph)
 SET(NGRAPH_INSTALL_DIR     ${THIRD_PARTY_PATH}/install/ngraph)
 SET(NGRAPH_INC_DIR         ${NGRAPH_INSTALL_DIR}/include)
 SET(NGRAPH_LIB_DIR         ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
-SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION})
+SET(NGRAPH_SHARED_LIB_NAME libngraph.so)
 SET(NGRAPH_CPU_LIB_NAME    libcpu_backend.so)
 SET(NGRAPH_TBB_LIB_NAME    libtbb.so.2)
 SET(NGRAPH_GIT_REPO        "https://github.com/NervanaSystems/ngraph.git")

@@ -115,6 +115,10 @@ function(common_link TARGET_NAME)
   if (WITH_PROFILER)
     target_link_libraries(${TARGET_NAME} gperftools::profiler)
   endif()
+
+  if (WITH_JEMALLOC)
+    target_link_libraries(${TARGET_NAME} ${JEMALLOC_LIBRARIES})
+  endif()
 endfunction()
@@ -228,7 +232,7 @@ function(merge_static_libs TARGET_NAME)
       # Get the file names of the libraries to be merged
       set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
     endforeach()
-    # msvc will put libarary in directory of "/Release/xxxlib" by default
+    #  COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib"
     add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
       COMMAND cmake -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}"

@@ -136,7 +136,7 @@ if (WITH_MKLDNN)
     copy(mkldnn_lib
       SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
       DSTS ${dst_dir} ${dst_dir}/lib
-      DEPS mkldnn
+      DEPS mkldnn_shared_lib
     )
 endif ()

@@ -110,7 +110,7 @@ function(op_library TARGET)
   # Define operators that don't need pybind here.
   foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
     "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
-    "fusion_transpose_flatten_concat_op")
+    "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op")
     if ("${TARGET}" STREQUAL "${manual_pybind_op}")
       set(pybind_flag 1)
     endif()

@@ -57,46 +57,43 @@ int main()
   return 0;
 }" SSE3_FOUND)
 
-# disable AVX by default on windows
-if(NOT WIN32)
-  # Check AVX
+# Check AVX
 set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
 set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
 {
   __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
   __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
   __m256 result = _mm256_add_ps (a, b);
   return 0;
 }" AVX_FOUND)
 
 # Check AVX 2
 set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
 set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
 {
   __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
   __m256i result = _mm256_abs_epi32 (a);
   return 0;
 }" AVX2_FOUND)
 
 # Check AVX512F
 set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
 set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
 {
   __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
                                 13, -5, 6, -7, 9, 2, -6, 3);
   __m512i result = _mm512_abs_epi32 (a);
   return 0;
 }" AVX512F_FOUND)
-endif(NOT WIN32)
 
 set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
 mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
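
These probes work by compiling and running tiny intrinsic programs. Extracted as a standalone file, the AVX probe is roughly the sketch below (assuming an x86 toolchain invoked with -mavx; the store is added here only so the result is observable):

// avx_probe.cc -- illustrative standalone version of the CHECK_CXX_SOURCE_RUNS probe.
#include <cstdio>
#include <immintrin.h>

int main() {
  __m256 a = _mm256_set_ps(-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
  __m256 b = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
  __m256 sum = _mm256_add_ps(a, b);  // executes only if CPU + compiler support AVX
  float out[8];
  _mm256_storeu_ps(out, sum);
  std::printf("lane 0 = %f\n", out[0]);
  return 0;  // exit code 0 marks the feature as found
}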

@@ -60,7 +60,7 @@ class Float16Transpiler:
             raise TypeError("place should be as CPUPlace/CUDAPlace type")
         if scope is None:
             scope = global_scope()
-        if not isinstance(scope, core.Scope):
+        if not isinstance(scope, core._Scope):
             raise TypeError("scope should be as Scope type or None")
         self.scope = scope

@@ -464,11 +464,7 @@ paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, ke
 paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
 paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
-paddle.fluid.Scope.__init__ __init__(self: paddle.fluid.core.Scope) -> None
-paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None
-paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
-paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope
-paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
+paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope
 paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None)
 paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None)
 paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None)

@@ -7,27 +7,17 @@ function(windows_symbolic TARGET)
   cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
   set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
   foreach(src ${windows_symbolic_SRCS})
     get_filename_component(src ${src} NAME_WE)
-    if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu)
+    if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu)
       message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.")
     endif()
-    #only copy the xx.cu to.xx.cu when the content are modified
-    set(copy_flag 1)
-    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu)
-      file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR)
-      file(READ ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu TARGET_STR)
-      if (SOURCE_STR STREQUAL TARGET_STR)
-        set(copy_flag 0)
-      endif()
-    endif()
-    if (copy_flag)
-      add_custom_command(OUTPUT .${src}.cu
-        COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu
-        COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu"
-        COMMENT "create hidden file of ${src}.cu")
-    endif(copy_flag)
-    add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
+    file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc)
+
+    add_custom_command(OUTPUT ${final_path}/.${src}.cu
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu"
+      COMMENT "create hidden file of ${src}.cu")
+    add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
   endforeach()
 endfunction()
@@ -37,9 +27,10 @@ add_subdirectory(details)
 proto_library(framework_proto SRCS framework.proto)
 proto_library(async_executor_proto SRCS data_feed.proto)
 
-cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
+cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
+cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
 cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
 cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
 if(WITH_GPU)
@@ -78,17 +69,23 @@ cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memor
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
 cc_test(reader_test SRCS reader_test.cc DEPS reader)
 
-cc_test(variable_test SRCS variable_test.cc)
-
 cc_library(threadpool SRCS threadpool.cc DEPS enforce)
 cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
 
-cc_library(scope SRCS scope.cc DEPS glog threadpool)
+cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto)
+if (WITH_GPU)
+  target_link_libraries(var_type_traits dynload_cuda)
+endif()
+cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits)
+
+cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits)
+cc_library(scope_pool SRCS scope_pool.cc DEPS scope)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)
+cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits)
 
 cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
 nv_test(data_device_transform_test SRCS data_device_transform_test.cu
-        DEPS operator op_registry device_context math_function)
+        DEPS operator op_registry device_context math_function scope)
 
 if(WITH_GPU)
   if (WIN32)
@@ -133,11 +130,9 @@ cc_test(version_test SRCS version_test.cc DEPS version)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
 
 if(WITH_NGRAPH)
-  if(NOT WIN32)
-    cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
-    cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
-      shape_inference data_transform lod_tensor profiler ngraph)
-  endif(NOT WIN32)
+  cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
+  cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
+    shape_inference data_transform lod_tensor profiler)
 endif(WITH_NGRAPH)
 
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
@@ -179,11 +174,7 @@ if(WITH_DISTRIBUTE)
 else()
   if(WITH_NGRAPH)
-    if(NOT WIN32)
-      cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph ngraph_operator variable_helper)
-    else(NOT WIN32)
-      cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
-    endif(NOT WIN32)
+    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
   else(WITH_NGRAPH)
     cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
   endif(WITH_NGRAPH)
@@ -193,14 +184,14 @@ endif()
 target_link_libraries(executor garbage_collector)
 
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
-        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
+        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor
         graph build_strategy
         fast_threaded_ssa_graph_executor variable_helper)
 
 if(WITH_PSLIB)
-  cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib)
+  cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib timer)
 else()
-  cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper)
+  cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper timer)
 endif(WITH_PSLIB)

@@ -15,34 +15,123 @@
 #pragma once
 
 #include <cstdint>
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/fluid/framework/unroll_array_ops.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
 
 template <typename T, size_t N>
 class Array {
-  static_assert(N > 0, "The size of array must be larger than 0");
-
  public:
-  HOSTDEVICE Array() {}
+  static constexpr size_t kSize = N;
+
+  HOSTDEVICE inline Array() {}
 
-  HOSTDEVICE explicit Array(const T &val) {
-    for (size_t i = 0; i < N; ++i) data_[i] = val;
+  template <typename... Args>
+  HOSTDEVICE inline explicit Array(const T &val, Args... args) {
+    static_assert(N == sizeof...(Args) + 1, "Invalid argument");
+    UnrollVarArgsAssign<T>::Run(data_, val, args...);
+  }
+
+  HOSTDEVICE inline void Fill(const T &val) {
+    UnrollFillConstant<N>::Run(data_, val);
   }
 
-  HOSTDEVICE const T *Get() const { return data_; }
+  HOSTDEVICE inline const T *Get() const { return data_; }
 
-  HOSTDEVICE T *GetMutable() { return data_; }
+  HOSTDEVICE inline T *GetMutable() { return data_; }
 
-  HOSTDEVICE T &operator[](size_t index) { return data_[index]; }
+  HOSTDEVICE inline T &operator[](size_t i) { return *advance(data_, i); }
 
-  HOSTDEVICE const T &operator[](size_t index) const { return data_[index]; }
+  // Writing "return data_[i]" would cause compilation warning/error:
+  // "array subscript is above array bound" in Python 35 CI.
+  // It seems that it is a false warning of GCC if we do not check the bounds
+  // of array index. But for better performance, we do not check in operator[]
+  // like what is in STL. If users want to check the bounds, use at() instead
+  HOSTDEVICE inline const T &operator[](size_t i) const {
+    return *advance(data_, i);
+  }
+
+  HOSTDEVICE inline T &at(size_t i) {
+#ifndef __CUDA_ARCH__
+    PADDLE_ENFORCE_LT(i, N, "Array index out of bounds");
+#endif
+    return (*this)[i];
+  }
+
+  HOSTDEVICE inline const T &at(size_t i) const {
+#ifndef __CUDA_ARCH__
+    PADDLE_ENFORCE_LT(i, N, "Array index out of bounds");
+#endif
+    return (*this)[i];
+  }
 
   HOSTDEVICE constexpr size_t size() const { return N; }
 
+  HOSTDEVICE inline bool operator==(const Array<T, N> &other) const {
+    return UnrollCompare<N>::Run(data_, other.data_);
+  }
+
+  HOSTDEVICE inline bool operator!=(const Array<T, N> &other) const {
+    return !(*this == other);
+  }
+
  private:
+  template <typename U>
+  HOSTDEVICE static inline U *advance(U *ptr, size_t i) {
+    return ptr + i;
+  }
+
   T data_[N];
 };
 
+template <typename T>
+class Array<T, 0> {
+ public:
+  static constexpr size_t kSize = 0;
+
+  HOSTDEVICE inline Array() {}
+
+  HOSTDEVICE inline void Fill(const T &val) {}
+
+  HOSTDEVICE inline constexpr T *Get() const { return nullptr; }
+
+  // Add constexpr to GetMutable() cause warning in MAC
+  HOSTDEVICE inline T *GetMutable() { return nullptr; }
+
+  HOSTDEVICE inline T &operator[](size_t) {
+#ifdef __CUDA_ARCH__
+    static T obj();
+    return obj;
+#else
+    PADDLE_THROW("Array<T, 0> has no element");
+#endif
+  }
+
+  HOSTDEVICE inline const T &operator[](size_t) const {
+#ifdef __CUDA_ARCH__
+    static const T obj();
+    return obj;
+#else
+    PADDLE_THROW("Array<T, 0> has no element");
+#endif
+  }
+
+  HOSTDEVICE inline T &at(size_t i) { return (*this)[i]; }
+
+  HOSTDEVICE inline const T &at(size_t i) const { return (*this)[i]; }
+
+  HOSTDEVICE constexpr size_t size() const { return 0; }
+
+  HOSTDEVICE constexpr bool operator==(const Array<T, 0> &other) const {
+    return true;
+  }
+
+  HOSTDEVICE constexpr bool operator!=(const Array<T, 0> &other) const {
+    return false;
+  }
+};
+
 }  // namespace framework
 }  // namespace paddle
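
A usage sketch for the rewritten Array (not part of the patch): it assumes Paddle's include path and a host-only build, so HOSTDEVICE expands to nothing and at() performs the PADDLE_ENFORCE_LT bounds check shown above.

// array_demo.cc -- illustrative usage sketch.
#include <cstdio>
#include "paddle/fluid/framework/array.h"

int main() {
  // The variadic constructor statically requires exactly N initializers.
  paddle::framework::Array<int, 3> a(1, 2, 3);
  paddle::framework::Array<int, 3> b(1, 2, 3);
  std::printf("a == b: %d\n", static_cast<int>(a == b));  // element-wise: 1

  a.Fill(7);  // overwrite all N elements via UnrollFillConstant
  std::printf("a[0]=%d a.at(2)=%d\n", a[0], a.at(2));  // unchecked vs checked
  std::printf("a != b: %d\n", static_cast<int>(a != b));  // 1 after Fill
  return 0;
}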

@@ -304,8 +304,13 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
 
   // start executing ops in multiple threads
   for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
-    threads.push_back(
-        std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
+    if (debug) {
+      threads.push_back(std::thread(&ExecutorThreadWorker::TrainFilesWithTimer,
+                                    workers[thidx].get()));
+    } else {
+      threads.push_back(
+          std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
+    }
   }
 
   for (auto& th : threads) {
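
The debug branch above merely swaps which member function each worker thread executes. Reduced to a self-contained sketch (the Worker type and its two methods are hypothetical stand-ins for ExecutorThreadWorker):

// worker_threads_demo.cc -- illustrative sketch, not from the patch.
#include <cstdio>
#include <memory>
#include <thread>
#include <vector>

struct Worker {
  void TrainFiles() { std::puts("train"); }
  void TrainFilesWithTimer() { std::puts("train + timing"); }
};

int main() {
  bool debug = true;
  std::vector<std::unique_ptr<Worker>> workers;
  workers.emplace_back(new Worker);
  workers.emplace_back(new Worker);

  std::vector<std::thread> threads;
  for (auto& w : workers) {
    // Pointer-to-member plus object pointer, the same std::thread pattern
    // used in the hunk above.
    auto fn = debug ? &Worker::TrainFilesWithTimer : &Worker::TrainFiles;
    threads.emplace_back(fn, w.get());
  }
  for (auto& th : threads) th.join();
  return 0;
}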

@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"

(File diff suppressed because it is too large.)

@@ -18,62 +18,145 @@ limitations under the License. */
 #include <stdexcept>
 #include <vector>
 
 #include "paddle/fluid/framework/dim.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/variant.h"
 
 namespace paddle {
 namespace framework {
 
+#define PADDLE_VISIT_DDIM_BASE(rank, callback) \
+  case (rank): {                               \
+    constexpr auto kRank = (rank);             \
+    return (callback);                         \
+  }
+
+#define PADDLE_VISIT_DDIM(rank, callback)    \
+  switch (rank) {                            \
+    PADDLE_VISIT_DDIM_BASE(0, callback);     \
+    PADDLE_VISIT_DDIM_BASE(1, callback);     \
+    PADDLE_VISIT_DDIM_BASE(2, callback);     \
+    PADDLE_VISIT_DDIM_BASE(3, callback);     \
+    PADDLE_VISIT_DDIM_BASE(4, callback);     \
+    PADDLE_VISIT_DDIM_BASE(5, callback);     \
+    PADDLE_VISIT_DDIM_BASE(6, callback);     \
+    PADDLE_VISIT_DDIM_BASE(7, callback);     \
+    PADDLE_VISIT_DDIM_BASE(8, callback);     \
+    PADDLE_VISIT_DDIM_BASE(9, callback);     \
+    default:                                 \
+      PADDLE_THROW("Invalid rank %d", rank); \
+  }
+
+template <typename T1, typename T2>
+inline void dynamic_dim_assign(const T1* in, T2* out, int n) {
+  PADDLE_VISIT_DDIM(n, (static_dim_assign<kRank, T1, T2>(in, out)));
+}
+
 /**
  * \brief A dynamically sized dimension.
  *
  * The number of dimensions must be between [1, 9].
  */
-struct DDim {
-  typedef boost::variant<Dim<0>, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>,
-                         Dim<7>, Dim<8>, Dim<9>>
-      DDimVar;
-  DDimVar var;
+class DDim {
+ public:
+  constexpr static int kMaxRank = 9;
 
-  DDim() : var(Dim<1>()) {}
+  DDim() : rank_(1) { dim_[0] = 0; }
+
+  DDim(const DDim& ddim) : dim_() { CopyFrom(ddim); }
+
+  DDim(const int* d, int n) : rank_(n) {
+    dynamic_dim_assign(d, dim_.GetMutable(), n);
+  }
+
+  DDim(const int64_t* d, int n) : rank_(n) {
+    dynamic_dim_assign(d, dim_.GetMutable(), n);
+  }
 
   template <int D>
-  explicit DDim(const Dim<D>& in) : var(in) {}
+  /*implicit*/ DDim(const Dim<D>& in) : rank_(D) {  // NOLINT
+    UnsafeCast<D>() = in;
+  }
+
+  /*implicit*/ DDim(std::initializer_list<int64_t> init_list)
+      : DDim(init_list.begin(), init_list.size()) {}
 
-  /*implicit*/ DDim(std::initializer_list<int64_t> init_list);
+  inline DDim& operator=(const DDim& ddim) { return CopyFrom(ddim); }
 
   template <int D>
-  DDim& operator=(const Dim<D>& in) {
-    var = in;
+  inline DDim& operator=(const Dim<D>& dim) {
+    rank_ = D;
+    UnsafeCast<D>() = dim;
     return *this;
   }
 
-  int64_t& operator[](int idx);
-  int64_t operator[](int idx) const;
+  inline int64_t& operator[](int idx) { return dim_[idx]; }
+
+  inline int64_t operator[](int idx) const { return dim_[idx]; }
+
+  inline int64_t& at(int idx) {
+    PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx);
+    return dim_[idx];
+  }
+
+  inline int64_t at(int idx) const {
+    PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx);
+    return dim_[idx];
+  }
 
   template <typename Visitor>
-  typename Visitor::result_type apply_visitor(Visitor& visitor) {
-    return var.apply_visitor(visitor);
+  typename std::result_of<Visitor(Dim<0>&)>::type apply_visitor(
+      Visitor&& visitor) {
+    PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
   }
 
   template <typename Visitor>
-  typename Visitor::result_type apply_visitor(Visitor& visitor) const {
-    return var.apply_visitor(visitor);
+  typename std::result_of<Visitor(const Dim<0>&)>::type apply_visitor(
+      Visitor&& visitor) const {
+    PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
   }
 
-  DDimVar getVar() { return var; }
+  bool operator==(const DDim& d) const;
+  bool operator!=(const DDim& d) const;
+  DDim operator+(const DDim& d) const;
+  DDim operator*(const DDim& d) const;
 
-  bool operator==(DDim d) const;
-  bool operator!=(DDim d) const;
-  DDim operator+(DDim d) const;
-  DDim operator*(DDim d) const;
+  inline const int64_t* Get() const { return dim_.Get(); }
+
+  inline int64_t* GetMutable() { return dim_.GetMutable(); }
+
+  inline int size() const { return rank_; }
+
+ private:
+  template <int D>
+  inline Dim<D>& UnsafeCast() {
+    static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
+    auto* p = static_cast<void*>(&dim_);
+    return *reinterpret_cast<Dim<D>*>(p);
+  }
+
+  template <int D>
+  inline const Dim<D>& UnsafeCast() const {
+    static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
+    auto* p = static_cast<const void*>(&dim_);
+    return *reinterpret_cast<const Dim<D>*>(p);
+  }
 
-  int size() const;
+  inline DDim& CopyFrom(const DDim& ddim) {
+    PADDLE_VISIT_DDIM(ddim.rank_, (*this = ddim.UnsafeCast<kRank>()));
+  }
+
+  friend DDim stride(const DDim& ddim);
+  friend DDim stride_numel(const DDim& ddim);
+
+ private:
+  Dim<kMaxRank> dim_;
+  int rank_;
 };
 
+#undef PADDLE_VISIT_DDIM_BASE
+#undef PADDLE_VISIT_DDIM
+
 /**
  * \brief Make a DDim from std::vector<int64_t>
 *
@@ -92,7 +175,7 @@ DDim make_ddim(const std::vector<int>& dims);
 DDim make_ddim(std::initializer_list<int64_t> dims);
 
 int64_t get(const DDim& dim, int idx);
-void set(DDim& dim, int idx, int val);
+void set(DDim& dim, int idx, int val);  // NOLINT
 
 std::vector<int64_t> vectorize(const DDim& ddim);
 std::vector<int> vectorize2int(const DDim& ddim);
@@ -129,12 +212,3 @@ DDim stride(const DDim& ddim);
 DDim stride_numel(const DDim& ddim);
 
 }  // namespace framework
 }  // namespace paddle
-
-namespace boost {
-
-template <typename T>
-T get(const paddle::framework::DDim& in) {
-  return boost::get<T>(in.var);
-}
-
-}  // namespace boost
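
A small sketch of the new visitor API (not from the patch): apply_visitor now dispatches on the stored rank through the PADDLE_VISIT_DDIM switch instead of boost::variant. RankPrinter here is illustrative; make_ddim and apply_visitor are declared in the header above.

// ddim_visitor_demo.cc -- illustrative sketch.
#include <cstdio>
#include "paddle/fluid/framework/ddim.h"

// A generic callable over every static rank D.
struct RankPrinter {
  template <int D>
  void operator()(const paddle::framework::Dim<D>& dim) const {
    std::printf("static rank: %d\n", D);
  }
};

int main() {
  paddle::framework::DDim d = paddle::framework::make_ddim({2, 3, 4});
  d.apply_visitor(RankPrinter{});  // dispatches to operator()<3>
  std::printf("d[1] = %lld\n", static_cast<long long>(d[1]));
  return 0;
}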

@@ -77,6 +77,8 @@ cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUT
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
         simple_threadpool device_context)
 
+cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor)
+
 cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
         device_context broadcast_op_handle)
 cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
@@ -19,6 +19,13 @@
 #include "paddle/fluid/framework/details/variable_visitor.h"
 #include "paddle/fluid/platform/profiler.h"
 
+// asynchronous nccl allreduce or synchronous issue:
+// https://github.com/PaddlePaddle/Paddle/issues/15049
+DEFINE_bool(
+    sync_nccl_allreduce, false,
+    "If set true, will call `cudaStreamSynchronize(nccl_stream)`"
+    "after allreduce, this mode can get better performance in some scenarios.");
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -48,100 +55,104 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 void AllReduceOpHandle::RunImpl() {
   platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
 
-  // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR,
-  // this is a distributed or inter-process call, find a better way.
-#ifdef PADDLE_WITH_CUDA
-  if (NoDummyInputSize() == 1 &&
-      local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) {
-#else
-  if (NoDummyInputSize() == 1) {
-#endif
-    return;  // No need to all reduce when GPU count = 1;
-  } else {
-    // Wait input done
-    WaitInputVarGenerated();
+  WaitInputVarGenerated();
   auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
   auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), places_.size(),
       "The NoDummyInputSize should be equal to the number of places.");
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), out_var_handles.size(),
       "The NoDummyInputSize and NoDummyOutputSize should be equal.");
 
   std::vector<const LoDTensor *> lod_tensors;
   for (size_t i = 0; i < local_scopes_.size(); ++i) {
     auto *s = local_scopes_[i];
     auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
     auto &lod_tensor =
         local_scope.FindVar(in_var_handles[i]->name_)->Get<LoDTensor>();
     lod_tensors.emplace_back(&lod_tensor);
     PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
                       "The name of input and output should be equal.");
   }
 
   if (platform::is_gpu_place(lod_tensors[0]->place())) {
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
     PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
     int dtype = -1;
     size_t numel = 0;
     std::vector<std::function<void()>> all_reduce_calls;
     for (size_t i = 0; i < local_scopes_.size(); ++i) {
       auto &p = places_[i];
       auto &lod_tensor = *lod_tensors[i];
       void *buffer = const_cast<void *>(lod_tensor.data<void>());
 
       if (dtype == -1) {
         dtype = platform::ToNCCLDataType(lod_tensor.type());
       }
 
       if (numel == 0) {
         numel = static_cast<size_t>(lod_tensor.numel());
       }
 
       int dev_id = boost::get<platform::CUDAPlace>(p).device;
       auto &nccl_ctx = nccl_ctxs_->at(dev_id);
       auto stream = nccl_ctx.stream();
       auto comm = nccl_ctx.comm_;
       all_reduce_calls.emplace_back([=] {
         PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
             buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
             comm, stream));
       });
     }
 
     this->RunAndRecordEvent([&] {
+      if (all_reduce_calls.size() == 1UL) {
+        // Do not use NCCLGroup when manage NCCL by per thread per device
+        all_reduce_calls[0]();
+      } else {
         platform::NCCLGroupGuard guard;
         for (auto &call : all_reduce_calls) {
           call();
         }
+      }
     });
+
+    if (FLAGS_sync_nccl_allreduce) {
+      for (auto &p : places_) {
+        int dev_id = boost::get<platform::CUDAPlace>(p).device;
+        auto &nccl_ctx = nccl_ctxs_->at(dev_id);
+        auto stream = nccl_ctx.stream();
+        cudaStreamSynchronize(stream);
+      }
+    }
 #else
     PADDLE_THROW("Not compiled with CUDA");
 #endif
   } else {  // Special handle CPU only Operator's gradient. Like CRF
     auto &trg = *this->local_scopes_[0]
                      ->FindVar(kLocalExecScopeName)
                      ->Get<Scope *>()
                      ->FindVar(out_var_handles[0]->name_)
                      ->GetMutable<framework::LoDTensor>();
 
     // Reduce All Tensor to trg in CPU
     ReduceLoDTensor func(lod_tensors, &trg);
     VisitDataType(lod_tensors[0]->type(), func);
 
     for (size_t i = 1; i < local_scopes_.size(); ++i) {
       auto &scope =
           *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
       auto &p = places_[i];
       auto *var = scope.FindVar(out_var_handles[i]->name_);
       auto *dev_ctx = dev_ctxes_.at(p);
 
       RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
         auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
         auto &tensor_cpu = trg;
         TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu);
       });
     }
   }
-  }
 }
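
The FLAGS_sync_nccl_allreduce variable used above comes from the DEFINE_bool in the earlier hunk. Reduced to a standalone gflags sketch (the flag name is reused for illustration; the namespace may be google:: in older gflags releases):

// sync_flag_demo.cc -- illustrative sketch, not from the patch.
#include <cstdio>
#include <gflags/gflags.h>

// DEFINE_bool creates the global FLAGS_sync_nccl_allreduce,
// settable at startup, e.g. ./binary --sync_nccl_allreduce=true
DEFINE_bool(sync_nccl_allreduce, false,
            "If set true, synchronize the NCCL stream after allreduce.");

int main(int argc, char* argv[]) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  // Runtime behavior branches on the flag, as RunImpl does above.
  std::printf("sync_nccl_allreduce = %s\n",
              FLAGS_sync_nccl_allreduce ? "true" : "false");
  return 0;
}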

(Some files were not shown because too many files have changed in this diff.)