Merge pull request #3 from PaddlePaddle/develop

merge to dev
Committed by lujun via GitHub, 6 years ago
commit 7324ffaec7

@@ -94,52 +94,52 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
 # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
 # version(1.7.1 for now), which causes building documentation failed.
-RUN pip3 install -U wheel && \
-pip3 install -U docopt PyYAML sphinx==1.5.6 && \
-pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
-pip3.6 install -U wheel && \
-pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \
-pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \
-pip3.7 install -U wheel && \
-pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \
-pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \
+RUN pip3 --no-cache-dir install -U wheel && \
+pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
+pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
+pip3.6 --no-cache-dir install -U wheel && \
+pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
+pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
+pip3.7 --no-cache-dir install -U wheel && \
+pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
+pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
 easy_install -U pip && \
-pip install -U pip setuptools wheel && \
-pip install -U docopt PyYAML sphinx==1.5.6 && \
-pip install sphinx-rtd-theme==0.1.9 recommonmark
-RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-pip3 install opencv-python && \
-pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-pip3.6 install opencv-python && \
-pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-pip3.7 install opencv-python && \
-pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-pip install opencv-python
+pip --no-cache-dir install -U pip setuptools wheel && \
+pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
+pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark
+RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+pip3 --no-cache-dir install opencv-python && \
+pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+pip3.6 --no-cache-dir install opencv-python && \
+pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+pip3.7 --no-cache-dir install opencv-python && \
+pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+pip --no-cache-dir install opencv-python
 #For docstring checker
-RUN pip3 install pylint pytest astroid isort
-RUN pip3.6 install pylint pytest astroid isort
-RUN pip3.7 install pylint pytest astroid isort
-RUN pip install pylint pytest astroid isort LinkChecker
+RUN pip3 --no-cache-dir install pylint pytest astroid isort
+RUN pip3.6 --no-cache-dir install pylint pytest astroid isort
+RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
+RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker
 COPY ./python/requirements.txt /root/
-RUN pip3 install -r /root/requirements.txt
-RUN pip3.6 install -r /root/requirements.txt
-RUN pip3.7 install -r /root/requirements.txt
-RUN pip install -r /root/requirements.txt
+RUN pip3 --no-cache-dir install -r /root/requirements.txt
+RUN pip3.6 --no-cache-dir install -r /root/requirements.txt
+RUN pip3.7 --no-cache-dir install -r /root/requirements.txt
+RUN pip --no-cache-dir install -r /root/requirements.txt
 # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
-RUN apt-get install -y libssl-dev libffi-dev
-RUN pip3 install certifi urllib3[secure]
-RUN pip3.6 install certifi urllib3[secure]
-RUN pip3.7 install certifi urllib3[secure]
-RUN pip install certifi urllib3[secure]
+RUN apt-get install -y libssl-dev libffi-dev && apt-get clean -y
+RUN pip3 --no-cache-dir install certifi urllib3[secure]
+RUN pip3.6 --no-cache-dir install certifi urllib3[secure]
+RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
+RUN pip --no-cache-dir install certifi urllib3[secure]
 # Install woboq_codebrowser to /woboq
@@ -149,6 +149,14 @@ RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
 -DCMAKE_BUILD_TYPE=Release . \
 make)
+# ar mishandles 4GB files
+# https://sourceware.org/bugzilla/show_bug.cgi?id=14625
+# remove them when apt-get support 2.27 and higher version
+RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/2.27-9ubuntu1/binutils_2.27.orig.tar.gz && \
+tar -xzf binutils_2.27.orig.tar.gz && \
+cd binutils-2.27 && \
+./configure && make -j && make install && cd .. && rm -rf binutils-2.27 binutils_2.27.orig.tar.gz
 # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
 RUN mkdir /var/run/sshd
 RUN echo 'root:root' | chpasswd

@@ -139,10 +139,12 @@ endfunction()
 message(STATUS "CUDA detected: " ${CUDA_VERSION})
 if (${CUDA_VERSION} LESS 7.0)
 set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
+add_definitions("-DPADDLE_CUDA_BINVER=\"60\"")
 elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
 set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
 list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
 list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+add_definitions("-DPADDLE_CUDA_BINVER=\"70\"")
 elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
 set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
 list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
@@ -150,6 +152,7 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
 # CUDA 8 may complain that sm_20 is no longer supported. Suppress the
 # warning for now.
 list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
+add_definitions("-DPADDLE_CUDA_BINVER=\"80\"")
 endif()
 include_directories(${CUDA_INCLUDE_DIRS})

@@ -89,6 +89,7 @@ if(CUDNN_FOUND)
 if(NOT CUDNN_MAJOR_VERSION)
 set(CUDNN_VERSION "???")
 else()
+add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"")
 math(EXPR CUDNN_VERSION
 "${CUDNN_MAJOR_VERSION} * 1000 +
 ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")

@@ -32,4 +32,4 @@ endif()
 add_dependencies(cub extern_cub)
-LIST(APPEND externl_project_dependencies cub)
+LIST(APPEND external_project_dependencies cub)

@@ -28,4 +28,4 @@ endif()
 add_dependencies(dlpack extern_dlpack)
-LIST(APPEND externl_project_dependencies dlpack)
+LIST(APPEND external_project_dependencies dlpack)

@@ -106,10 +106,10 @@ else(WIN32)
 SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
 ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
 COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
-DEPENDS mkldnn)
+DEPENDS mkldnn shared_mkldnn)
 endif(WIN32)
 ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})
+ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn)
 IF(WITH_C_API)
 INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib)
 ENDIF()

@@ -37,13 +37,12 @@ INCLUDE(GNUInstallDirs)
 INCLUDE(ExternalProject)
 SET(NGRAPH_PROJECT "extern_ngraph")
-SET(NGRAPH_VERSION "0.9")
-SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0")
+SET(NGRAPH_GIT_TAG "v0.10.1")
 SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
 SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
 SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)
 SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
-SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION})
+SET(NGRAPH_SHARED_LIB_NAME libngraph.so)
 SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so)
 SET(NGRAPH_TBB_LIB_NAME libtbb.so.2)
 SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git")

@@ -136,7 +136,7 @@ if (WITH_MKLDNN)
 copy(mkldnn_lib
 SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
 DSTS ${dst_dir} ${dst_dir}/lib
-DEPS mkldnn
+DEPS mkldnn_shared_lib
 )
 endif ()

@@ -110,7 +110,7 @@ function(op_library TARGET)
 # Define operators that don't need pybind here.
 foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
 "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
-"fusion_transpose_flatten_concat_op")
+"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op")
 if ("${TARGET}" STREQUAL "${manual_pybind_op}")
 set(pybind_flag 1)
 endif()

@@ -57,46 +57,43 @@ int main()
 return 0;
 }" SSE3_FOUND)
-# disable AVX by default on windows
-if(NOT WIN32)
 # Check AVX
 set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
 set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
 {
 __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
 __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
 __m256 result = _mm256_add_ps (a, b);
 return 0;
 }" AVX_FOUND)
 # Check AVX 2
 set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
 set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
 {
 __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
 __m256i result = _mm256_abs_epi32 (a);
 return 0;
 }" AVX2_FOUND)
 # Check AVX512F
 set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
 set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
 {
 __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
 13, -5, 6, -7, 9, 2, -6, 3);
 __m512i result = _mm512_abs_epi32 (a);
 return 0;
 }" AVX512F_FOUND)
-endif(NOT WIN32)
 set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
 mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)

@@ -60,7 +60,7 @@ class Float16Transpiler:
             raise TypeError("place should be as CPUPlace/CUDAPlace type")
         if scope is None:
             scope = global_scope()
-        if not isinstance(scope, core.Scope):
+        if not isinstance(scope, core._Scope):
             raise TypeError("scope should be as Scope type or None")
         self.scope = scope

@@ -464,11 +464,7 @@ paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, ke
 paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
 paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
-paddle.fluid.Scope.__init__ __init__(self: paddle.fluid.core.Scope) -> None
-paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None
-paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
-paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope
-paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
+paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope
 paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None)
 paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None)
 paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None)

@@ -7,27 +7,17 @@ function(windows_symbolic TARGET)
 cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
 foreach(src ${windows_symbolic_SRCS})
 get_filename_component(src ${src} NAME_WE)
-if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu)
+if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu)
 message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.")
 endif()
-#only copy the xx.cu to.xx.cu when the content are modified
-set(copy_flag 1)
-if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu)
-file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR)
-file(READ ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu TARGET_STR)
-if (SOURCE_STR STREQUAL TARGET_STR)
-set(copy_flag 0)
-endif()
-endif()
-if (copy_flag)
-add_custom_command(OUTPUT .${src}.cu
-COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu
-COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu"
-COMMENT "create hidden file of ${src}.cu")
-endif(copy_flag)
-add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
+file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc)
+add_custom_command(OUTPUT ${final_path}/.${src}.cu
+COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu"
+COMMENT "create hidden file of ${src}.cu")
+add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
 endforeach()
 endfunction()
@@ -37,9 +27,10 @@ add_subdirectory(details)
 proto_library(framework_proto SRCS framework.proto)
 proto_library(async_executor_proto SRCS data_feed.proto)
-cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
+cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
+cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
 cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
 cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
 if(WITH_GPU)
@@ -78,17 +69,23 @@ cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memor
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
 cc_test(reader_test SRCS reader_test.cc DEPS reader)
-cc_test(variable_test SRCS variable_test.cc)
 cc_library(threadpool SRCS threadpool.cc DEPS enforce)
 cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
-cc_library(scope SRCS scope.cc DEPS glog threadpool)
+cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto)
+if (WITH_GPU)
+target_link_libraries(var_type_traits dynload_cuda)
+endif()
+cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits)
+cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits)
+cc_library(scope_pool SRCS scope_pool.cc DEPS scope)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)
+cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits)
 cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
 nv_test(data_device_transform_test SRCS data_device_transform_test.cu
-DEPS operator op_registry device_context math_function)
+DEPS operator op_registry device_context math_function scope)
 if(WITH_GPU)
 if (WIN32)
@@ -133,11 +130,9 @@ cc_test(version_test SRCS version_test.cc DEPS version)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
 if(WITH_NGRAPH)
-if(NOT WIN32)
-cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
-cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
-shape_inference data_transform lod_tensor profiler ngraph)
-endif(NOT WIN32)
+cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
+cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
+shape_inference data_transform lod_tensor profiler)
 endif(WITH_NGRAPH)
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
@@ -179,11 +174,7 @@ if(WITH_DISTRIBUTE)
 else()
 if(WITH_NGRAPH)
-if(NOT WIN32)
-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph ngraph_operator variable_helper)
-else(NOT WIN32)
-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
-endif(NOT WIN32)
+cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
 else(WITH_NGRAPH)
 cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
 endif(WITH_NGRAPH)
@@ -198,9 +189,9 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS
 fast_threaded_ssa_graph_executor variable_helper)
 if(WITH_PSLIB)
-cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib)
+cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib timer)
 else()
-cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper)
+cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper timer)
 endif(WITH_PSLIB)

@@ -15,34 +15,123 @@
 #pragma once
 #include <cstdint>
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/fluid/framework/unroll_array_ops.h"
+#include "paddle/fluid/platform/enforce.h"
 namespace paddle {
 namespace framework {
 template <typename T, size_t N>
 class Array {
-  static_assert(N > 0, "The size of array must be larger than 0");
 public:
-  HOSTDEVICE Array() {}
-  HOSTDEVICE explicit Array(const T &val) {
-    for (size_t i = 0; i < N; ++i) data_[i] = val;
+  static constexpr size_t kSize = N;
+  HOSTDEVICE inline Array() {}
+  template <typename... Args>
+  HOSTDEVICE inline explicit Array(const T &val, Args... args) {
+    static_assert(N == sizeof...(Args) + 1, "Invalid argument");
+    UnrollVarArgsAssign<T>::Run(data_, val, args...);
   }
-  HOSTDEVICE const T *Get() const { return data_; }
-  HOSTDEVICE T *GetMutable() { return data_; }
-  HOSTDEVICE T &operator[](size_t index) { return data_[index]; }
-  HOSTDEVICE const T &operator[](size_t index) const { return data_[index]; }
+  HOSTDEVICE inline void Fill(const T &val) {
+    UnrollFillConstant<N>::Run(data_, val);
+  }
+  HOSTDEVICE inline const T *Get() const { return data_; }
+  HOSTDEVICE inline T *GetMutable() { return data_; }
+  HOSTDEVICE inline T &operator[](size_t i) { return *advance(data_, i); }
+  // Writing "return data_[i]" would cause compilation warning/error:
+  // "array subscript is above array bound" in Python 35 CI.
+  // It seems that it is a false warning of GCC if we do not check the bounds
+  // of array index. But for better performance, we do not check in operator[]
+  // like what is in STL. If users want to check the bounds, use at() instead
+  HOSTDEVICE inline const T &operator[](size_t i) const {
+    return *advance(data_, i);
+  }
+  HOSTDEVICE inline T &at(size_t i) {
+#ifndef __CUDA_ARCH__
+    PADDLE_ENFORCE_LT(i, N, "Array index out of bounds");
+#endif
+    return (*this)[i];
+  }
+  HOSTDEVICE inline const T &at(size_t i) const {
+#ifndef __CUDA_ARCH__
+    PADDLE_ENFORCE_LT(i, N, "Array index out of bounds");
+#endif
+    return (*this)[i];
+  }
   HOSTDEVICE constexpr size_t size() const { return N; }
+  HOSTDEVICE inline bool operator==(const Array<T, N> &other) const {
+    return UnrollCompare<N>::Run(data_, other.data_);
+  }
+  HOSTDEVICE inline bool operator!=(const Array<T, N> &other) const {
+    return !(*this == other);
+  }
 private:
+  template <typename U>
+  HOSTDEVICE static inline U *advance(U *ptr, size_t i) {
+    return ptr + i;
+  }
   T data_[N];
 };
+template <typename T>
+class Array<T, 0> {
+ public:
+  static constexpr size_t kSize = 0;
+  HOSTDEVICE inline Array() {}
+  HOSTDEVICE inline void Fill(const T &val) {}
+  HOSTDEVICE inline constexpr T *Get() const { return nullptr; }
+  // Add constexpr to GetMutable() cause warning in MAC
+  HOSTDEVICE inline T *GetMutable() { return nullptr; }
+  HOSTDEVICE inline T &operator[](size_t) {
+#ifdef __CUDA_ARCH__
+    static T obj();
+    return obj;
+#else
+    PADDLE_THROW("Array<T, 0> has no element");
+#endif
+  }
+  HOSTDEVICE inline const T &operator[](size_t) const {
+#ifdef __CUDA_ARCH__
+    static const T obj();
+    return obj;
+#else
+    PADDLE_THROW("Array<T, 0> has no element");
+#endif
+  }
+  HOSTDEVICE inline T &at(size_t i) { return (*this)[i]; }
+  HOSTDEVICE inline const T &at(size_t i) const { return (*this)[i]; }
+  HOSTDEVICE constexpr size_t size() const { return 0; }
+  HOSTDEVICE constexpr bool operator==(const Array<T, 0> &other) const {
+    return true;
+  }
+  HOSTDEVICE constexpr bool operator!=(const Array<T, 0> &other) const {
+    return false;
+  }
+};
 } // namespace framework
 } // namespace paddle
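
For orientation, a minimal host-side usage sketch of the Array template above follows. It assumes the header shown here is paddle/fluid/framework/array.h (the hunk does not name the file); the snippet is illustrative, not part of the change.

    // Hedged usage sketch for the refactored framework::Array; not code from this PR.
    #include <iostream>
    #include "paddle/fluid/framework/array.h"  // assumed header path

    int main() {
      // The variadic constructor must receive exactly N values (checked by static_assert).
      paddle::framework::Array<int, 3> a(1, 2, 3);
      a.Fill(7);                      // overwrite every element with 7
      a[0] = 42;                      // operator[] is unchecked, like a raw array
      std::cout << a.at(1) << "\n";   // at() bounds-checks on the host via PADDLE_ENFORCE_LT
      paddle::framework::Array<int, 3> b(42, 7, 7);
      std::cout << (a == b) << "\n";  // element-wise comparison, prints 1
      return 0;
    }

The zero-length specialization offers the same interface, except that any element access throws on the host side via PADDLE_THROW.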

@@ -304,8 +304,13 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
   // start executing ops in multiple threads
   for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
-    threads.push_back(
-        std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
+    if (debug) {
+      threads.push_back(std::thread(&ExecutorThreadWorker::TrainFilesWithTimer,
+                                    workers[thidx].get()));
+    } else {
+      threads.push_back(
+          std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
+    }
   }
   for (auto& th : threads) {

@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"

File diff suppressed because it is too large

@@ -18,62 +18,145 @@ limitations under the License. */
 #include <stdexcept>
 #include <vector>
 #include "paddle/fluid/framework/dim.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/variant.h"
 namespace paddle {
 namespace framework {
+#define PADDLE_VISIT_DDIM_BASE(rank, callback) \
+  case (rank): { \
+    constexpr auto kRank = (rank); \
+    return (callback); \
+  }
+#define PADDLE_VISIT_DDIM(rank, callback) \
+  switch (rank) { \
+    PADDLE_VISIT_DDIM_BASE(0, callback); \
+    PADDLE_VISIT_DDIM_BASE(1, callback); \
+    PADDLE_VISIT_DDIM_BASE(2, callback); \
+    PADDLE_VISIT_DDIM_BASE(3, callback); \
+    PADDLE_VISIT_DDIM_BASE(4, callback); \
+    PADDLE_VISIT_DDIM_BASE(5, callback); \
+    PADDLE_VISIT_DDIM_BASE(6, callback); \
+    PADDLE_VISIT_DDIM_BASE(7, callback); \
+    PADDLE_VISIT_DDIM_BASE(8, callback); \
+    PADDLE_VISIT_DDIM_BASE(9, callback); \
+    default: \
+      PADDLE_THROW("Invalid rank %d", rank); \
+  }
+template <typename T1, typename T2>
+inline void dynamic_dim_assign(const T1* in, T2* out, int n) {
+  PADDLE_VISIT_DDIM(n, (static_dim_assign<kRank, T1, T2>(in, out)));
+}
 /**
  * \brief A dynamically sized dimension.
  *
  * The number of dimensions must be between [1, 9].
  */
-struct DDim {
-  typedef boost::variant<Dim<0>, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>,
-                         Dim<7>, Dim<8>, Dim<9>>
-      DDimVar;
-  DDimVar var;
-  DDim() : var(Dim<1>()) {}
+class DDim {
+ public:
+  constexpr static int kMaxRank = 9;
+  DDim() : rank_(1) { dim_[0] = 0; }
+  DDim(const DDim& ddim) : dim_() { CopyFrom(ddim); }
+  DDim(const int* d, int n) : rank_(n) {
+    dynamic_dim_assign(d, dim_.GetMutable(), n);
+  }
+  DDim(const int64_t* d, int n) : rank_(n) {
+    dynamic_dim_assign(d, dim_.GetMutable(), n);
+  }
   template <int D>
-  explicit DDim(const Dim<D>& in) : var(in) {}
-  /*implicit*/ DDim(std::initializer_list<int64_t> init_list);
+  /*implicit*/ DDim(const Dim<D>& in) : rank_(D) {  // NOLINT
+    UnsafeCast<D>() = in;
+  }
+  /*implicit*/ DDim(std::initializer_list<int64_t> init_list)
+      : DDim(init_list.begin(), init_list.size()) {}
+  inline DDim& operator=(const DDim& ddim) { return CopyFrom(ddim); }
   template <int D>
-  DDim& operator=(const Dim<D>& in) {
-    var = in;
+  inline DDim& operator=(const Dim<D>& dim) {
+    rank_ = D;
+    UnsafeCast<D>() = dim;
     return *this;
   }
-  int64_t& operator[](int idx);
-  int64_t operator[](int idx) const;
+  inline int64_t& operator[](int idx) { return dim_[idx]; }
+  inline int64_t operator[](int idx) const { return dim_[idx]; }
+  inline int64_t& at(int idx) {
+    PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx);
+    return dim_[idx];
+  }
+  inline int64_t at(int idx) const {
+    PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx);
+    return dim_[idx];
+  }
   template <typename Visitor>
-  typename Visitor::result_type apply_visitor(Visitor& visitor) {
-    return var.apply_visitor(visitor);
+  typename std::result_of<Visitor(Dim<0>&)>::type apply_visitor(
+      Visitor&& visitor) {
+    PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
   }
   template <typename Visitor>
-  typename Visitor::result_type apply_visitor(Visitor& visitor) const {
-    return var.apply_visitor(visitor);
+  typename std::result_of<Visitor(const Dim<0>&)>::type apply_visitor(
+      Visitor&& visitor) const {
+    PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
   }
-  DDimVar getVar() { return var; }
-  bool operator==(DDim d) const;
-  bool operator!=(DDim d) const;
-  DDim operator+(DDim d) const;
-  DDim operator*(DDim d) const;
-  int size() const;
+  bool operator==(const DDim& d) const;
+  bool operator!=(const DDim& d) const;
+  DDim operator+(const DDim& d) const;
+  DDim operator*(const DDim& d) const;
+  inline const int64_t* Get() const { return dim_.Get(); }
+  inline int64_t* GetMutable() { return dim_.GetMutable(); }
+  inline int size() const { return rank_; }
+ private:
+  template <int D>
+  inline Dim<D>& UnsafeCast() {
+    static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
+    auto* p = static_cast<void*>(&dim_);
+    return *reinterpret_cast<Dim<D>*>(p);
+  }
+  template <int D>
+  inline const Dim<D>& UnsafeCast() const {
+    static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
+    auto* p = static_cast<const void*>(&dim_);
+    return *reinterpret_cast<const Dim<D>*>(p);
+  }
+  inline DDim& CopyFrom(const DDim& ddim) {
+    PADDLE_VISIT_DDIM(ddim.rank_, (*this = ddim.UnsafeCast<kRank>()));
+  }
+  friend DDim stride(const DDim& ddim);
+  friend DDim stride_numel(const DDim& ddim);
+ private:
+  Dim<kMaxRank> dim_;
+  int rank_;
 };
+#undef PADDLE_VISIT_DDIM_BASE
+#undef PADDLE_VISIT_DDIM
 /**
  * \brief Make a DDim from std::vector<int64_t>
  *
@@ -92,7 +175,7 @@ DDim make_ddim(const std::vector<int>& dims);
 DDim make_ddim(std::initializer_list<int64_t> dims);
 int64_t get(const DDim& dim, int idx);
-void set(DDim& dim, int idx, int val);
+void set(DDim& dim, int idx, int val);  // NOLINT
 std::vector<int64_t> vectorize(const DDim& ddim);
 std::vector<int> vectorize2int(const DDim& ddim);
@@ -129,12 +212,3 @@ DDim stride(const DDim& ddim);
 DDim stride_numel(const DDim& ddim);
 } // namespace framework
 } // namespace paddle
-namespace boost {
-template <typename T>
-T get(const paddle::framework::DDim& in) {
-  return boost::get<T>(in.var);
-}
-} // namespace boost
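
For orientation, a minimal usage sketch of the refactored DDim above follows. It assumes the class lives in paddle/fluid/framework/ddim.h and that make_ddim/vectorize behave as declared in this hunk; the snippet is illustrative, not part of the change.

    // Hedged usage sketch for the refactored framework::DDim; not code from this PR.
    #include <cstdint>
    #include <iostream>
    #include <vector>
    #include "paddle/fluid/framework/ddim.h"  // assumed header path

    int main() {
      namespace fw = paddle::framework;
      fw::DDim d = fw::make_ddim({2, 3, 4});      // rank 3, stored inline in a Dim<kMaxRank>
      std::cout << d.size() << "\n";              // prints 3 (the rank)
      std::cout << d[1] << "\n";                  // unchecked element access, prints 3
      std::cout << d.at(2) << "\n";               // rank-checked access, prints 4
      std::vector<int64_t> v = fw::vectorize(d);  // {2, 3, 4}
      fw::DDim d2 = d;                            // copy dispatches on the runtime rank
      std::cout << (d == d2) << "\n";             // element-wise comparison, prints 1
      return 0;
    }

Unlike the old boost::variant-based DDim, the rank is now a plain int and the elements live in a fixed Dim<9>, so callers that used boost::get<Dim<N>>(ddim.var) have to go through apply_visitor or the new accessors instead.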

@@ -50,7 +50,7 @@ void AllReduceOpHandle::RunImpl() {
 // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR,
 // this is a distributed or inter-process call, find a better way.
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 if (NoDummyInputSize() == 1 &&
 local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) {
 #else

@@ -88,7 +88,7 @@ void EagerDeletionOpHandle::RunImpl() {
 }
 } else {
 PADDLE_THROW("Type %s of %s is not supported eager deletion",
-var->Type().name(), name);
+framework::ToTypeName(var->Type()), name);
 }
 }

@@ -25,7 +25,7 @@ struct ExecutionStrategy {
 size_t num_threads_{0};
 bool use_cuda_{true};
 bool allow_op_delay_{false};
-size_t num_iteration_per_drop_scope_{100};
+size_t num_iteration_per_drop_scope_{1};
 ExecutorType type_{kDefault};
 bool dry_run_{false};
 };

@@ -120,6 +120,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
 ClearFetchOp(graph_.get(), &fetch_ops);
 return fetches;
 }
+
 void FastThreadedSSAGraphExecutor::RunOpAsync(
 std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
 OpHandleBase *op,

File diff suppressed because it is too large

@@ -45,7 +45,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
 #endif
 int GetVarDeviceID(
-const ir::Graph &graph, const std::string &varname,
+const std::string &varname,
 const std::unordered_map<std::string, int> &sharded_var_device) const;
 bool IsScaleLossOp(ir::Node *node) const;
@@ -57,12 +57,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
 ir::Graph *result, ir::Node *node,
 std::unordered_map<std::string, int> *sharded_var_device) const;
-std::vector<std::string> FindDistTrainSendVars(
-const std::vector<ir::Node *> &nodes) const;
-std::vector<std::string> FindDistTrainRecvVars(
-const std::vector<ir::Node *> &nodes) const;
 void CreateComputationalOps(ir::Graph *result, ir::Node *node,
 size_t num_places) const;
@@ -77,7 +71,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
 int dev_id) const;
 int GetOpDeviceID(
-const ir::Graph &graph, ir::Node *node,
+ir::Node *node,
 const std::unordered_map<std::string, int> &sharded_var_device) const;
 void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
@@ -100,6 +94,15 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
 void SetCommunicationContext(OpHandleBase *op_handle,
 const platform::Place &p) const;
+std::vector<ir::Node *> SortForReduceMode(
+const std::vector<ir::Node *> &) const;
+int GetOpDeviceID(
+ir::Node *node,
+const std::unordered_map<std::string, int> &shared_var_device,
+std::unordered_map<std::string, std::vector<ir::Node *>> *delay_ops)
+const;
 mutable std::string loss_var_name_;
 mutable std::vector<platform::Place> places_;
 mutable std::vector<Scope *> local_scopes_;

@@ -64,20 +64,26 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
   }
   platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
-  drop_scope_counter_ += 1;
-  if (!fetch_tensors.empty() ||
-      drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
-    drop_scope_counter_ = 0;
-    // Wait All computational streams
-    for (auto p : places_) {
-      platform::DeviceContextPool::Instance().Get(p)->Wait();
+  ++drop_scope_counter_;
+  bool stream_end = false;
+  if (!fetch_tensors.empty()) {
+    WaitComputationalStreams();
+    stream_end = true;
+  }
+  if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
+    if (!stream_end) {
+      WaitComputationalStreams();
     }
     for (auto &scope : local_scopes_) {
       auto &local_scope =
           *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
       scope->DeleteScope(local_scope);
     }
+    drop_scope_counter_ = 0;
   }
   if (eptr) {
     std::rethrow_exception(eptr);

Some files were not shown because too many files have changed in this diff
