Merge branch 'develop' into ir5

createGenDocLib
Xin Pan 7 years ago committed by GitHub
commit ec5cb3762c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -24,6 +24,9 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
"${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
if(WIN32)
set(CMAKE_STATIC_LIBRARY_PREFIX lib)
endif(WIN32)
if(NOT CMAKE_CROSSCOMPILING)
find_package(CUDA QUIET)
@ -138,12 +141,6 @@ else()
set(THIRD_PARTY_BUILD_TYPE Release)
endif()
if(WITH_MKL)
option(MKL_SPLIT_GEMM "PaddlePaddle MKL gemm would split to small ones" OFF)
if (MKL_SPLIT_GEMM)
add_definitions(-DPADDLE_MKL_SPLIT_GEMM)
endif()
endif()
set(WITH_MKLML ${WITH_MKL})
if (NOT DEFINED WITH_MKLDNN)
if (WITH_MKL AND AVX2_FOUND)
@ -171,7 +168,6 @@ include(external/python) # download, build, install python
include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn
include(external/swig) # download, build, install swig
include(external/warpctc) # download, build, install warpctc
include(external/boost) # download boost
include(external/any) # download libn::any
include(external/eigen) # download eigen3
@ -179,6 +175,14 @@ include(external/pybind11) # download pybind11
include(external/cares)
include(external/cub)
if (NOT WIN32)
# there is no official support of snappystream, warpctc, nccl, cupti in windows
include(external/snappy) # download snappy
include(external/snappystream) # download snappystream
include(external/warpctc) # download, build, install warpctc
include(cupti)
endif (NOT WIN32)
if(WITH_DISTRIBUTE)
if(WITH_GRPC)
include(external/grpc)
@ -200,13 +204,10 @@ if(WITH_BRPC_RDMA)
endif()
endif()
include(external/snappy) # download snappy
include(external/snappystream)
include(external/threadpool)
include(external/threadpool)
include(flags) # set paddle compile flags
include(cudnn) # set cudnn libraries, must before configure
include(cupti)
include(configure) # add paddle env configuration
if(WITH_GPU)

@ -61,6 +61,11 @@ if(NOT CMAKE_CROSSCOMPILING)
endif()
endif()
if(WIN32)
# windows stupid compile option for all targets.
add_definitions(-D_XKEYCHECK_H)
endif(WIN32)
if(NOT WITH_GOLANG)
add_definitions(-DPADDLE_WITHOUT_GOLANG)
endif(NOT WITH_GOLANG)

@ -28,7 +28,12 @@ if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL))
set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
endif()
MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
IF (WIN32)
MESSAGE(WARNING, "In windows, boost can not be downloaded automaticlly, please build it manually and put it at " ${THIRD_PARTY_PATH}install/boost)
else()
MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
ENDIF(WIN32)
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
@ -36,12 +41,13 @@ set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
include_directories(${BOOST_INCLUDE_DIR})
if (NOT WIN32)
ExternalProject_Add(
${BOOST_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz
&& tar zxf ${BOOST_TAR}.tar.gz
&& tar zxf ${BOOST_TAR}.tar.gz
DOWNLOAD_NO_PROGRESS 1
PREFIX ${BOOST_SOURCES_DIR}
CONFIGURE_COMMAND ""
@ -49,8 +55,9 @@ ExternalProject_Add(
INSTALL_COMMAND ""
UPDATE_COMMAND ""
)
endif(NOT WIN32)
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32)
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
add_library(boost STATIC ${dummyfile})

@ -18,7 +18,7 @@ SET(GFLAGS_SOURCES_DIR ${THIRD_PARTY_PATH}/gflags)
SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags)
SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE)
IF(WIN32)
set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
ELSE(WIN32)
set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
ENDIF(WIN32)
@ -45,7 +45,13 @@ ExternalProject_Add(
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
IF(WIN32)
IF(NOT EXISTS "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib")
add_custom_command(TARGET extern_gflags POST_BUILD
COMMAND cmake -E rename ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib
)
ENDIF()
ENDIF(WIN32)
ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
ADD_DEPENDENCIES(gflags extern_gflags)
@ -60,3 +66,4 @@ IF(WITH_C_API)
INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib)
ENDIF()
ENDIF()

@ -60,6 +60,13 @@ ExternalProject_Add(
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
IF(WIN32)
IF(NOT EXISTS "${GLOG_INSTALL_DIR}/lib/libglog.lib")
add_custom_command(TARGET extern_glog POST_BUILD
COMMAND cmake -E rename ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib
)
ENDIF()
ENDIF(WIN32)
ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})

@ -54,7 +54,7 @@ ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
GIT_TAG "a29d8487a63afca3d5b8c5bbdbb473cf8ccc6e51"
GIT_TAG "64e03a1939e0d526aa8e9f2e3f7dc0ad8d372944"
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}

@ -17,20 +17,29 @@ IF(USE_EIGEN_FOR_BLAS)
ENDIF(USE_EIGEN_FOR_BLAS)
INCLUDE(cblas)
# IF(WIN32 AND NOT ${CBLAS_FOUND})
IF(NOT ${CBLAS_FOUND})
INCLUDE(ExternalProject)
SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
SET(CBLAS_INCLUDE_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
SET(CBLAS_LIBRARIES
"${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
CACHE FILEPATH "openblas library." FORCE)
ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
IF (WIN32)
SET(CBLAS_FOUND true)
MESSAGE(WARNING, "In windows, openblas only support msvc build, please build it manually and put it at " ${CBLAS_INSTALL_DIR})
ENDIF(WIN32)
IF (NOT WIN32)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
SET(OPENBLAS_COMMIT "v0.2.20")
@ -69,7 +78,6 @@ IF(NOT ${CBLAS_FOUND})
ENDIF()
SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
ExternalProject_Add(
extern_openblas
${EXTERNAL_PROJECT_LOG_ARGS}
@ -84,9 +92,11 @@ IF(NOT ${CBLAS_FOUND})
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
)
ELSE()
ENDIF(NOT WIN32)
SET(CBLAS_PROVIDER openblas)
IF(WITH_C_API)
INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas)
INSTALL(DIRECTORY ${CBLAS_INCLUDE_DIR} DESTINATION third_party/openblas)
# Because libopenblas.a is a symbolic link of another library, thus need to
# install the whole directory.
IF(ANDROID)
@ -107,7 +117,8 @@ IF(NOT ${CBLAS_FOUND})
ENDIF(NOT ${CBLAS_FOUND})
MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
MESSAGE(STATUS "BLAS Include: ${CBLAS_INCLUDE_DIR}")
INCLUDE_DIRECTORIES(${CBLAS_INCLUDE_DIR})
# FIXME(gangliao): generate cblas target to track all high performance
# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)

@ -14,11 +14,14 @@
INCLUDE(ExternalProject)
# Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
IF(NOT WIN32)
FIND_PACKAGE(Protobuf QUIET)
ENDIF(NOT WIN32)
macro(UNSET_VAR VAR_NAME)
UNSET(${VAR_NAME} CACHE)
UNSET(${VAR_NAME})
endmacro()
UNSET_VAR(PROTOBUF_INCLUDE_DIR)
UNSET_VAR(PROTOBUF_FOUND)
UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
@ -94,12 +97,14 @@ macro(PROMPT_PROTOBUF_LIB)
SET(protobuf_DEPS ${ARGN})
MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}")
MESSAGE(STATUS "Protobuf-lite library: ${PROTOBUF_LITE_LIBRARY}")
MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}")
MESSAGE(STATUS "Protoc library: ${PROTOBUF_PROTOC_LIBRARY}")
MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}")
INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
# Assuming that all the protobuf libraries are of the same type.
IF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_STATIC_LIBRARY_SUFFIX}$")
IF(${PROTOBUF_LIBRARY} MATCHES ${CMAKE_STATIC_LIBRARY_SUFFIX})
SET(protobuf_LIBTYPE STATIC)
ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$")
SET(protobuf_LIBTYPE SHARED)
@ -137,18 +142,25 @@ macro(SET_PROTOBUF_VERSION)
endmacro()
set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
IF (WIN32)
SET(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf)
MESSAGE(WARNING, "In windows, protobuf only support msvc build, please build it manually and put it at " ${PROTOBUF_ROOT})
ENDIF(WIN32)
if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
find_library(PROTOBUF_LIBRARY protobuf libprotobuf.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
find_library(PROTOBUF_LITE_LIBRARY protobuf-lite libprotobuf-lite.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
find_library(PROTOBUF_PROTOC_LIBRARY protoc libprotoc.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
SET(PROTOBUF_FOUND true)
SET_PROTOBUF_VERSION()
PROMPT_PROTOBUF_LIB()
else()
message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}.")
message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}")
endif()
endif()
@ -239,6 +251,7 @@ IF(CMAKE_CROSSCOMPILING)
CACHE FILEPATH "protobuf executable." FORCE)
ENDIF()
IF(NOT PROTOBUF_FOUND)
build_protobuf(extern_protobuf FALSE)

@ -148,7 +148,8 @@ function(merge_static_libs TARGET_NAME)
COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
)
else() # general UNIX: use "ar" to extract objects and re-add to a common lib
endif(APPLE)
if(LINUX) # general UNIX: use "ar" to extract objects and re-add to a common lib
set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir)
foreach(lib ${libs})
@ -187,7 +188,36 @@ function(merge_static_libs TARGET_NAME)
COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'`
COMMAND ${CMAKE_RANLIB} ${target_LIBNAME}
WORKING_DIRECTORY ${target_DIR})
endif()
endif(LINUX)
if(WIN32) # windows do not support gcc/nvcc combined compiling. Use msvc lib.exe to merge libs.
# Make the generated dummy source file depended on all static input
# libs. If input lib changes,the source file is touched
# which causes the desired effect (relink).
add_custom_command(OUTPUT ${target_SRCS}
COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
DEPENDS ${libs})
# Generate dummy staic lib
file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
add_library(${TARGET_NAME} STATIC ${target_SRCS})
target_link_libraries(${TARGET_NAME} ${libs_deps})
foreach(lib ${libs})
# Get the file names of the libraries to be merged
#if(NOT $<TARGET_FILE:${lib}> MATCHES "lib.*\\.lib")
# message("library" ${lib})
# set(libfiles ${libfiles} lib$<TARGET_FILE:${lib}>)
#else()
set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
#endif()
endforeach()
# windows cmd return error in clean env.
# COMMAND del "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib"
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.lib ${libfiles}
)
endif(WIN32)
endfunction(merge_static_libs)
function(cc_library TARGET_NAME)
@ -195,6 +225,10 @@ function(cc_library TARGET_NAME)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if(WIN32)
# add libxxx.lib prefix in windows
set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}")
endif(WIN32)
if(cc_library_SRCS)
if(cc_library_SHARED OR cc_library_shared) # build *.so
add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})

@ -101,6 +101,7 @@ if(WITH_MKLDNN)
)
endif()
if (NOT WIN32)
if(NOT MOBILE_INFERENCE AND NOT RPI)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
copy(snappy_lib
@ -120,15 +121,23 @@ if(NOT MOBILE_INFERENCE AND NOT RPI)
DSTS ${dst_dir} ${dst_dir}/lib
DEPS zlib)
endif()
endif(NOT WIN32)
# paddle fluid module
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
set(module "framework")
if (NOT WIN32)
copy(framework_lib DEPS framework_py_proto
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
)
else()
copy(framework_lib
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
)
endif(NOT WIN32)
set(module "memory")
copy(memory_lib

@ -1,7 +1,7 @@
# Distributed Training with NCCL2
We design a pattern that can enable training with `ParallelExecutor` and
using [NCCL2](https://developer.nvidia.com/nccl) as it's collective
use [NCCL2](https://developer.nvidia.com/nccl) as it's collective
communication library.
In `ParallelExecutor` we can use `AllReduce` or `Reduce` and `Broadcast`
@ -9,14 +9,14 @@ to do multi GPU training. And if we initialize NCCL2 communicators as
ranks in a distributed environment, we can simply run the `ParallelExecutor`
as a distributed program! The only thing that may be different than in
the single node version is that we need to broadcast the NCCL unique ID
to all the nodes, and initialize communicators using that ID, so NCCL2
will know each other as ranks.
to all the nodes and initialize communicators using that ID, so NCCL2
can know each other as ranks.
To achieve this feature, we introduce a new operator: `gen_nccl_id` op,
so we are ***not*** "bind to" running NCCL2 with MPI, we can run it in
what ever platform you like.
whatever platform you like.
It have two running modes:
It has two running modes:
1. Generate and broadcast mode, which should be used on trainer 0;
1. Listen and fetch mode, which should be used on trainers other than 0.
@ -29,7 +29,7 @@ initialize NCCL communicator objects.
<img src="src/ncc2_design.png">
The above figure indicates the general process when training with NCCL2
distributed. Each trainer have the number of communicators equal to the
distributed. Each trainer has the number of communicators equal to the
number of GPUs, but the ranks should match the global ranks number: here
we have total 8 GPUs, so `nranks==8`, for each trainer, the ranks should
be from 0 ~ 3 on trainer 0 and 4 ~ 7 on trainer 1.

@ -36,19 +36,19 @@
<tbody>
<tr>
<td>OpProtoMake定义 </td>
<td>`.cc`文件Backward Op不需要定义OpProtoMake </td>
<td>.cc 文件Backward Op不需要定义OpProtoMake </td>
</tr>
<tr>
<td>Op定义 </td>
<td> `.cc`文件</td>
<td> .cc 文件</td>
</tr>
<tr>
<td>Kernel实现 </td>
<td> CPU、CUDA共享Kernel实现在`.h`文件中否则CPU 实现在`.cc`文件中CUDA 实现在`.cu`文件中。</td>
<td> CPU、CUDA共享Kernel实现在.h 文件中否则CPU 实现在.cc 文件中CUDA 实现在.cu 文件中。</td>
</tr>
<tr>
<td>注册Op </td>
<td> Op注册实现在`.cc`文件Kernel注册CPU实现在`.cc`文件中CUDA实现在`.cu`文件中</td>
<td> Op注册实现在.cc 文件Kernel注册CPU实现在.cc 文件中CUDA实现在.cu 文件中</td>
</tr>
</tbody>
</table>
@ -391,7 +391,7 @@ PADDLE_ENFORCE(ctx->HasInput("X"), "");
```
问题示例2 :提示信息过于简单
```
PADDLE_ENFORCE(i != nullptr, "I must be set"); // I是什么?
PADDLE_ENFORCE(i != nullptr, "i must be set"); // i是什么?
```
2. 在报错信息中使用开发人员定义的变量缩写,不易理解!

@ -50,6 +50,33 @@ pop-up box, choose the current release branch and click "Run Build" button. You
* pypi does not allow overwrite the already uploaded version of wheel package, even if you delete the
old version. you must change the version number before upload a new one.
### Publish wheel Packages for MacOS
You need to build the binary wheel package for MacOS before publishing, to
make sure that the package can be used by many versions of MacOS
(10.11, 10.12, 10.13) and different python installs (python.org, homebrew, etc.),
you must build the package ***exactly*** following below steps:
Build steps:
1. install python from python.org downloads, and make sure it's currently in use
in your system.
1. `export MACOSX_DEPLOYMENT_TARGET=10.11`, use `10.11` is enough for recent versions.
1. `git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle && mkdir build && cd build`
1. `cmake -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_SYSTEM_BLAS=OFF ..`, make sure the output of `cmake` command is using the correct python interpreter installed from python.org
1. `make -j`
1. `pip install delocate`
1. `mkdir fixed_wheel && delocate-wheel -w fixed_wheel python/dist/*.whl`
Then the whl under `fixed_wheel` is ready to upload.
Install steps:
1. run `pip install paddlepaddle...whl`
1. find the `libpython.dylib` that are currently in use:
- for python.org package installs, do nothing.
- for other python installs, find the path of `libpython*.dylib` and `export LD_LIBRARY_PATH=you path && DYLD_LIBRARY_PATH=your path`
## Publish Docker Images
Our CI tool will push latest images to DockerHub, so we only need to push a version tag like:

@ -1,12 +1,12 @@
# Distributed Training with NCCL2 and RDMA
When doing distributed multi-GPU training, network bandwith often becomes the
bottle neck. We introduce a way to use NCCL2 to do such training job to
achieve best performace.
When doing distributed multi-GPU training, network bandwidth often becomes the
bottleneck. We introduce a way to use NCCL2 to do such training job to
achieve best performance.
## Prepare Hardwares with RDMA and Multiple GPUs
## Prepare Hardware with RDMA and Multiple GPUs
I'm using two Linux servers each of them is installed with 8 GPUs and
I'm using two Linux servers each of them installed with 8 GPUs and
one 100Gb RDMA card.
Base environment is:
@ -25,7 +25,7 @@ In general, the steps including:
1. Use docker to run tests and make sure GPUs and RDMA can work inside
the container.
I'll ommit section "Install GPU drivers" because we can find it easily
I'll omit the section "Install GPU drivers" because we can find it easily
somewhere else.
### Install RDMA drivers
@ -33,7 +33,7 @@ somewhere else.
For my case, I've got two machines with device
"Mellanox Technologies MT27700 Family [ConnectX-4]" installed. The OS was
"CentOS 7.4" and I updated the kernel to version 4.4 so that docker can
work with latest overlay2 filesystem.
work with the latest overlay2 filesystem.
***NOTE: before you start, make sure you have a way to get a console
of the server other than ssh because we may need to re-configure the
@ -45,14 +45,14 @@ network device.***
1. Run `./mlnxofedinstall --add-kernel-support` in the software package.
1. Run `/etc/init.d/openibd restart` to make everything work, note that
this operation may cause the network goes down if you are using this
RDMA device as default network device and use ssh to login the server.
RDMA device as default network device and use ssh to log in the server.
1. Re-configure the network interface, for example:
`ifconfig eth2 192.168.16.30/20 up`, then add routes if needed:
`ip route add default via 192.168.16.1 dev eth2`.
1. Do the same thing on the other node.
1. Use `ping` to test if the two nodes have typical ICMP connection.
1. Use either `udaddy` or `ib_write_bw` to test the network connection is
ready and have the desired bandwith.
ready and have the desired bandwidth.
### Prepare Docker Image to Run RDMA Programs
@ -60,7 +60,7 @@ network device.***
package in it.
1. Start a docker container and mount GPU driver libs into it (you can
skip this step if you are using nvidia-docker).
1. Mount RDMA dirvers and libs into the docker image (see below section),
1. Mount RDMA drivers and libs into the docker image (see below section),
also `udaddy` and `ib_write_bw` if needed.
1. Mount GPU devices and RDMA devices into the container using `--device`
or just use privileged mode `--privileged`.

@ -9,8 +9,6 @@ Paddle 预测 API
- 头文件 ``paddle_inference_api.h`` 定义了所有的接口
- 库文件\ ``libpaddle_fluid.so````libpaddle_fluid.a``
- 库文件 ``libpaddle_inference_api.so``
``libpaddle_inference_api.a``
编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib`
@ -97,8 +95,7 @@ engine
CHECK(predictor->Run(slots, &outputs));
// 获取 outputs ...
编译时,联编 ``libpaddle_fluid.a/.so``
``libpaddle_inference_api.a/.so`` 便可。
编译时,联编 ``libpaddle_fluid.a/.so`` 即可。
详细代码参考
------------

File diff suppressed because one or more lines are too long

@ -2,9 +2,13 @@ add_subdirectory(memory)
add_subdirectory(platform)
add_subdirectory(framework)
add_subdirectory(operators)
add_subdirectory(pybind)
add_subdirectory(string)
if (NOT WIN32)
add_subdirectory(pybind)
add_subdirectory(recordio)
endif(NOT WIN32)
if(WITH_INFERENCE)
# NOTE: please add subdirectory inference at last.
add_subdirectory(inference)

@ -1,5 +1,7 @@
add_subdirectory(details)
add_subdirectory(ir)
if (NOT WIN32)
add_subdirectory(details)
endif (NOT WIN32)
# ddim lib
proto_library(framework_proto SRCS framework.proto)
@ -28,8 +30,12 @@ if(WITH_GPU)
else()
cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
endif()
if (NOT WIN32)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
else()
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
endif (NOT WIN32)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
@ -69,14 +75,22 @@ cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute
cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
if (NOT WIN32)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler)
else()
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor)
endif(NOT WIN32)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
if (NOT WIN32)
py_proto_compile(framework_py_proto SRCS framework.proto)
# Generate an empty __init__.py to make framework_py_proto as a valid python module.
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
@ -86,17 +100,18 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
COMMENT "Copy generated python proto into directory paddle/fluid/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif(NOT WIN32)
cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
if(WITH_DISTRIBUTE)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else()
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
endif()
if (NOT WIN32)
@ -120,7 +135,9 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
# cc_test(channel_test SRCS channel_test.cc)
cc_test(tuple_test SRCS tuple_test.cc )
if (NOT WIN32)
cc_test(rw_lock_test SRCS rw_lock_test.cc)
endif (NOT WIN32)
# disable test temporarily.
# TODO https://github.com/PaddlePaddle/Paddle/issues/11971

@ -0,0 +1,48 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace framework {
template <typename T, size_t N>
class Array {
static_assert(N > 0, "The size of array must be larger than 0");
public:
HOSTDEVICE Array() {}
HOSTDEVICE explicit Array(const T &val) {
for (size_t i = 0; i < N; ++i) data_[i] = val;
}
HOSTDEVICE const T *Get() const { return data_; }
HOSTDEVICE T *GetMutable() { return data_; }
HOSTDEVICE T &operator[](size_t index) { return data_[index]; }
HOSTDEVICE const T &operator[](size_t index) const { return data_[index]; }
HOSTDEVICE constexpr size_t size() const { return N; }
private:
T data_[N];
};
} // namespace framework
} // namespace paddle

@ -64,6 +64,7 @@ static DataTypeMap* InitDataTypeMap() {
RegType(size_t, proto::VarType::SIZE_T);
RegType(int16_t, proto::VarType::INT16);
RegType(uint8_t, proto::VarType::UINT8);
RegType(int8_t, proto::VarType::INT8);
#undef RegType
return retv;

@ -26,6 +26,7 @@ namespace framework {
extern proto::VarType::Type ToDataType(std::type_index type);
extern std::type_index ToTypeIndex(proto::VarType::Type type);
#if !defined(_WIN32)
template <typename Visitor>
inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
switch (type) {
@ -53,10 +54,47 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
case proto::VarType::INT16:
visitor.template operator()<int16_t>();
break;
case proto::VarType::INT8:
visitor.template operator()<int8_t>();
break;
default:
PADDLE_THROW("Not supported %d", type);
}
}
#else
// the msvc compiler do not implement two-stage name lookup correctly.
template <typename Visitor>
inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
switch (type) {
case proto::VarType::FP16:
visitor.operator()<platform::float16>();
break;
case proto::VarType::FP32:
visitor.operator()<float>();
break;
case proto::VarType::FP64:
visitor.operator()<double>();
break;
case proto::VarType::INT32:
visitor.operator()<int>();
break;
case proto::VarType::INT64:
visitor.operator()<int64_t>();
break;
case proto::VarType::BOOL:
visitor.operator()<bool>();
break;
case proto::VarType::UINT8:
visitor.operator()<uint8_t>();
break;
case proto::VarType::INT16:
visitor.operator()<int16_t>();
break;
default:
PADDLE_THROW("Not supported %d", type);
}
}
#endif // _WIN32
extern std::string DataTypeToString(const proto::VarType::Type type);
extern size_t SizeOfType(std::type_index type);

@ -754,9 +754,20 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
node->Op()->Type());
CreateComputationalOp(result, node, op_dev_id);
if (node->Op()->Type() == "concat") {
ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(),
"fetch_barrier");
}
void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
for (ir::Node *input : node->inputs) {
VarHandle *var = nullptr;
for (int place_offset = 0; place_offset < num_places; ++place_offset) {
auto &var_holders = result->Get<GraphVars>(kGraphVars)[place_offset];
auto &var_holder = var_holders[input->Name()];
if (!var_holder.empty()) {
var = var_holder.rbegin()->get();
op_handle->AddInput(var);
}
}
}
}
@ -771,59 +782,83 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
"This hack no longer holds, please fix.");
// the variable name which contains .block means it was splited by
// split_byref op
// so that we can balance the variable blocks to all the pserver
// instances.
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce &&
node->inputs[0]->Name().find(".block") == std::string::npos) {
std::vector<std::string> input_var_names;
for (ir::Node *n : node->inputs) {
input_var_names.push_back(n->Name());
}
op_dev_id = GetAppropriateDeviceID(input_var_names);
auto send_param_grad = boost::get<std::vector<std::string>>(
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(send_param_grad.size(), 2U);
op_dev_id = GetAppropriateDeviceID({send_param_grad[1]});
VLOG(10) << "send grad " << input_var_names[0] << " origin "
<< send_param_grad[1] << " place: " << op_dev_id;
for (auto &varname : input_var_names) {
result->Get<ShardedVarDevice>(kShardedVarDevice)
.emplace(varname, op_dev_id);
}
result->Get<ShardedVarDevice>(kShardedVarDevice)
.emplace(send_param_grad[1], op_dev_id);
}
} else if (node->Op()->Type() == "recv") {
std::vector<std::string> output_var_names;
for (ir::Node *n : node->outputs) {
output_var_names.push_back(n->Name());
}
op_dev_id = GetAppropriateDeviceID(output_var_names);
auto recv_param_grad = boost::get<std::vector<std::string>>(
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
if (recv_param_grad.size() == 2U) {
op_dev_id = GetVarDeviceID(*result, recv_param_grad[1]);
VLOG(10) << "recv param " << recv_param_grad[0]
<< " get grad place: " << recv_param_grad[1]
<< " place: " << op_dev_id;
} else {
op_dev_id = GetAppropriateDeviceID(output_var_names);
}
for (auto &varname : output_var_names) {
result->Get<ShardedVarDevice>(kShardedVarDevice)
.emplace(varname, op_dev_id);
}
} else {
// send_barrier and fetch_barrier op can be scheduled on device 0
// send_barrier, fetch_barrier will run on place 0;
op_dev_id = 0;
}
PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
node->Op()->Type());
result->Get<GraphOps>(kGraphOps).emplace_back(new RPCOpHandle(
result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id],
node->Op()->Type(), places_[op_dev_id]));
// TODO(panyx0718): This might not be needed anymore.
if (node->Op()->Type() == "send_barrier") {
ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(), "send");
} else if (node->Op()->Type() == "recv") {
ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(),
"send_barrier");
} else if (node->Op()->Type() == "fetch_barrier") {
ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(), "recv");
} else if (node->Op()->Type() == "send") {
// do nothing
if (node->Op()->Type() == "send") {
CreateOpHandleIOs(result, node, op_dev_id);
} else {
PADDLE_THROW(
"rpc op should be in ["
"send, send_barrier. recv, fetch_barrier]");
}
// send_barrier, recv, fetch_barrier's inputs are deps var, get them from
// all places
auto p = places_[op_dev_id];
auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
op_handle->SetDeviceContext(p,
platform::DeviceContextPool::Instance().Get(p));
CreateOpHandleIOs(result, node, op_dev_id);
SetOpInputsAllPlaces(result, node, places_.size());
for (ir::Node *output : node->outputs) {
int outvar_dev_id = op_dev_id;
if (node->Op()->Type() == "fetch_barrier") {
outvar_dev_id = GetVarDeviceID(*result, output->Name());
PADDLE_ENFORCE_NE(outvar_dev_id, -1);
}
p = places_[outvar_dev_id];
ir::Node *new_node = nullptr;
if (output->Var()) {
new_node = result->CreateVarNode(output->Var());
} else {
new_node =
result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable);
}
CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id);
}
}
}
bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {

@ -54,7 +54,8 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
<< "\\n"
<< var_handle_ptr->place_ << "\\n"
<< var_handle_ptr->version_ << "\"]" << std::endl;
<< "scope: " << var_handle_ptr->scope_idx_ << "\\n"
<< "v" << var_handle_ptr->version_ << "\"]" << std::endl;
} else if (dummy_ptr) {
sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
}

@ -107,6 +107,7 @@ message VarType {
// Tensor<size_t> is used in C++.
SIZE_T = 19;
UINT8 = 20;
INT8 = 21;
// Other types that may need additional descriptions
LOD_TENSOR = 7;

@ -3,6 +3,7 @@ cc_library(graph SRCS graph.cc DEPS node)
cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
cc_library(graph_to_program_pass SRCS graph_to_program_pass.cc DEPS graph pass graph_helper)
cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
cc_library(graph_pattern_detecter SRCS graph_pattern_detecter.cc DEPS graph graph_helper graph_traits)
cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detecter)
@ -12,5 +13,6 @@ cc_library(infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass
cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
cc_test(test_graph_pattern_detecter SRCS graph_pattern_detecter_tester.cc DEPS graph_pattern_detecter)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detecter graph pass graph_traits framework_proto)

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save