From 9d67c1fb69538faa2e74fbeca85ea685e5229a60 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 5 Nov 2018 15:13:53 +0800 Subject: [PATCH 01/50] cpu build support --- CMakeLists.txt | 6 + cmake/external/boost.cmake | 57 ++++--- cmake/external/eigen.cmake | 5 +- cmake/external/gflags.cmake | 14 +- cmake/external/glog.cmake | 9 +- cmake/external/gtest.cmake | 5 +- cmake/external/openblas.cmake | 143 ++++++++++-------- cmake/external/protobuf.cmake | 15 +- cmake/external/python.cmake | 42 +++++ cmake/external/xxhash.cmake | 61 ++++++-- cmake/external/zlib.cmake | 5 +- cmake/generic.cmake | 50 +++++- cmake/inference_lib.cmake | 28 +++- paddle/fluid/CMakeLists.txt | 9 +- .../framework/ir/attention_lstm_fuse_pass.cc | 18 +-- paddle/fluid/framework/ir/node.h | 2 +- paddle/fluid/framework/ir/pass.h | 4 +- paddle/fluid/framework/operator.cc | 5 +- paddle/fluid/inference/CMakeLists.txt | 4 + paddle/fluid/inference/analysis/helper.h | 4 + paddle/fluid/inference/api/api_impl.cc | 4 + paddle/fluid/inference/api/helper.h | 4 + paddle/fluid/operators/CMakeLists.txt | 5 +- .../fluid/operators/elementwise_op_function.h | 26 ++++ paddle/fluid/operators/math/CMakeLists.txt | 4 +- paddle/fluid/platform/init.cc | 2 + paddle/fluid/platform/nccl_helper.h | 2 + paddle/fluid/platform/variant.h | 8 + paddle/fluid/pybind/CMakeLists.txt | 8 +- paddle/fluid/pybind/pybind.cc | 30 +++- python/CMakeLists.txt | 39 +++-- python/setup.py.in | 51 ++++--- 32 files changed, 497 insertions(+), 172 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e5b2f32fba..9a895a19c4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,6 +26,11 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") if(WIN32) set(CMAKE_STATIC_LIBRARY_PREFIX lib) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") endif(WIN32) if(NOT CMAKE_CROSSCOMPILING) @@ -73,6 +78,7 @@ option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON) +option(WITH_PREBUILD_OPENBLAS "Make use of the pre-built openblas library" ON) # PY_VERSION if(NOT PY_VERSION) diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake index ada61de8eb..7c19183df4 100644 --- a/cmake/external/boost.cmake +++ b/cmake/external/boost.cmake @@ -28,34 +28,47 @@ if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL)) set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE) set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE) endif() -IF (WIN32) - MESSAGE(WARNING, "In windows, boost can not be downloaded automaticlly, please build it manually and put it at " ${THIRD_PARTY_PATH}install/boost) -else() - MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") -ENDIF(WIN32) + +MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") -set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." 
FORCE) +if (WIN32) + set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}" CACHE PATH "boost include directory." FORCE) +else(WIN32) + set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE) +endif (WIN32) set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1) include_directories(${BOOST_INCLUDE_DIR}) - -if (NOT WIN32) -ExternalProject_Add( - ${BOOST_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz - && tar zxf ${BOOST_TAR}.tar.gz - DOWNLOAD_NO_PROGRESS 1 - PREFIX ${BOOST_SOURCES_DIR} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - UPDATE_COMMAND "" -) -endif(NOT WIN32) +if (WIN32) + ExternalProject_Add( + ${BOOST_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR} + URL ${BOOST_URL} + DOWNLOAD_NO_PROGRESS 0 + PREFIX ${BOOST_SOURCES_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + UPDATE_COMMAND "" + ) +else() + ExternalProject_Add( + ${BOOST_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR} + DOWNLOAD_COMMAND "wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz + && tar zxf ${BOOST_TAR}.tar.gz" + DOWNLOAD_NO_PROGRESS 0 + PREFIX ${BOOST_SOURCES_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + UPDATE_COMMAND "" + ) +endif () if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32) set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 573ad5e5f0..2aa64a350a 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -29,10 +29,11 @@ else() ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" +# GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" + GIT_REPOSITORY "http://admin@localhost:8080/r/eigen3.git" # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen - GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c +# GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c PREFIX ${EIGEN_SOURCE_DIR} DOWNLOAD_NAME "eigen" UPDATE_COMMAND "" diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index cf58cc3976..9c6974b8f0 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -28,14 +28,20 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) ExternalProject_Add( extern_gflags ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/gflags/gflags.git" - GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a +# GIT_REPOSITORY "https://github.com/gflags/gflags.git" + GIT_REPOSITORY "http://admin@localhost:8080/r/gflags.git" +# GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE} + -DBUILD_STATIC_LIBS=ON -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DBUILD_TESTING=OFF @@ -48,8 +54,8 @@ ExternalProject_Add( IF(WIN32) IF(NOT EXISTS "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib") 
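A note on the CMAKE_ARGS lists used for gflags (and glog below): an ExternalProject is configured by a child CMake process that does not inherit the parent cache, so the /MTd and /MT runtime flags chosen in the top-level CMakeLists.txt must be forwarded per configuration, otherwise the dependency is built against a different MSVC runtime than Paddle and the final link fails. The idiom in isolation (a sketch; extern_foo and its source URL are hypothetical):

ExternalProject_Add(extern_foo
    GIT_REPOSITORY "https://example.com/foo.git"   # placeholder source
    CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
               -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
               -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG})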
add_custom_command(TARGET extern_gflags POST_BUILD - COMMAND cmake -E rename ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib - ) + COMMAND cmake -E copy ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib + ) ENDIF() ENDIF(WIN32) ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 25ef2970ac..84f8127760 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -34,19 +34,24 @@ ELSE() SET(GLOG_REPOSITORY "https://github.com/google/glog.git") SET(GLOG_TAG "v0.3.5") ENDIF() + SET(GLOG_REPOSITORY "http://admin@localhost:8080/r/glog.git") ExternalProject_Add( extern_glog ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS gflags GIT_REPOSITORY ${GLOG_REPOSITORY} - GIT_TAG ${GLOG_TAG} + # GIT_TAG ${GLOG_TAG} PREFIX ${GLOG_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE} -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib -DCMAKE_POSITION_INDEPENDENT_CODE=ON @@ -63,7 +68,7 @@ ExternalProject_Add( IF(WIN32) IF(NOT EXISTS "${GLOG_INSTALL_DIR}/lib/libglog.lib") add_custom_command(TARGET extern_glog POST_BUILD - COMMAND cmake -E rename ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib + COMMAND cmake -E copy ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib ) ENDIF() ENDIF(WIN32) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index d335298742..4f5acc92f0 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -43,8 +43,9 @@ IF(WITH_TESTING) extern_gtest ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${GTEST_DEPENDS} - GIT_REPOSITORY "https://github.com/google/googletest.git" - GIT_TAG "release-1.8.0" + # GIT_REPOSITORY "https://github.com/google/googletest.git" + GIT_REPOSITORY "http://admin@localhost:8080/r/gtest.git" +# GIT_TAG "release-1.8.0" PREFIX ${GTEST_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 755dbd610c..664422813d 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -17,12 +17,8 @@ IF(USE_EIGEN_FOR_BLAS) ENDIF(USE_EIGEN_FOR_BLAS) INCLUDE(cblas) -# IF(WIN32 AND NOT ${CBLAS_FOUND}) - - IF(NOT ${CBLAS_FOUND}) - INCLUDE(ExternalProject) SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas) @@ -34,66 +30,95 @@ IF(NOT ${CBLAS_FOUND}) CACHE FILEPATH "openblas library." 
FORCE) ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS) - IF (WIN32) - SET(CBLAS_FOUND true) - MESSAGE(WARNING, "In windows, openblas only support msvc build, please build it manually and put it at " ${CBLAS_INSTALL_DIR}) - ENDIF(WIN32) - IF (NOT WIN32) - SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") - SET(OPENBLAS_COMMIT "v0.2.20") - - IF(CMAKE_CROSSCOMPILING) - SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER}) - GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY) - SET(CROSS_SUFFIX ${CROSS_SUFFIX}/) - IF(ANDROID) - IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") - # use softfp - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0) - ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0) + IF (WITH_PREBUILD_OPENBLAS) + SET(CBLAS_FOUND true) + MESSAGE(STATUS, "Use prebuild openblas, please put it at " ${CBLAS_INSTALL_DIR}) + ELSE(WITH_PREBUILD_OPENBLAS) + SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") + SET(OPENBLAS_COMMIT "v0.2.20") + + IF(CMAKE_CROSSCOMPILING) + SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER}) + GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY) + SET(CROSS_SUFFIX ${CROSS_SUFFIX}/) + IF(ANDROID) + IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") + # use softfp + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0) + ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0) + ENDIF() + ELSEIF(IOS) + IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64") + SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") + SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64") + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX}) + ELSE() + MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. " + "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.") + ENDIF() + ELSEIF(RPI) + # use hardfp + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0) ENDIF() - ELSEIF(IOS) - IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64") - SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") - SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64") - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX}) - ELSE() - MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. 
" - "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.") + ELSE() + IF(APPLE) + SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}") + ENDIF() + SET(OPTIONAL_ARGS "") + IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$") + SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64) ENDIF() - ELSEIF(RPI) - # use hardfp - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0) - ENDIF() - ELSE() - IF(APPLE) - SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}") - ENDIF() - SET(OPTIONAL_ARGS "") - IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$") - SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64) ENDIF() - ENDIF() - SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs) - ExternalProject_Add( - extern_openblas - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git - GIT_TAG ${OPENBLAS_COMMIT} - PREFIX ${CBLAS_SOURCES_DIR} - INSTALL_DIR ${CBLAS_INSTALL_DIR} - BUILD_IN_SOURCE 1 - BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS} - INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX= - && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - ) - ELSE() - ENDIF(NOT WIN32) + IF(WIN32) + ExternalProject_Add( + extern_openblas + ${EXTERNAL_PROJECT_LOG_ARGS} + # GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git + GIT_REPOSITORY http://admin@localhost:8080/r/openblas.git + # GIT_TAG ${OPENBLAS_COMMIT} + PREFIX ${CBLAS_SOURCES_DIR} + INSTALL_DIR ${CBLAS_INSTALL_DIR} + BUILD_IN_SOURCE 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DNO_SHARED=ON + -DNO_STATIC=OFF + -DBUILD_WITHOUT_LAPACK=ON + -DUSE_THREAD=OFF + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + ) + ELSE(WIN32) + SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs) + ExternalProject_Add( + extern_openblas + ${EXTERNAL_PROJECT_LOG_ARGS} + # GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git + GIT_REPOSITORY http://admin@localhost:8080/r/openblas.git + # GIT_TAG ${OPENBLAS_COMMIT} + PREFIX ${CBLAS_SOURCES_DIR} + INSTALL_DIR ${CBLAS_INSTALL_DIR} + BUILD_IN_SOURCE 1 + BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS} + INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX= + && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + ) + ENDIF(WIN32) + ENDIF (WITH_PREBUILD_OPENBLAS) + SET(CBLAS_PROVIDER openblas) IF(WITH_C_API) INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 550b0dada8..d4c6ea7819 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -144,7 +144,6 @@ endmacro() set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf") IF (WIN32) SET(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf) - MESSAGE(WARNING, "In windows, protobuf only support msvc build, please build it manually and put 
it at " ${PROTOBUF_ROOT}) ENDIF(WIN32) if (NOT "${PROTOBUF_ROOT}" STREQUAL "") @@ -192,16 +191,24 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + "-DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG}" + "-DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" + "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" "-Dprotobuf_WITH_ZLIB=ON" "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}" ${EXTERNAL_OPTIONAL_ARGS}) SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") ENDIF() + IF(WIN32) + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64") + ENDIF() - SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") - SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") + # SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") + # SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") + SET(PROTOBUF_REPO http://admin@localhost:8080/r/protobuf.git) IF(MOBILE_INFERENCE) # The reason why the official version is not used is described in # https://github.com/PaddlePaddle/Paddle/issues/6114 diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index f17b8d46dc..a3599dd798 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -21,6 +21,48 @@ INCLUDE(python_module) FIND_PACKAGE(PythonInterp ${PY_VERSION}) FIND_PACKAGE(PythonLibs ${PY_VERSION}) +if(WIN32) + execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" +"from distutils import sysconfig as s;import sys;import struct; +print(sys.prefix); +print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION')); +" + RESULT_VARIABLE _PYTHON_SUCCESS + OUTPUT_VARIABLE _PYTHON_VALUES + ERROR_VARIABLE _PYTHON_ERROR_VALUE) + + if(NOT _PYTHON_SUCCESS MATCHES 0) + set(PYTHONLIBS_FOUND FALSE) + return() + endif() + + # Convert the process output into a list + string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES}) + string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES}) + list(GET _PYTHON_VALUES 0 PYTHON_PREFIX) + list(GET _PYTHON_VALUES 1 PYTHON_LIBRARY_SUFFIX) + + # Make sure all directory separators are '/' + string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX}) + + set(PYTHON_LIBRARY + "${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") + + # when run in a venv, PYTHON_PREFIX points to it. But the libraries remain in the + # original python installation. They may be found relative to PYTHON_INCLUDE_DIR. + if(NOT EXISTS "${PYTHON_LIBRARY}") + get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY) + set(PYTHON_LIBRARY + "${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") + endif() + + # raise an error if the python libs are still not found. + if(NOT EXISTS "${PYTHON_LIBRARY}") + message(FATAL_ERROR "Python libraries not found") + endif() + SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}") +endif(WIN32) + # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE. 
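The WIN32 block above asks the interpreter itself for sys.prefix and the LDVERSION/VERSION config variables and derives the import-library path from them, which tends to be more reliable than trusting FIND_PACKAGE(PythonLibs) alone, especially under a virtualenv (the venv case is noted in the comment above). The same interpreter-query technique in its smallest form (a sketch, not part of the patch); OUTPUT_STRIP_TRAILING_WHITESPACE spares the manual newline splitting done above:

execute_process(
    COMMAND "${PYTHON_EXECUTABLE}" -c "import sys; print(sys.prefix)"
    OUTPUT_VARIABLE PYTHON_PREFIX
    OUTPUT_STRIP_TRAILING_WHITESPACE)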
ADD_LIBRARY(python SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index c227e09719..4c2d64f627 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -14,23 +14,52 @@ ELSE() ENDIF(APPLE) ENDIF() -ExternalProject_Add( - extern_xxhash - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" - GIT_TAG "v0.6.5" - PREFIX ${XXHASH_SOURCE_DIR} - DOWNLOAD_NAME "xxhash" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_IN_SOURCE 1 - PATCH_COMMAND - BUILD_COMMAND ${BUILD_CMD} - INSTALL_COMMAND export PREFIX=${XXHASH_INSTALL_DIR}/ && make install - TEST_COMMAND "" -) +if(WIN32) + ExternalProject_Add( + extern_xxhash + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" + GIT_TAG "v0.6.5" + PREFIX ${XXHASH_SOURCE_DIR} + DOWNLOAD_NAME "xxhash" + UPDATE_COMMAND "" + BUILD_IN_SOURCE 1 + PATCH_COMMAND + CONFIGURE_COMMAND + ${CMAKE_COMMAND} ${XXHASH_SOURCE_DIR}/src/extern_xxhash/cmake_unofficial + -DCMAKE_INSTALL_PREFIX:PATH=${XXHASH_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} + -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DBUILD_XXHSUM=OFF + -DCMAKE_GENERATOR_PLATFORM=x64 + -DBUILD_SHARED_LIBS=OFF + ${OPTIONAL_CACHE_ARGS} + TEST_COMMAND "" + ) +else() + ExternalProject_Add( + extern_xxhash + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" + GIT_TAG "v0.6.5" + PREFIX ${XXHASH_SOURCE_DIR} + DOWNLOAD_NAME "xxhash" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + PATCH_COMMAND + BUILD_COMMAND ${BUILD_CMD} + INSTALL_COMMAND export PREFIX=${XXHASH_INSTALL_DIR}/ && make install + TEST_COMMAND "" + ) +endif() -set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") +if (WIN32) + set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib") +else() + set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") +endif () INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR}) add_library(xxhash STATIC IMPORTED GLOBAL) diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index c3d7323545..b65f2afbc2 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -31,8 +31,9 @@ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zl ExternalProject_Add( extern_zlib ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/madler/zlib.git" - GIT_TAG "v1.2.8" + # GIT_REPOSITORY "https://github.com/madler/zlib.git" + GIT_REPOSITORY "http://admin@localhost:8080/r/zlib.git" +# GIT_TAG "v1.2.8" PREFIX ${ZLIB_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 62227c6784..174e5b2d17 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -266,7 +266,11 @@ function(cc_library TARGET_NAME) if("${cc_library_DEPS};" MATCHES "python;") list(REMOVE_ITEM cc_library_DEPS python) add_dependencies(${TARGET_NAME} python) - target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup") + if(WIN32) + target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES}) + else() + target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup") + endif(WIN32) endif() target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) @@ -288,6 +292,50 @@ function(cc_library TARGET_NAME) endif(cc_library_SRCS) endfunction(cc_library) +# The link 
operation under windows may exceed the maximum command-line length; breaking the link command
+# into multiple link operations fixes that, e.g.
+# original:
+# lib /out:target.lib a.lib b.lib c.lib d.lib
+# after:
+# 1. lib /out:dummy_lib_1.lib a.lib b.lib
+# 2. lib /out:dummy_lib_2.lib c.lib d.lib
+# 3. lib /out:target.lib dummy_lib_1.lib dummy_lib_2.lib
+function(sep_library TARGET_NAME)
+  set(options STATIC static SHARED shared)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  set(${TARGET_NAME}_dummy_flag "")
+  if(${sep_library_STATIC})
+    set(${TARGET_NAME}_dummy_flag "STATIC")
+  elseif(${sep_library_SHARED})
+    set(${TARGET_NAME}_dummy_flag "SHARED")
+  endif()
+  cmake_parse_arguments(sep_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(dummy_index 1)
+  set(dummy_offset 1)
+  # each dummy target is composed of at most dummy_limit libraries
+  set(dummy_limit 50)
+  list(LENGTH sep_library_DEPS sep_all_len)
+  foreach(v ${sep_library_DEPS})
+    list(APPEND dummy_list ${v})
+    list(LENGTH dummy_list listlen)
+    if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${sep_all_len}))
+      message("create dummy library ${TARGET_NAME}_dummy_lib_${dummy_index} for ${TARGET_NAME}")
+      # set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy_${dummy_index}.c)
+      # file(WRITE ${dummyfile} "const char *dummy_${TARGET_NAME}_${dummy_index} = \"${dummyfile}\";")
+      # cc_library(${TARGET_NAME}_dummy_lib_${dummy_index} ${${TARGET_NAME}_dummy_flag} SRCS ${dummyfile} DEPS ${dummy_list})
+      cc_library(${TARGET_NAME}_dummy_lib_${dummy_index} ${${TARGET_NAME}_dummy_flag} DEPS ${dummy_list})
+      foreach(i ${dummy_list})
+        list(REMOVE_AT dummy_list 0)
+      endforeach()
+      list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_lib_${dummy_index})
+      MATH(EXPR dummy_index "${dummy_index}+1")
+    endif()
+    MATH(EXPR dummy_offset "${dummy_offset}+1")
+  endforeach()
+  cc_library(${TARGET_NAME} ${${TARGET_NAME}_dummy_flag} SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list})
+endfunction(sep_library)
+
 function(cc_binary TARGET_NAME)
   set(options "")
   set(oneValueArgs "")
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index efdb093a7b..8af88833db 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -31,10 +31,32 @@ function(copy TARGET)
     foreach(index RANGE ${len})
         list(GET copy_lib_SRCS ${index} src)
         list(GET copy_lib_DSTS ${index} dst)
-        add_custom_command(TARGET ${TARGET} PRE_BUILD
-            COMMAND mkdir -p "${dst}"
-            COMMAND cp -r "${src}" "${dst}"
+
+        if (WIN32)
+            # the windows cmd shell will not expand wildcards automatically;
+            # expand the header/lib file lists below and copy the files one by one.
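The ${CMAKE_COMMAND} -E launchers used below replace the mkdir -p / cp -r pair because cmd.exe provides neither, while -E subcommands behave identically on every platform. A minimal sketch of the portable per-file rule (my_target, src_file and dst are illustrative names); -E copy_if_different would additionally skip files that are already up to date:

add_custom_command(TARGET my_target PRE_BUILD
    COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
    COMMAND ${CMAKE_COMMAND} -E copy_if_different "${src_file}" "${dst}")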
+ file(GLOB header_files ${src} "*.h") + file(GLOB static_lib_files ${src} "*.lib") + file(GLOB dll_lib_files ${src} "*.dll") + set(src_files ${header_files} ${static_lib_files} ${dll_lib_files}) + + if (NOT "${src_files}" STREQUAL "") + list(REMOVE_DUPLICATES src_files) + endif() + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" + ) + foreach(src_file ${src_files}) + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}" + COMMENT "copying ${src_file} -> ${dst}") + endforeach() + else() # not windows + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" + COMMAND ${CMAKE_COMMAND} -E copy "${src_files}" "${dst}" COMMENT "copying ${src} -> ${dst}") + endif(WIN32) endforeach() endfunction() diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 7d48f00571..528d627728 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -4,11 +4,14 @@ add_subdirectory(framework) add_subdirectory(operators) add_subdirectory(string) -if (NOT WIN32) add_subdirectory(pybind) +if (NOT WIN32) add_subdirectory(recordio) endif(NOT WIN32) -# NOTE: please add subdirectory inference at last. -add_subdirectory(inference) +if(WITH_INFERENCE) + # NOTE: please add subdirectory inference at last. + add_subdirectory(inference) +endif() + add_subdirectory(train) diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index 6090f1fe76..66d81f0ec4 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -211,12 +211,12 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0, VLOG(3) << "LSTMWeight resized to " << out->dims(); float* out_data = out->mutable_data(platform::CPUPlace()); - std::array tensors( - {{W_forget_w0.data(), W_input_w0.data(), - W_output_w0.data(), W_cell_w0.data()}}); - std::array tensors1( - {{W_forget_w1.data(), W_input_w1.data(), - W_output_w1.data(), W_cell_w1.data()}}); + std::array tensors = + {W_forget_w0.data(), W_input_w0.data(), + W_output_w0.data(), W_cell_w0.data()}; + std::array tensors1 = + {W_forget_w1.data(), W_input_w1.data(), + W_output_w1.data(), W_cell_w1.data()}; for (int row = 0; row < D; row++) { for (int col = 0; col < 4; col++) { @@ -238,9 +238,9 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0, void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, const LoDTensor& B_output, const LoDTensor& B_cell, LoDTensor* out) { - std::array tensors( - {{B_forget.data(), B_input.data(), B_output.data(), - B_cell.data()}}); + std::array tensors = + {B_forget.data(), B_input.data(), B_output.data(), + B_cell.data()}; PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1); int D = B_forget.dims()[0]; diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index d6d42f5e92..2565fc2ab8 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -28,7 +28,7 @@ namespace ir { class Node { public: enum class Type { kOperation, kVariable }; - static constexpr char kControlDepVarName[] = "__control_var"; + static constexpr const char kControlDepVarName[] = "__control_var"; Type NodeType() const { return type_; } diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 9570c59cff..e1767337ab 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ 
-207,7 +207,7 @@ struct PassRegistrar : public Registrar { return 0; \ } \ static ::paddle::framework::ir::PassRegistrar \ - &__pass_tmp_registrar_##pass_type##__ __attribute__((unused)) = \ + &__pass_tmp_registrar_##pass_type##__ __UNUSED__() = \ __pass_registrar_##pass_type##__ #define USE_PASS(pass_type) \ @@ -215,7 +215,7 @@ struct PassRegistrar : public Registrar { __use_pass_itself_##pass_type, \ "USE_PASS must be called in global namespace"); \ extern int TouchPassRegistrar_##pass_type(); \ - static int use_pass_itself_##pass_type##_ __attribute__((unused)) = \ + static int use_pass_itself_##pass_type##_ __UNUSED__() = \ TouchPassRegistrar_##pass_type() } // namespace ir diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 45fc36c706..35f872ec00 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -153,11 +153,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { // The profile has a process-wide mutex, results in serious performance issue // in concurrency scenerio. Here use an `if` to fix this issue. // Please not remove the `if`, ask @Superjomn if there are any concern. +#ifndef _WIN32 if (platform::IsProfileEnabled()) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::RecordEvent record_event(Type(), pool.Get(place)); RunImpl(scope, place); - } else { + } else +#endif + { RunImpl(scope, place); } VLOG(3) << place << " " << DebugStringEx(&scope); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index dbbe8bcba6..39d3691471 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -30,7 +30,11 @@ if (WITH_GPU AND TENSORRT_FOUND) endif() # Create static library +if(WIN32) +sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) +else() cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) +endif() if(NOT APPLE) # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. 
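To make the WIN32 branch above concrete: sep_library() accepts the same SRCS/DEPS signature as cc_library(), but with the dummy_limit of 50 set in cmake/generic.cmake it folds a long dependency list into intermediate static libraries so that no single lib.exe invocation exceeds the Windows command-line length limit. A hypothetical call and the targets it would generate (all names illustrative):

# 120 dependencies -> three intermediate archives of roughly 50 deps each
sep_library(big_lib STATIC DEPS ${my_120_deps})
# produces: big_lib_dummy_lib_1, big_lib_dummy_lib_2, big_lib_dummy_lib_3,
#           then big_lib itself, linking only the three dummy libraries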
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 5151e2b69a..fe96d8604c 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -126,7 +126,11 @@ T &GetFromScope(const framework::Scope &scope, const std::string &name) { static void ExecShellCommand(const std::string &cmd, std::string *message) { char buffer[128]; +#if !defined(_WIN32) std::shared_ptr pipe(popen(cmd.c_str(), "r"), pclose); +#else + std::shared_ptr pipe(_popen(cmd.c_str(), "r"), _pclose); +#endif // _WIN32 if (!pipe) { LOG(ERROR) << "error running command: " << cmd; return; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index d06ab8f8c8..a576ab13df 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -75,6 +75,10 @@ bool NativePaddlePredictor::Init( } #endif + // windows has no support for openblas multi-thread +#ifdef _WIN32 + FLAGS_paddle_num_threads = 1; +#endif // no matter with or without MKLDNN paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index e46dc13269..83910585b7 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -15,8 +15,12 @@ #pragma once #include +#if !defined(_WIN32) #include +#else +#endif #include // NOLINT +#include #include #include #include diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 919ad96f7a..2ecbdbdbbe 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -86,7 +86,8 @@ function(op_library TARGET) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. 
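One CMake subtlety in the blacklist below: return() inside a function only leaves the current op_library() call, so a listed operator is skipped without aborting the configure, and anything that later links ${GLOB_OP_LIB} simply never sees it. The skip idiom in isolation (foo_op/bar_op are illustrative):

function(op_library TARGET)
    foreach(skipped_op "foo_op" "bar_op")
        if ("${TARGET}" STREQUAL "${skipped_op}")
            return()   # drop this operator on Windows, keep configuring
        endif()
    endforeach()
    # ... normal operator registration would continue here
endfunction()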
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op" "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op" - "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") + "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op" + "fusion_seqexpand_concat_fc_op" "attention_lstm_op" "fused_embedding_fc_lstm_op" "fc_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() @@ -301,8 +302,10 @@ op_library(flatten_op DEPS reshape_op) op_library(sequence_pad_op DEPS sequence_padding) op_library(unstack_op DEPS stack_op) op_library(fake_quantize_op DEPS memory) +if (NOT WIN32) op_library(crf_decoding_op DEPS jit_kernel) op_library(fusion_lstm_op DEPS jit_kernel) +endif(NOT WIN32) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(layer_norm_op DEPS cub) diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index 7c84a9d813..a6933a16df 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -111,6 +111,17 @@ class RowwiseTransformIterator return *this; } + RowwiseTransformIterator &operator+(int n) { + while(n-- > 0) { + ++i_; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + } + + return *this; + } + bool operator==(const RowwiseTransformIterator &rhs) const { return (ptr_ + i_) == &(*rhs); @@ -149,6 +160,21 @@ class MidWiseTransformIterator return *this; } + MidWiseTransformIterator &operator+(int n) { + while(n-- > 0) { + ++j_; + if (UNLIKELY(j_ == post_)) { + ++i_; + j_ = 0; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + } + } + + return *this; + } + bool operator==(const MidWiseTransformIterator &rhs) const { return (ptr_ + i_) == &(*rhs); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 17b675fba8..77802dd102 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -75,7 +75,9 @@ if(WITH_GPU) endif() cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) -cc_library(jit_kernel +if (NOT WIN32) +cc_library(jit_kernel SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc DEPS cpu_info cblas) cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) +endif() diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index ab91ca5345..1cc5a3d49f 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -137,7 +137,9 @@ void InitGLOG(const std::string &prog_name) { // glog will not hold the ARGV[0] inside. // Use strdup to alloc a new string. google::InitGoogleLogging(strdup(prog_name.c_str())); +#ifndef _WIN32 google::InstallFailureSignalHandler(); +#endif } } // namespace framework diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 115abb98d5..abab202c59 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#ifndef _WIN32 #pragma once #include @@ -149,3 +150,4 @@ struct NCCLContextMap { } // namespace platform } // namespace paddle +#endif \ No newline at end of file diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h index dc9fad29f2..148e1ae6eb 100644 --- a/paddle/fluid/platform/variant.h +++ b/paddle/fluid/platform/variant.h @@ -42,3 +42,11 @@ limitations under the License. */ #include #include #include + +// some platform-independent defintion +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) +#define __UNUSED__() +#define __builtin_expect(EXP, C) (EXP) +#else +#define __UNUSED__() __attribute__((unused)) +#endif \ No newline at end of file diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index e7f634c4a6..572b1a4f04 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -2,8 +2,8 @@ set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder) set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc) if(NOT WIN32) -list(APPEND PYBIND_DEPS parallel_executor profiler) -list(APPEND PYBIND_SRCS recordio.cc) + list(APPEND PYBIND_DEPS parallel_executor profiler) + list(APPEND PYBIND_SRCS recordio.cc) endif() if(WITH_PYTHON) if(WITH_AMD_GPU) @@ -21,5 +21,9 @@ if(WITH_PYTHON) endif(NOT APPLE AND NOT ANDROID AND NOT WIN32) endif(WITH_AMD_GPU) + if(WIN32) + target_link_libraries(paddle_pybind shlwapi) + endif(WIN32) + cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python) endif(WITH_PYTHON) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 5f15a29f4c..9dbb2928d3 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -21,6 +21,13 @@ limitations under the License. */ #include #include +#if defined(_WIN32) +#define NOMINMAX +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#define GOOGLE_GLOG_DLL_DECL +#include +#endif + #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/framework.pb.h" @@ -29,7 +36,9 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" +#ifndef _WIN32 #include "paddle/fluid/framework/parallel_executor.h" +#endif #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" @@ -50,7 +59,9 @@ limitations under the License. */ #include "paddle/fluid/string/to_string.h" #ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif #include "paddle/fluid/platform/cuda_profiler.h" #include "paddle/fluid/platform/gpu_info.h" #endif @@ -340,22 +351,25 @@ All parameter, weight, gradient are variables in Paddle. 
.def("get_lod_tensor_array", [](Variable &self) { return self.GetMutable(); }, py::return_value_policy::reference) -#ifdef PADDLE_WITH_CUDA - .def("get_communicator", +#if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32)) + .def("get_communicator", [](Variable &self) -> platform::Communicator * { return self.GetMutable(); }, py::return_value_policy::reference) -#endif .def("get_reader", [](Variable &self) -> framework::ReaderHolder * { PADDLE_ENFORCE(self.IsType()); return self.GetMutable(); }, - py::return_value_policy::reference); + py::return_value_policy::reference) +#endif +; +#if !defined(_WIN32) py::class_(m, "Reader", "") .def("reset", &framework::ReaderHolder::ResetAll); +#endif using LoDTensorBlockingQueue = ::paddle::operators::reader::LoDTensorBlockingQueue; @@ -480,7 +494,7 @@ All parameter, weight, gradient are variables in Paddle. #endif });; // clang-format on -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32)) py::class_(m, "Communicator").def(py::init<>()); #endif py::class_(m, "CUDAPlace") @@ -617,11 +631,14 @@ All parameter, weight, gradient are variables in Paddle. #ifdef PADDLE_WITH_CUDA m.def("get_cuda_device_count", platform::GetCUDADeviceCount); +#ifndef _WIN32 m.def("nvprof_init", platform::CudaProfilerInit); m.def("nvprof_start", platform::CudaProfilerStart); m.def("nvprof_stop", platform::CudaProfilerStop); #endif +#endif +#ifndef _WIN32 py::enum_(m, "ProfilerState", py::arithmetic()) .value("kDisabled", platform::ProfilerState::kDisabled) .value("kCPU", platform::ProfilerState::kCPU) @@ -642,6 +659,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("disable_profiler", platform::DisableProfiler); m.def("is_profiler_enabled", platform::IsProfileEnabled); m.def("reset_profiler", platform::ResetProfiler); +#endif py::class_> pass(m, "Pass"); pass.def(py::init()) @@ -670,6 +688,7 @@ All parameter, weight, gradient are variables in Paddle. .def("remove_pass", [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); +#ifndef _WIN32 // -- python binds for parallel executor. py::class_ pe(m, "ParallelExecutor"); py::class_ exec_strategy(pe, "ExecutionStrategy", R"DOC( @@ -864,6 +883,7 @@ All parameter, weight, gradient are variables in Paddle. 
}); BindRecordIOWriter(&m); +#endif return m.ptr(); } } // namespace pybind diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 0d29f2ad20..6994d47ff6 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -46,22 +46,39 @@ endif() configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py) -set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so) +IF(WIN32) + # Python would use the .pyd by default under Windows series platform + set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.pyd) +ELSE() + set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so) +ENDIF() add_custom_command(OUTPUT ${FLUID_CORE} COMMAND cmake -E copy $ ${FLUID_CORE} DEPENDS paddle_pybind) add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE}) - -add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp - COMMAND touch stub.cc - COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python - COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ - COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel - COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp - COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python - COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib.* ${PADDLE_PYTHON_BUILD_DIR}/lib-python - DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) +IF(WIN32) + add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp +# COMMAND ${CMAKE_COMMAND} -E touch stub.cc + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ + COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel + COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp + COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python +# COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python + DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) +ELSE(WIN32) + add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp + COMMAND touch stub.cc + COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python + COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ + COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel + COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp + COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python + DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) +ENDIF() set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS}) if(NOT WITH_FLUID_ONLY) diff --git a/python/setup.py.in b/python/setup.py.in index b376be0ea3..9dad434893 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -9,7 +9,7 @@ 
class BinaryDistribution(Distribution): RC = 0 - +ext_name = '.dll' if os.name == 'nt' else '.so' def git_commit(): try: @@ -136,10 +136,13 @@ if '${WITH_FLUID_ONLY}'== 'OFF': '${PADDLE_BINARY_DIR}/paddle/legacy/pserver/paddle_pserver_main', '${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] -package_data={'paddle.fluid': ['core.so']} +package_data={'paddle.fluid': ['core' + (ext_name if os.name != 'nt' else '.pyd')]} +if os.name == 'nt': + package_data['paddle.fluid'] += ['openblas' + ext_name] + if '${WITH_FLUID_ONLY}'== 'OFF': - package_data['paddle.v2.master']=['libpaddle_master.so'] - package_data['py_paddle']=['*.py','_swig_paddle.so'] + package_data['paddle.v2.master']=['libpaddle_master' + ext_name] + package_data['py_paddle']=['*.py','_swig_paddle' + + ext_name] package_dir={ '': '${PADDLE_BINARY_DIR}/python', @@ -153,13 +156,15 @@ if '${WITH_FLUID_ONLY}'== 'OFF': package_dir['py_paddle']='${PADDLE_BINARY_DIR}/python/py_paddle' # put all thirdparty libraries in paddle.libs -package_data['paddle.libs']=['libwarpctc.so'] libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' -shutil.copy('${WARPCTC_LIBRARIES}', libs_path) +if os.name != 'nt': + package_data['paddle.libs']= [] + package_data['paddle.libs']=['libwarpctc' + ext_name] + shutil.copy('${WARPCTC_LIBRARIES}', libs_path) if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_LIB}', libs_path) shutil.copy('${MKLML_IOMP_LIB}', libs_path) - package_data['paddle.libs']+=['libmklml_intel.so','libiomp5.so'] + package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name] if '${CMAKE_BUILD_TYPE}' == 'Release': # only change rpath in Release mode. if '${WITH_MKLDNN}' == 'ON': @@ -183,21 +188,29 @@ package_dir['paddle.libs']=libs_path # core.so is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries. # This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213 if '${CMAKE_BUILD_TYPE}' == 'Release': - # only change rpath in Release mode, since in Debug mode, core.so is too large to be changed. - if "@APPLE@" == "1": - command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" - else: - command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" - if os.system(command) != 0: - raise Exception("patch core.so failed, command: %s" % command) - if '${WITH_FLUID_ONLY}'== 'OFF': - # change rpath of _swig_paddle.so. + if os.name != 'nt': + # only change rpath in Release mode, since in Debug mode, core.so is too large to be changed. if "@APPLE@" == "1": - command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" + command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core" + ext_name else: - command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" + command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core" + ext_name if os.system(command) != 0: - raise Exception("patch _swig_paddle.so failed, command: %s" % command) + raise Exception("patch core.so failed, command: %s" % command) + if '${WITH_FLUID_ONLY}'== 'OFF': + # change rpath of _swig_paddle.so. 
+ if "@APPLE@" == "1": + command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle" + ext_name + else: + command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle" + ext_name + if os.system(command) != 0: + raise Exception("patch _swig_paddle.so failed, command: %s" % command) + +if os.name == 'nt': + # fix the path separator under windows + fix_package_dir = {} + for k, v in package_dir.items(): + fix_package_dir[k] = v.replace('/', '\\') + package_dir = fix_package_dir setup(name='${PACKAGE_NAME}', version='${PADDLE_VERSION}', From 71d7980f69ff09ab10ef55b8667ba26067d1c033 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 5 Nov 2018 21:06:57 +0800 Subject: [PATCH 02/50] fix build issue 1 --- paddle/fluid/CMakeLists.txt | 6 ++--- paddle/fluid/framework/garbage_collector.h | 2 +- paddle/fluid/inference/CMakeLists.txt | 25 +++++++++++++------ .../fluid/inference/analysis/CMakeLists.txt | 4 +++ .../detection/roi_perspective_transform_op.cu | 8 ++++-- .../fluid/operators/math/sequence_pooling.cu | 5 ++++ .../fluid/platform/stream_callback_manager.h | 2 +- paddle/fluid/pybind/CMakeLists.txt | 2 +- python/CMakeLists.txt | 8 +++--- python/requirements.txt | 2 +- python/setup.py.in | 4 ++- 11 files changed, 45 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 528d627728..abadda3adb 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -9,9 +9,7 @@ if (NOT WIN32) add_subdirectory(recordio) endif(NOT WIN32) -if(WITH_INFERENCE) - # NOTE: please add subdirectory inference at last. - add_subdirectory(inference) -endif() +# NOTE: please add subdirectory inference at last. 
+add_subdirectory(inference) add_subdirectory(train) diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index b403252c97..818b3334ea 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -29,7 +29,7 @@ template class GarbageCollector { public: GarbageCollector(const platform::Place &place, size_t max_memory_size) - : max_memory_size_(std::max(max_memory_size, static_cast(1))) { + : max_memory_size_((std::max)(max_memory_size, static_cast(1))) { garbages_.reset(new std::deque()); dev_ctx_ = platform::DeviceContextPool::Instance().Get(place); } diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 39d3691471..921bca77e9 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -15,7 +15,11 @@ cc_library(paddle_fluid_api get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) # paddle_fluid_origin exclude inference api interface -cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) +if(WIN32) + sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) +else(WIN32) + cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) +endif(WIN32) add_subdirectory(api) @@ -31,10 +35,10 @@ endif() # Create static library if(WIN32) -sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) -else() -cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) -endif() + sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) +else(WIN32) + cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) +endif(WIN32) if(NOT APPLE) # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. @@ -43,11 +47,16 @@ if(NOT APPLE) endif() # Create shared library -cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} - DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array) +if(WIN32) + sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} + DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array) +else(WIN32) + cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} + DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array) +endif() set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) -if(NOT APPLE) +if(NOT APPLE AND NOT WIN32) # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac. 
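Two MSVC quirks meet in the hunks above and below: windows.h defines min and max as macros, which is why garbage_collector.h now writes (std::max)(...), where the extra parentheses suppress macro expansion; and the MSVC linker understands no --version-script, so the flag below is now limited to builds that are neither APPLE nor WIN32. The macro clash can also be switched off project-wide; a sketch of that alternative (the patch instead defines NOMINMAX locally in pybind.cc):

if(WIN32)
    add_definitions(-DNOMINMAX)   # stop windows.h from defining min()/max()
endif()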
set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map") set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index d4d2fd4634..10b97e992e 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -20,6 +20,10 @@ cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid) +if(WIN32) + target_link_libraries(inference_analyzer shlwapi) +endif(WIN32) + function (inference_analysis_test TARGET) if(WITH_TESTING) set(options "") diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index c82930cc49..862d664d42 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -15,6 +15,10 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using paddle::platform::float16; namespace paddle { namespace operators { @@ -31,12 +35,12 @@ namespace operators { template __device__ bool GT_E(T a, T b) { - return (a > b) || fabs(a - b) < 1e-4; + return (a > b) || Eigen::numext::abs(a - b) < 1e-4; } template __device__ bool LT_E(T a, T b) { - return (a < b) || fabs(a - b) < 1e-4; + return (a < b) || Eigen::numext::abs(a - b) < 1e-4; } template diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index 0015fafbc8..e468cd23e8 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -21,7 +21,12 @@ namespace paddle { namespace operators { namespace math { +#if defined(__FLT_MAX__) #define FLT_MAX __FLT_MAX__ +#else +#include +#include +#endif template struct MaxPoolFunctor { diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 6c984065aa..5f10137dcf 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -18,8 +18,8 @@ #include #include #include -#include "ThreadPool.h" #include "paddle/fluid/platform/enforce.h" +#include "third_party/threadpool/src/extern_threadpool/ThreadPool.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 572b1a4f04..a4baa37c32 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -4,7 +4,7 @@ set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc) if(NOT WIN32) list(APPEND PYBIND_DEPS parallel_executor profiler) list(APPEND PYBIND_SRCS recordio.cc) -endif() +endif(NOT WIN32) if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 6994d47ff6..391094b5b2 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -60,13 +60,13 @@ add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE}) IF(WIN32) add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp # COMMAND ${CMAKE_COMMAND} -E touch stub.cc - COMMAND ${CMAKE_COMMAND} -E 
make_directory ${PADDLE_BINARY_DIR}/python/paddle - COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle - COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python -# COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_PYTHON_BUILD_DIR}/libs ${PADDLE_PYTHON_BUILD_DIR}/lib-python DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) ELSE(WIN32) add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp diff --git a/python/requirements.txt b/python/requirements.txt index 84cf440397..7a24dd519a 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,7 +1,7 @@ requests==2.9.2 numpy>=1.12,<=1.14 #TODO:change to ">=1.12" when numpy fix bug in 1.15 and higher version protobuf==3.1 -recordio>=0.1.0 +recordio>=0.1.0; sys_platform != 'win32' matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib rarfile scipy>=0.19.0 diff --git a/python/setup.py.in b/python/setup.py.in index 9dad434893..c442055208 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -205,19 +205,21 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': if os.system(command) != 0: raise Exception("patch _swig_paddle.so failed, command: %s" % command) +ext_modules = [Extension('_foo', ['stub.cc'])] if os.name == 'nt': # fix the path separator under windows fix_package_dir = {} for k, v in package_dir.items(): fix_package_dir[k] = v.replace('/', '\\') package_dir = fix_package_dir + ext_modules = [] setup(name='${PACKAGE_NAME}', version='${PADDLE_VERSION}', description='Parallel Distributed Deep Learning', install_requires=setup_requires, packages=packages, - ext_modules=[Extension('_foo', ['stub.cc'])], + ext_modules=ext_modules, package_data=package_data, package_dir=package_dir, scripts=paddle_bins From 1f12ba61927c292993af066dd5930e613734ba52 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 6 Nov 2018 14:38:54 +0800 Subject: [PATCH 03/50] gpu support, fix build issue: 1. Non utf-8 characters within comments of OPs may lead to protobuf fail to parse_from_string 2. comment out some ops which not supported on windows 3. 
cuda libs may not be correctly linked to target on windows --- cmake/cuda.cmake | 3 + paddle/fluid/inference/CMakeLists.txt | 10 + .../fluid/operators/pad_constant_like_op.cc | 2 +- paddle/fluid/operators/roi_pool_op.cc | 2 +- paddle/fluid/operators/unpool_op.cc | 4 +- paddle/fluid/pybind/CMakeLists.txt | 4 + python/CMakeLists.txt | 5 +- python/paddle/fluid/__init__.py | 21 +- python/paddle/fluid/contrib/inferencer.py | 4 +- python/paddle/fluid/contrib/trainer.py | 3 +- python/paddle/fluid/framework.py | 2 +- python/paddle/fluid/layers/io.py | 124 +++--- python/paddle/fluid/layers/nn.py | 363 +++++++++--------- python/paddle/fluid/layers/ops.py | 34 +- python/setup.py.in | 3 +- 15 files changed, 311 insertions(+), 273 deletions(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index f507bb41a1..1cc882cce7 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -157,6 +157,9 @@ list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) if(NOT WITH_DSO) # TODO(panyx0718): CUPTI only allows DSO? list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY}) + if(WIN32) + set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY}) + endif(WIN32) endif(NOT WITH_DSO) # setting nvcc arch flags diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 921bca77e9..c8a950fce0 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -13,10 +13,14 @@ cc_library(paddle_fluid_api DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) +get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) # paddle_fluid_origin exclude inference api interface if(WIN32) sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) + if(WITH_GPU AND NOT WITH_DSO) + target_link_libraries(paddle_fluid_origin ${cuda_modules}) + endif(WITH_GPU AND NOT WITH_DSO) else(WIN32) cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) endif(WIN32) @@ -36,6 +40,9 @@ endif() # Create static library if(WIN32) sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) + if(WITH_GPU AND NOT WITH_DSO) + target_link_libraries(paddle_fluid ${cuda_modules}) + endif(WITH_GPU AND NOT WITH_DSO) else(WIN32) cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) endif(WIN32) @@ -50,6 +57,9 @@ endif() if(WIN32) sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array) + if(WITH_GPU AND NOT WITH_DSO) + target_link_libraries(paddle_fluid_shared ${cuda_modules}) + endif(WITH_GPU AND NOT WITH_DSO) else(WIN32) cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array) diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index 37646c7b4c..685ebc3937 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -74,7 +74,7 @@ PadConstantLikeOp Operator. Pad input(Y) with a pad_value, the number of values padded to the edges of each axis is specified by the difference of the shape of X and Y. -((0, shape_x_0 - shape_y_0), … (0, shape_x_n - shape_y_n)) unique pad widths for +((0, shape_x_0 - shape_y_0), ...
(0, shape_x_n - shape_y_n)) unique pad widths for each axis. The input should be a k-D tensor(k > 0 and k < 7). As an example: diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index 8e29761ec2..043ea680d1 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -122,7 +122,7 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor), " "Argmaxes corresponding to indices in X used " "for gradient computation. Only output " - "if arg “is_test” is false.") + "if arg \"is_test\" is false.") .AsIntermediate(); AddAttr("spatial_scale", "(float, default 1.0), " diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc index 1d441b43b1..6d2ccb38f6 100644 --- a/paddle/fluid/operators/unpool_op.cc +++ b/paddle/fluid/operators/unpool_op.cc @@ -57,8 +57,8 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { Input shape is: $(N, C_{in}, H_{in}, W_{in})$, Output shape is: $(N, C_{out}, H_{out}, W_{out})$, where $$ -H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\ -W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1] +H_{out} = (H_{in}-1) * strides[0] - 2 * paddings[0] + ksize[0] \\ +W_{out} = (W_{in}-1) * strides[1] - 2 * paddings[1] + ksize[1] $$ Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf )DOC"); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index a4baa37c32..6afa53cd36 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -22,6 +22,10 @@ if(WITH_PYTHON) endif(WITH_AMD_GPU) if(WIN32) + if(WITH_GPU AND NOT WITH_DSO) + get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) + target_link_libraries(paddle_pybind ${cuda_modules}) + endif(WITH_GPU AND NOT WITH_DSO) target_link_libraries(paddle_pybind shlwapi) endif(WIN32) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 391094b5b2..879d4d6bf9 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -61,12 +61,13 @@ IF(WIN32) add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp # COMMAND ${CMAKE_COMMAND} -E touch stub.cc COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/libs COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp - COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python - COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_PYTHON_BUILD_DIR}/libs ${PADDLE_PYTHON_BUILD_DIR}/lib-python +# COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/libs +# COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_PYTHON_BUILD_DIR}/libs ${PADDLE_PYTHON_BUILD_DIR}/libs DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) ELSE(WIN32) add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 737c8be814..70c1f95899 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -13,6 +13,7 @@ # 
limitations under the License. from __future__ import print_function +import os # import all class inside framework into fluid module from . import framework from .framework import * @@ -43,16 +44,17 @@ from .lod_tensor import create_lod_tensor, create_random_int_lodtensor from . import clip from . import profiler from . import unique_name -from . import recordio_writer -from . import parallel_executor -from .parallel_executor import * +if os.name != 'nt': + from . import recordio_writer + from . import parallel_executor + from .parallel_executor import * from paddle.fluid.layers.math_op_patch import monkey_patch_variable Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + \ trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ - parallel_executor.__all__ + lod_tensor.__all__ + [ + lod_tensor.__all__ + [ 'io', 'initializer', 'layers', @@ -78,7 +80,8 @@ __all__ = framework.__all__ + executor.__all__ + \ 'recordio_writer', 'Scope', ] - +if os.name != 'nt': + __all__ += parallel_executor.__all__ def __bootstrap__(): """ @@ -110,12 +113,16 @@ def __bootstrap__(): os.environ['OMP_NUM_THREADS'] = str(num_threads) read_env_flags = [ - 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', + 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', - 'dist_threadpool_size', 'cpu_deterministic', 'eager_delete_tensor_gb', + 'dist_threadpool_size', 'eager_delete_tensor_gb', 'reader_queue_speed_test_mode' ] + if os.name != 'nt': + read_env_flags.append('warpctc_dir') + read_env_flags.append('cpu_deterministic') + if core.is_compiled_with_dist(): read_env_flags.append('rpc_deadline') read_env_flags.append('rpc_server_profile_period') diff --git a/python/paddle/fluid/contrib/inferencer.py b/python/paddle/fluid/contrib/inferencer.py index b8d5f4ffea..b966ae01d0 100644 --- a/python/paddle/fluid/contrib/inferencer.py +++ b/python/paddle/fluid/contrib/inferencer.py @@ -15,13 +15,15 @@ from __future__ import print_function import contextlib +import os from .. import core from .. import executor from .. import framework from .. import io -from .. import parallel_executor +if os.name != 'nt': + from .. import parallel_executor from .. import unique_name from .trainer import check_and_get_place diff --git a/python/paddle/fluid/contrib/trainer.py b/python/paddle/fluid/contrib/trainer.py index 8569e486f9..096821a5ba 100644 --- a/python/paddle/fluid/contrib/trainer.py +++ b/python/paddle/fluid/contrib/trainer.py @@ -28,7 +28,8 @@ from .. import framework from .. import io # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module from .. import optimizer as opt_module -from .. import parallel_executor +if os.name != 'nt': + from .. 
import parallel_executor from ..transpiler import distribute_transpiler __all__ = [ diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index fd03dff386..0282ffec16 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -536,7 +536,7 @@ class Operator(object): OP_WITHOUT_KERNEL_SET = { 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', - 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', + 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', 'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id' } diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 95e13669ad..e2d304dc86 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -15,6 +15,7 @@ from __future__ import print_function import contextlib import multiprocessing +import os import six import threading @@ -344,70 +345,71 @@ def _copy_reader_create_op_(block, op): return new_op -@templatedoc(op_type='create_recordio_file_reader') -def open_recordio_file(filename, - shapes, - lod_levels, - dtypes, - pass_num=1, - for_parallel=True): - """ - ${comment} - - Args: - filename(${filename_type}): ${filename_comment}. - shapes(list): List of tuples which declaring data shapes. - lod_levels(${lod_levels_type}): ${lod_levels_comment}. - dtypes(list): List of strs which declaring data type. - pass_num(int): Number of passes to run. - for_parallel(Bool): Set it as True if you are going to run - subsequent operators in parallel. - - Returns: - ${out_comment}. - - Examples: - - >>> import paddle.fluid as fluid - >>> reader = fluid.layers.io.open_recordio_file( - >>> filename='./data.recordio', - >>> shapes=[(3,224,224), (1)], - >>> lod_levels=[0, 0], - >>> dtypes=['float32', 'int64']) - >>> # Via the reader, we can use 'read_file' layer to get data: - >>> image, label = fluid.layers.io.read_file(reader) - """ - dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] - shape_concat = [] - ranks = [] - - for shape in shapes: - shape_concat.extend(shape) - ranks.append(len(shape)) - - var_name = unique_name('open_recordio_file') - - startup_blk = default_startup_program().current_block() - startup_var = startup_blk.create_var(name=var_name) - startup_blk.append_op( - type='create_recordio_file_reader', - outputs={'Out': [startup_var]}, - attrs={ - 'shape_concat': shape_concat, - 'lod_levels': lod_levels, - 'filename': filename, - 'ranks': ranks - }) +if os.name != 'nt': + @templatedoc(op_type='create_recordio_file_reader') + def open_recordio_file(filename, + shapes, + lod_levels, + dtypes, + pass_num=1, + for_parallel=True): + """ + ${comment} + + Args: + filename(${filename_type}): ${filename_comment}. + shapes(list): List of tuples which declare data shapes. + lod_levels(${lod_levels_type}): ${lod_levels_comment}. + dtypes(list): List of strs which declare data type. + pass_num(int): Number of passes to run. + for_parallel(Bool): Set it as True if you are going to run + subsequent operators in parallel. + + Returns: + ${out_comment}.
+ + Examples: + + >>> import paddle.fluid as fluid + >>> reader = fluid.layers.io.open_recordio_file( + >>> filename='./data.recordio', + >>> shapes=[(3,224,224), (1)], + >>> lod_levels=[0, 0], + >>> dtypes=['float32', 'int64']) + >>> # Via the reader, we can use 'read_file' layer to get data: + >>> image, label = fluid.layers.io.read_file(reader) + """ + dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] + shape_concat = [] + ranks = [] + + for shape in shapes: + shape_concat.extend(shape) + ranks.append(len(shape)) + + var_name = unique_name('open_recordio_file') + + startup_blk = default_startup_program().current_block() + startup_var = startup_blk.create_var(name=var_name) + startup_blk.append_op( + type='create_recordio_file_reader', + outputs={'Out': [startup_var]}, + attrs={ + 'shape_concat': shape_concat, + 'lod_levels': lod_levels, + 'filename': filename, + 'ranks': ranks + }) - startup_var.desc.set_dtypes(dtypes) - startup_var.persistable = True - main_prog_var = _copy_reader_var_(default_main_program().current_block(), - startup_var) + startup_var.desc.set_dtypes(dtypes) + startup_var.persistable = True + main_prog_var = _copy_reader_var_(default_main_program().current_block(), + startup_var) - if pass_num > 1: - main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) + if pass_num > 1: + main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) - return monkey_patch_reader_methods(main_prog_var) + return monkey_patch_reader_methods(main_prog_var) def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 110e6d5ab2..d201357e6f 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -18,6 +18,7 @@ All layers just related to the neural network. from __future__ import print_function import numpy as np +import os from ..layer_helper import LayerHelper from ..initializer import Normal, Constant from ..framework import Variable, OpProtoHolder @@ -31,12 +32,10 @@ from functools import reduce __all__ = [ 'fc', 'embedding', - 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru', 'gru_unit', 'linear_chain_crf', - 'crf_decoding', 'cos_sim', 'cross_entropy', 'square_error_cost', @@ -95,7 +94,6 @@ __all__ = [ 'pad', 'pad_constant_like', 'label_smooth', - 'roi_pool', 'roi_align', 'dice_loss', 'image_resize', @@ -160,6 +158,10 @@ __all__ = [ 'log_loss', 'add_position_encoding', ] +if os.name != 'nt': + __all__.append('dynamic_lstm') + __all__.append('crf_decoding') + __all__.append('roi_pool') def fc(input, @@ -334,126 +336,127 @@ def embedding(input, return tmp -@templatedoc(op_type="lstm") -def dynamic_lstm(input, - size, - h_0=None, - c_0=None, - param_attr=None, - bias_attr=None, - use_peepholes=True, - is_reverse=False, - gate_activation='sigmoid', - cell_activation='tanh', - candidate_activation='tanh', - dtype='float32', - name=None): - """ - ${comment} - - Args: - input (Variable): ${input_comment} - size (int): 4 * hidden size. - h_0(Variable): The initial hidden state is an optional input, default is zero. - This is a tensor with shape (N x D), where N is the - batch size and D is the hidden size. - c_0(Variable): The initial cell state is an optional input, default is zero. - This is a tensor with shape (N x D), where N is the - batch size. `h_0` and `c_0` can be NULL but only at the same time. - param_attr(ParamAttr|None): The parameter attribute for the learnable - hidden-hidden weights. 
- - - Weights = {:math:`W_{ch}, W_{ih}, \ - W_{fh}, W_{oh}`} - - The shape is (D x 4D), where D is the hidden - size. - - If it is set to None or one attribute of ParamAttr, - dynamic_lstm will create ParamAttr as param_attr. - If the Initializer of the param_attr is not set, the - parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|None): The bias attribute for the learnable bias - weights, which contains two parts, input-hidden - bias weights and peephole connections weights if - setting `use_peepholes` to `True`. - - 1. `use_peepholes = False` - - Biases = {:math:`b_c, b_i, b_f, b_o`}. - - The shape is (1 x 4D). - 2. `use_peepholes = True` - - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ - W_{fc}, W_{oc}`}. - - The shape is (1 x 7D). - - If it is set to None or one attribute of ParamAttr, - dynamic_lstm will create ParamAttr as bias_attr. - If the Initializer of the bias_attr is not set, - the bias is initialized zero. Default: None. - use_peepholes (bool): ${use_peepholes_comment} - is_reverse (bool): ${is_reverse_comment} - gate_activation (str): ${gate_activation_comment} - cell_activation (str): ${cell_activation_comment} - candidate_activation (str): ${candidate_activation_comment} - dtype (str): Data type. Choices = ["float32", "float64"], default "float32". - name (str|None): A name for this layer(optional). If set None, the layer - will be named automatically. - - Returns: - tuple: The hidden state, and cell state of LSTM. The shape of both \ - is (T x D), and lod is the same with the `input`. - - Examples: - .. code-block:: python - - hidden_dim = 512 - forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, - bias_attr=False) - forward, _ = fluid.layers.dynamic_lstm( - input=forward_proj, size=hidden_dim * 4, use_peepholes=False) - """ - assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp." - helper = LayerHelper('lstm', **locals()) - size = size // 4 - weight = helper.create_parameter( - attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype) - bias_size = [1, 7 * size] - if not use_peepholes: - bias_size[1] = 4 * size - bias = helper.create_parameter( - attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) +if os.name != 'nt': + @templatedoc(op_type="lstm") + def dynamic_lstm(input, + size, + h_0=None, + c_0=None, + param_attr=None, + bias_attr=None, + use_peepholes=True, + is_reverse=False, + gate_activation='sigmoid', + cell_activation='tanh', + candidate_activation='tanh', + dtype='float32', + name=None): + """ + ${comment} + + Args: + input (Variable): ${input_comment} + size (int): 4 * hidden size. + h_0(Variable): The initial hidden state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size and D is the hidden size. + c_0(Variable): The initial cell state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size. `h_0` and `c_0` can be NULL but only at the same time. + param_attr(ParamAttr|None): The parameter attribute for the learnable + hidden-hidden weights. + + - Weights = {:math:`W_{ch}, W_{ih}, \ + W_{fh}, W_{oh}`} + - The shape is (D x 4D), where D is the hidden + size. + + If it is set to None or one attribute of ParamAttr, + dynamic_lstm will create ParamAttr as param_attr. + If the Initializer of the param_attr is not set, the + parameter is initialized with Xavier. Default: None. 
+ bias_attr (ParamAttr|None): The bias attribute for the learnable bias + weights, which contains two parts, input-hidden + bias weights and peephole connections weights if + setting `use_peepholes` to `True`. + + 1. `use_peepholes = False` + - Biases = {:math:`b_c, b_i, b_f, b_o`}. + - The shape is (1 x 4D). + 2. `use_peepholes = True` + - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ + W_{fc}, W_{oc}`}. + - The shape is (1 x 7D). + + If it is set to None or one attribute of ParamAttr, + dynamic_lstm will create ParamAttr as bias_attr. + If the Initializer of the bias_attr is not set, + the bias is initialized zero. Default: None. + use_peepholes (bool): ${use_peepholes_comment} + is_reverse (bool): ${is_reverse_comment} + gate_activation (str): ${gate_activation_comment} + cell_activation (str): ${cell_activation_comment} + candidate_activation (str): ${candidate_activation_comment} + dtype (str): Data type. Choices = ["float32", "float64"], default "float32". + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + tuple: The hidden state, and cell state of LSTM. The shape of both \ + is (T x D), and lod is the same with the `input`. + + Examples: + .. code-block:: python + + hidden_dim = 512 + forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, + bias_attr=False) + forward, _ = fluid.layers.dynamic_lstm( + input=forward_proj, size=hidden_dim * 4, use_peepholes=False) + """ + assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp." + helper = LayerHelper('lstm', **locals()) + size = size // 4 + weight = helper.create_parameter( + attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype) + bias_size = [1, 7 * size] + if not use_peepholes: + bias_size[1] = 4 * size + bias = helper.create_parameter( + attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) - hidden = helper.create_variable_for_type_inference(dtype) - cell = helper.create_variable_for_type_inference(dtype) - batch_gate = helper.create_variable_for_type_inference(dtype) - batch_cell_pre_act = helper.create_variable_for_type_inference(dtype) - inputs = {'Input': input, 'Weight': weight, 'Bias': bias} - batch_size = input.shape[0] - if h_0: - assert h_0.shape == (batch_size, size), \ - 'The shape of h0 should be (batch_size, %d)' % size - inputs['H0'] = h_0 - if c_0: - assert c_0.shape == (batch_size, size), \ - 'The shape of c0 should be (batch_size, %d)' % size - inputs['C0'] = c_0 + hidden = helper.create_variable_for_type_inference(dtype) + cell = helper.create_variable_for_type_inference(dtype) + batch_gate = helper.create_variable_for_type_inference(dtype) + batch_cell_pre_act = helper.create_variable_for_type_inference(dtype) + inputs = {'Input': input, 'Weight': weight, 'Bias': bias} + batch_size = input.shape[0] + if h_0: + assert h_0.shape == (batch_size, size), \ + 'The shape of h0 should be (batch_size, %d)' % size + inputs['H0'] = h_0 + if c_0: + assert c_0.shape == (batch_size, size), \ + 'The shape of c0 should be (batch_size, %d)' % size + inputs['C0'] = c_0 - helper.append_op( - type='lstm', - inputs=inputs, - outputs={ - 'Hidden': hidden, - 'Cell': cell, - 'BatchGate': batch_gate, - 'BatchCellPreAct': batch_cell_pre_act - }, - attrs={ - 'use_peepholes': use_peepholes, - 'is_reverse': is_reverse, - 'gate_activation': gate_activation, - 'cell_activation': cell_activation, - 'candidate_activation': candidate_activation - }) - return hidden, cell + helper.append_op( + type='lstm', + 
inputs=inputs, + outputs={ + 'Hidden': hidden, + 'Cell': cell, + 'BatchGate': batch_gate, + 'BatchCellPreAct': batch_cell_pre_act + }, + attrs={ + 'use_peepholes': use_peepholes, + 'is_reverse': is_reverse, + 'gate_activation': gate_activation, + 'cell_activation': cell_activation, + 'candidate_activation': candidate_activation + }) + return hidden, cell def dynamic_lstmp(input, @@ -923,39 +926,40 @@ def linear_chain_crf(input, label, param_attr=None): return log_likelihood -@templatedoc() -def crf_decoding(input, param_attr, label=None): - """ - ${comment} +if os.name != 'nt': + @templatedoc() + def crf_decoding(input, param_attr, label=None): + """ + ${comment} - Args: - input(${emission_type}): ${emission_comment} + Args: + input(${emission_type}): ${emission_comment} - param_attr(ParamAttr): The parameter attribute for training. + param_attr(ParamAttr): The parameter attribute for training. - label(${label_type}): ${label_comment} + label(${label_type}): ${label_comment} - Returns: - Variable: ${viterbi_path_comment} + Returns: + Variable: ${viterbi_path_comment} - Examples: - .. code-block:: python + Examples: + .. code-block:: python - crf_decode = layers.crf_decoding( - input=hidden, param_attr=ParamAttr(name="crfw")) - """ - helper = LayerHelper('crf_decoding', **locals()) - transition = helper.get_parameter(param_attr.name) - viterbi_path = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) - helper.append_op( - type='crf_decoding', - inputs={"Emission": [input], - "Transition": transition, - "Label": label}, - outputs={"ViterbiPath": [viterbi_path]}) + crf_decode = layers.crf_decoding( + input=hidden, param_attr=ParamAttr(name="crfw")) + """ + helper = LayerHelper('crf_decoding', **locals()) + transition = helper.get_parameter(param_attr.name) + viterbi_path = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + helper.append_op( + type='crf_decoding', + inputs={"Emission": [input], + "Transition": transition, + "Label": label}, + outputs={"ViterbiPath": [viterbi_path]}) - return viterbi_path + return viterbi_path @templatedoc() @@ -5443,42 +5447,43 @@ def label_smooth(label, return smooth_label -@templatedoc() -def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): - """ - ${comment} +if os.name != 'nt': + @templatedoc() + def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): + """ + ${comment} - Args: - input (Variable): ${x_comment} - rois (Variable): ROIs (Regions of Interest) to pool over. - pooled_height (integer): ${pooled_height_comment} Default: 1 - pooled_width (integer): ${pooled_width_comment} Default: 1 - spatial_scale (float): ${spatial_scale_comment} Default: 1.0 + Args: + input (Variable): ${x_comment} + rois (Variable): ROIs (Regions of Interest) to pool over. + pooled_height (integer): ${pooled_height_comment} Default: 1 + pooled_width (integer): ${pooled_width_comment} Default: 1 + spatial_scale (float): ${spatial_scale_comment} Default: 1.0 - Returns: - Variable: ${out_comment}. + Returns: + Variable: ${out_comment}. - Examples: - .. code-block:: python + Examples: + .. 
code-block:: python - pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0) - """ - helper = LayerHelper('roi_pool', **locals()) - dtype = helper.input_dtype() - pool_out = helper.create_variable_for_type_inference(dtype) - argmaxes = helper.create_variable_for_type_inference(dtype='int32') - helper.append_op( - type="roi_pool", - inputs={"X": input, - "ROIs": rois}, - outputs={"Out": pool_out, - "Argmax": argmaxes}, - attrs={ - "pooled_height": pooled_height, - "pooled_width": pooled_width, - "spatial_scale": spatial_scale - }) - return pool_out + pool_out = fluid.layers.roi_pool(input=x, rois=rois, pooled_height=7, pooled_width=7, spatial_scale=1.0) + """ + helper = LayerHelper('roi_pool', **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + argmaxes = helper.create_variable_for_type_inference(dtype='int32') + helper.append_op( + type="roi_pool", + inputs={"X": input, + "ROIs": rois}, + outputs={"Out": pool_out, + "Argmax": argmaxes}, + attrs={ + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "spatial_scale": spatial_scale + }) + return pool_out @templatedoc() diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 1ff40a26f2..df52b7042f 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import print_function +import os from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr from .. import core from ..framework import convert_np_dtype_to_dtype_ @@ -99,27 +100,28 @@ Examples: >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3) """ -__all__ += ['cumsum'] +if os.name != 'nt': + __all__ += ['cumsum'] -_cum_sum_ = generate_layer_fn('cumsum') + _cum_sum_ = generate_layer_fn('cumsum') -def cumsum(x, axis=None, exclusive=None, reverse=None): - locals_var = locals().keys() - kwargs = dict() - for name in locals_var: - val = locals()[name] - if val is not None: - kwargs[name] = val - return _cum_sum_(**kwargs) - + def cumsum(x, axis=None, exclusive=None, reverse=None): + locals_var = locals().keys() + kwargs = dict() + for name in locals_var: + val = locals()[name] + if val is not None: + kwargs[name] = val + return _cum_sum_(**kwargs) -cumsum.__doc__ = _cum_sum_.__doc__ + """ -Examples: >>> data = fluid.layers.data(name="input", shape=[32, 784]) >>> result = fluid.layers.cumsum(data, axis=0) -""" + cumsum.__doc__ = _cum_sum_.__doc__ + """ + Examples: + >>> data = fluid.layers.data(name="input", shape=[32, 784]) + >>> result = fluid.layers.cumsum(data, axis=0) + """ __all__ += ['thresholded_relu'] diff --git a/python/setup.py.in b/python/setup.py.in index c442055208..ce65d0003f 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -180,7 +180,8 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': package_data['paddle.libs']+=['libmkldnn.so.0'] shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) # remove unused paddle/libs/__init__.py -os.remove(libs_path+'/__init__.py') +if os.path.isfile(libs_path+'/__init__.py'): + os.remove(libs_path+'/__init__.py') package_dir['paddle.libs']=libs_path # change rpath of core.so, add $ORIGIN/../libs/ to it.
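A note on the CUDA_MODULES handoff used in the patch above: cmake/cuda.cmake records the statically linked CUDA libraries in a global property, and the inference and pybind targets later read it back with get_property before calling target_link_libraries. A minimal self-contained sketch of that pattern follows (illustrative only; the DEMO_* names and the module list are placeholders, not code from this series):

    # Producer: store a list in a GLOBAL property. Unlike an ordinary CMake
    # variable, a global property is visible from every directory of the build.
    set_property(GLOBAL PROPERTY DEMO_CUDA_MODULES "cudnn;cublas;curand")

    # Consumer: read the list back and link it only on the guarded path,
    # mirroring the if(WITH_GPU AND NOT WITH_DSO) branches in the patch.
    get_property(demo_cuda_modules GLOBAL PROPERTY DEMO_CUDA_MODULES)
    message(STATUS "modules to link: ${demo_cuda_modules}")

Because the handoff does not depend on variable scope, the list set in cmake/cuda.cmake stays readable from paddle/fluid/inference and paddle/fluid/pybind without introducing a cache variable.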
From a37918c31f740b5b6a886bb472ce52f8d4e65659 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 6 Nov 2018 17:28:12 +0800 Subject: [PATCH 04/50] fix python package issue --- paddle/fluid/framework/CMakeLists.txt | 19 ++++++++++----- python/CMakeLists.txt | 33 ++++++++++++++------------- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 8442911406..2bab3a15b1 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -140,16 +140,23 @@ cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) -if (NOT WIN32) py_proto_compile(framework_py_proto SRCS framework.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) -add_custom_command(TARGET framework_py_proto POST_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto - COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ - COMMENT "Copy generated python proto into directory paddle/fluid/proto." - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +if (NOT WIN32) + add_custom_command(TARGET framework_py_proto POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto + COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ + COMMENT "Copy generated python proto into directory paddle/fluid/proto." + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +else(NOT WIN32) + string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/") + add_custom_command(TARGET framework_py_proto POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto + COMMAND copy /Y *.py ${proto_dstpath} + COMMENT "Copy generated python proto into directory paddle/fluid/proto." 
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif(NOT WIN32) cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 879d4d6bf9..139176b0d6 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -45,30 +45,31 @@ endif() configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py) - IF(WIN32) # Python would use the .pyd by default under Windows series platform - set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.pyd) + set(FLUID_DST_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/) + get_filename_component(openblas_refpath ${CBLAS_LIBRARIES} DIRECTORY) + set(FLUID_CORE ${FLUID_DST_DIR}/core.pyd) + add_custom_command(OUTPUT ${FLUID_CORE} + COMMAND cmake -E copy $ ${FLUID_CORE} + COMMAND cmake -E copy ${openblas_refpath}/openblas.dll ${FLUID_DST_DIR} + DEPENDS paddle_pybind) ELSE() set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so) + add_custom_command(OUTPUT ${FLUID_CORE} + COMMAND cmake -E copy $ ${FLUID_CORE} + DEPENDS paddle_pybind) ENDIF() -add_custom_command(OUTPUT ${FLUID_CORE} - COMMAND cmake -E copy $ ${FLUID_CORE} - DEPENDS paddle_pybind) add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE}) IF(WIN32) - add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp -# COMMAND ${CMAKE_COMMAND} -E touch stub.cc - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/libs - COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle - COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ - COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel - COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp -# COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/libs -# COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_PYTHON_BUILD_DIR}/libs ${PADDLE_PYTHON_BUILD_DIR}/libs - DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) + add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle/ + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ + COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel + COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp + COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python + DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) ELSE(WIN32) add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND touch stub.cc From 77892124fb7babd0b1651092958878555764bdbf Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 7 Nov 2018 14:22:07 +0800 Subject: [PATCH 05/50] online configuration --- cmake/external/eigen.cmake | 5 ++--- cmake/external/gflags.cmake | 5 ++--- cmake/external/glog.cmake | 1 - cmake/external/gtest.cmake | 5 ++--- cmake/external/openblas.cmake | 10 ++++------ cmake/external/protobuf.cmake | 5 ++--- cmake/external/zlib.cmake | 5 ++--- 7 files changed, 14 insertions(+), 22 deletions(-) 
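The hunks below point each ExternalProject back at its public upstream instead of the hand-edited local mirror. If switching between a mirror and GitHub ever needs to stay a configure-time choice rather than a source edit, one possible arrangement is a cache variable per dependency; a sketch under that assumption follows (GFLAGS_REPOSITORY is hypothetical, not a variable this series defines):

    # Sketch: override at configure time with, for example,
    #   -DGFLAGS_REPOSITORY=http://admin@localhost:8080/r/gflags.git
    set(GFLAGS_REPOSITORY "https://github.com/gflags/gflags.git"
        CACHE STRING "gflags git repository; point this at a mirror if needed")

    include(ExternalProject)
    ExternalProject_Add(extern_gflags
        GIT_REPOSITORY ${GFLAGS_REPOSITORY}
        GIT_TAG        77592648e3f3be87d6c7123eb81cbad75f9aef5a
        PREFIX         ${GFLAGS_SOURCES_DIR})

The same shape would apply to the eigen3, glog, gtest, openblas, protobuf, and zlib entries touched by this patch.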
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 2aa64a350a..573ad5e5f0 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -29,11 +29,10 @@ else() ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} -# GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" - GIT_REPOSITORY "http://admin@localhost:8080/r/eigen3.git" + GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen -# GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c + GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c PREFIX ${EIGEN_SOURCE_DIR} DOWNLOAD_NAME "eigen" UPDATE_COMMAND "" diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 9c6974b8f0..7a0369b9df 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -28,9 +28,8 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) ExternalProject_Add( extern_gflags ${EXTERNAL_PROJECT_LOG_ARGS} -# GIT_REPOSITORY "https://github.com/gflags/gflags.git" - GIT_REPOSITORY "http://admin@localhost:8080/r/gflags.git" -# GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a + GIT_REPOSITORY "https://github.com/gflags/gflags.git" + GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 84f8127760..ac2f2be83b 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -34,7 +34,6 @@ ELSE() SET(GLOG_REPOSITORY "https://github.com/google/glog.git") SET(GLOG_TAG "v0.3.5") ENDIF() - SET(GLOG_REPOSITORY "http://admin@localhost:8080/r/glog.git") ExternalProject_Add( extern_glog diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 4f5acc92f0..d335298742 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -43,9 +43,8 @@ IF(WITH_TESTING) extern_gtest ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${GTEST_DEPENDS} - # GIT_REPOSITORY "https://github.com/google/googletest.git" - GIT_REPOSITORY "http://admin@localhost:8080/r/gtest.git" -# GIT_TAG "release-1.8.0" + GIT_REPOSITORY "https://github.com/google/googletest.git" + GIT_TAG "release-1.8.0" PREFIX ${GTEST_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 664422813d..2b46936c18 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -76,9 +76,8 @@ IF(NOT ${CBLAS_FOUND}) ExternalProject_Add( extern_openblas ${EXTERNAL_PROJECT_LOG_ARGS} - # GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git - GIT_REPOSITORY http://admin@localhost:8080/r/openblas.git - # GIT_TAG ${OPENBLAS_COMMIT} + GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git + GIT_TAG ${OPENBLAS_COMMIT} PREFIX ${CBLAS_SOURCES_DIR} INSTALL_DIR ${CBLAS_INSTALL_DIR} BUILD_IN_SOURCE 1 @@ -104,9 +103,8 @@ IF(NOT ${CBLAS_FOUND}) ExternalProject_Add( extern_openblas ${EXTERNAL_PROJECT_LOG_ARGS} - # GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git - GIT_REPOSITORY http://admin@localhost:8080/r/openblas.git - # GIT_TAG ${OPENBLAS_COMMIT} + GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git + GIT_TAG ${OPENBLAS_COMMIT} PREFIX ${CBLAS_SOURCES_DIR} INSTALL_DIR ${CBLAS_INSTALL_DIR} BUILD_IN_SOURCE 1 diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 
d4c6ea7819..bb1fcf356f 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -206,9 +206,8 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64") ENDIF() - # SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") - # SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") - SET(PROTOBUF_REPO http://admin@localhost:8080/r/protobuf.git) + SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") + SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") IF(MOBILE_INFERENCE) # The reason why the official version is not used is described in # https://github.com/PaddlePaddle/Paddle/issues/6114 diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index b65f2afbc2..c3d7323545 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -31,9 +31,8 @@ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zl ExternalProject_Add( extern_zlib ${EXTERNAL_PROJECT_LOG_ARGS} - # GIT_REPOSITORY "https://github.com/madler/zlib.git" - GIT_REPOSITORY "http://admin@localhost:8080/r/zlib.git" -# GIT_TAG "v1.2.8" + GIT_REPOSITORY "https://github.com/madler/zlib.git" + GIT_TAG "v1.2.8" PREFIX ${ZLIB_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} From 130cdda65b1b148c5f11a4dac1ee8848658a8587 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 7 Nov 2018 19:14:25 +0800 Subject: [PATCH 06/50] add gpu debug mode --- cmake/cuda.cmake | 8 ++++++-- cmake/cudnn.cmake | 7 ++++++- cmake/external/eigen.cmake | 7 ++++--- cmake/external/gflags.cmake | 2 +- cmake/external/glog.cmake | 2 +- cmake/external/gtest.cmake | 2 +- cmake/external/openblas.cmake | 4 ++-- cmake/external/protobuf.cmake | 2 +- cmake/external/zlib.cmake | 2 +- 9 files changed, 23 insertions(+), 13 deletions(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 1cc882cce7..45a4b13288 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -167,8 +167,12 @@ select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}") -# Set C++11 support -set(CUDA_PROPAGATE_HOST_FLAGS OFF) +if (WIN32) + set(CUDA_PROPAGATE_HOST_FLAGS ON) +else (WIN32) + # Set C++11 support + set(CUDA_PROPAGATE_HOST_FLAGS OFF) +endif (WIN32) # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. 
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index cd51533926..09bec347db 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -2,7 +2,12 @@ if(NOT WITH_GPU) return() endif() -set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT") +if(WIN32) + set(CUDNN_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) +else(WIN32) + set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT") +endif(WIN32) + find_path(CUDNN_INCLUDE_DIR cudnn.h PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE} diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 2aa64a350a..98079678ae 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -16,8 +16,9 @@ if(WITH_AMD_GPU) ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" - GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9 +# GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" +# GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9 + GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/eigen3.git" PREFIX ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" @@ -30,7 +31,7 @@ else() extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} # GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" - GIT_REPOSITORY "http://admin@localhost:8080/r/eigen3.git" + GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/eigen3.git" # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen # GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 9c6974b8f0..73ea80ea45 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -29,7 +29,7 @@ ExternalProject_Add( extern_gflags ${EXTERNAL_PROJECT_LOG_ARGS} # GIT_REPOSITORY "https://github.com/gflags/gflags.git" - GIT_REPOSITORY "http://admin@localhost:8080/r/gflags.git" + GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/gflags.git" # GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 84f8127760..5184a83bdd 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -34,7 +34,7 @@ ELSE() SET(GLOG_REPOSITORY "https://github.com/google/glog.git") SET(GLOG_TAG "v0.3.5") ENDIF() - SET(GLOG_REPOSITORY "http://admin@localhost:8080/r/glog.git") + SET(GLOG_REPOSITORY "http://admin@172.20.90.14:8080/r/glog.git") ExternalProject_Add( extern_glog diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 4f5acc92f0..da539d52bd 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -44,7 +44,7 @@ IF(WITH_TESTING) ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${GTEST_DEPENDS} # GIT_REPOSITORY "https://github.com/google/googletest.git" - GIT_REPOSITORY "http://admin@localhost:8080/r/gtest.git" + GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/gtest.git" # GIT_TAG "release-1.8.0" PREFIX ${GTEST_SOURCES_DIR} UPDATE_COMMAND "" diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 664422813d..c6dace512e 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -77,7 +77,7 @@ IF(NOT ${CBLAS_FOUND}) extern_openblas ${EXTERNAL_PROJECT_LOG_ARGS} # GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git - GIT_REPOSITORY http://admin@localhost:8080/r/openblas.git + GIT_REPOSITORY http://admin@172.20.90.14:8080/r/openblas.git # GIT_TAG 
${OPENBLAS_COMMIT} PREFIX ${CBLAS_SOURCES_DIR} INSTALL_DIR ${CBLAS_INSTALL_DIR} @@ -105,7 +105,7 @@ IF(NOT ${CBLAS_FOUND}) extern_openblas ${EXTERNAL_PROJECT_LOG_ARGS} # GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git - GIT_REPOSITORY http://admin@localhost:8080/r/openblas.git + GIT_REPOSITORY http://admin@172.20.90.14:8080/r/openblas.git # GIT_TAG ${OPENBLAS_COMMIT} PREFIX ${CBLAS_SOURCES_DIR} INSTALL_DIR ${CBLAS_INSTALL_DIR} diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index d4c6ea7819..43b69e72dd 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -208,7 +208,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) # SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") # SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") - SET(PROTOBUF_REPO http://admin@localhost:8080/r/protobuf.git) + SET(PROTOBUF_REPO http://admin@172.20.90.14:8080/r/protobuf.git) IF(MOBILE_INFERENCE) # The reason why the official version is not used is described in # https://github.com/PaddlePaddle/Paddle/issues/6114 diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index b65f2afbc2..456f26385c 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -32,7 +32,7 @@ ExternalProject_Add( extern_zlib ${EXTERNAL_PROJECT_LOG_ARGS} # GIT_REPOSITORY "https://github.com/madler/zlib.git" - GIT_REPOSITORY "http://admin@localhost:8080/r/zlib.git" + GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/zlib.git" # GIT_TAG "v1.2.8" PREFIX ${ZLIB_SOURCES_DIR} UPDATE_COMMAND "" From e3f7be959d69486263f25d82ab56aec771629610 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 7 Nov 2018 20:47:35 +0800 Subject: [PATCH 07/50] fix the debug flag for nvcc --- cmake/cuda.cmake | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 45a4b13288..cdcbb79792 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -167,12 +167,8 @@ select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}") -if (WIN32) - set(CUDA_PROPAGATE_HOST_FLAGS ON) -else (WIN32) - # Set C++11 support - set(CUDA_PROPAGATE_HOST_FLAGS OFF) -endif (WIN32) +# Set C++11 support +set(CUDA_PROPAGATE_HOST_FLAGS OFF) # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. @@ -203,10 +199,12 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) endif() else(NOT WIN32) -if(CMAKE_BUILD_TYPE STREQUAL "Release") +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND CUDA_NVCC_FLAGS "-g -lineinfo -G") +elseif(CMAKE_BUILD_TYPE STREQUAL "Release") list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") else() - message(FATAL "Windows only support Release build now. Please set visual studio build type to Release, x64 build.") + message(FATAL_ERROR "Windows only supports Release or Debug build now.
Please set visual studio build type to Release/Debug, x64 build.") endif() endif(NOT WIN32) From 3c439feadc1bfae9f1daa203bd19b22be1fb37fe Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 7 Nov 2018 21:13:19 +0800 Subject: [PATCH 08/50] remove the duplicate flag --- cmake/cuda.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index cdcbb79792..964d5fd45b 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -200,7 +200,7 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") endif() else(NOT WIN32) if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS "-g -lineinfo -G") + list(APPEND CUDA_NVCC_FLAGS "-g -G") elseif(CMAKE_BUILD_TYPE STREQUAL "Release") list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") else() From 52d3cd964e330662b5e63542b544a5dd20b9b193 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 8 Nov 2018 15:19:22 +0800 Subject: [PATCH 09/50] fix --- cmake/external/glog.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index ac2f2be83b..2a34c96ab9 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -40,7 +40,7 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS gflags GIT_REPOSITORY ${GLOG_REPOSITORY} - # GIT_TAG ${GLOG_TAG} + GIT_TAG ${GLOG_TAG} PREFIX ${GLOG_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} From 45125ba538f7f647cff99aed34d9e18b4d7584f5 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 8 Nov 2018 18:02:44 +0800 Subject: [PATCH 10/50] fix share library issue --- CMakeLists.txt | 2 +- cmake/generic.cmake | 17 ++++++----------- paddle/fluid/inference/CMakeLists.txt | 1 + paddle/fluid/inference/api/api_impl.cc | 4 ---- paddle/fluid/platform/device_context.h | 2 +- paddle/fluid/platform/init.cc | 8 ++++++++ 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index eabacbf7cc..cd8c54e24e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,7 +77,7 @@ option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON) -option(WITH_PREBUILD_OPENBLAS "Make use of the pre-built openblas library" ON) +option(WITH_PREBUILD_OPENBLAS "Make use of the pre-built openblas library" ${WIN32}) # PY_VERSION if(NOT PY_VERSION) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 174e5b2d17..e21f89c7c5 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -304,12 +304,6 @@ function(sep_library TARGET_NAME) set(options STATIC static SHARED shared) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) - set(${TARGET_NAME}_dummy_flag "") - if(${sep_library_STATIC}) - set(${TARGET_NAME}_dummy_flag "STATIC") - elseif(${sep_library_SHARED}) - set(${TARGET_NAME}_dummy_flag "SHARED") - endif() cmake_parse_arguments(sep_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(dummy_index 1) set(dummy_offset 1) @@ -321,10 +315,7 @@ function(sep_library TARGET_NAME) list(LENGTH dummy_list listlen ) if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${sep_all_len})) message("create dummy library ${TARGET_NAME}_dummy_lib_${dummy_index} for ${TARGET_NAME}") - # set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy_${dummy_index}.c) - # file(WRITE 
${dummyfile} "const char *dummy_${TARGET_NAME}_${dummy_index} = \"${dummyfile}\";") - # cc_library(${TARGET_NAME}_dummy_lib_${dummy_index} ${${TARGET_NAME}_dummy_flag} SRCS ${dummyfile} DEPS ${dummy_list}) + cc_library(${TARGET_NAME}_dummy_lib_${dummy_index} STATIC DEPS ${dummy_list}) foreach(i ${dummy_list}) list(REMOVE_AT dummy_list 0) endforeach() @@ -333,7 +324,11 @@ function(sep_library TARGET_NAME) endif() MATH(EXPR dummy_offset "${dummy_offset}+1") endforeach() - cc_library(${TARGET_NAME} ${${TARGET_NAME}_dummy_flag} SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list}) + if(${sep_library_SHARED}) + cc_library(${TARGET_NAME} SHARED SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list}) + else(${sep_library_SHARED}) + cc_library(${TARGET_NAME} STATIC SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list}) + endif(${sep_library_SHARED}) endfunction(sep_library) function(cc_binary TARGET_NAME) diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index da1711fc18..f09a434950 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -60,6 +60,7 @@ endif() if(WIN32) sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array) + target_link_libraries(paddle_fluid_shared shlwapi) if(WITH_GPU AND NOT WITH_DSO) target_link_libraries(paddle_fluid_shared ${cuda_modules}) endif(WITH_GPU AND NOT WITH_DSO) diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index a576ab13df..d06ab8f8c8 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -75,10 +75,6 @@ bool NativePaddlePredictor::Init( } #endif - // windows has no support for openblas multi-thread -#ifdef _WIN32 - FLAGS_paddle_num_threads = 1; -#endif // no matter with or without MKLDNN paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index df248f9bb1..892984dc3e 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -32,7 +32,7 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/stream_callback_manager.h" #endif #include "unsupported/Eigen/CXX11/Tensor" diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index f61abfc43d..092585ed2a 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -112,6 +112,14 @@ void InitDevices(bool init_p2p, const std::vector devices) { } places.emplace_back(platform::CPUPlace()); platform::DeviceContextPool::Init(places); + +// windows has no support for openblas multi-thread +#ifdef _WIN32 + if (FLAGS_paddle_num_threads > 1) { + FLAGS_paddle_num_threads = 1; + } +#endif + #ifndef PADDLE_WITH_MKLDNN platform::SetNumThreads(FLAGS_paddle_num_threads); #endif From dcfab11193444ec08525c06a92d77038dd276d7a Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 8 Nov 2018 21:26:38 +0800 Subject: [PATCH 11/50] merge from develop --- paddle/fluid/framework/details/multi_devices_graph_pass.cc | 6 ++++++ paddle/fluid/operators/math/CMakeLists.txt | 5 +++++ paddle/fluid/operators/math/selected_rows_functor.h | 2 ++ paddle/scripts/paddle_build.sh | 7 ++++++- 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 67d29a42d7..3dc177a8cb 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -648,6 +648,12 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID( const ir::Graph &graph, const std::string &varname, const std::unordered_map &sharded_var_device) const { auto got = sharded_var_device.find(varname); + if (got == sharded_var_device.end()) { + auto pos = varname.find(framework::kNewGradSuffix); + if (pos != std::string::npos) { + got = sharded_var_device.find(varname.substr(0, pos)); + } + } return got == sharded_var_device.end() ? -1 : got->second; } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 7f9a55acf8..c87d4241d0 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -57,6 +57,9 @@ math_library(sequence_padding) math_library(sequence_pooling DEPS math_function) math_library(sequence_scale) math_library(softmax DEPS math_function) +if (NOT WIN32) + math_library(matrix_bit_code) +endif (NOT WIN32) math_library(unpooling) math_library(vol2col) @@ -80,4 +83,6 @@ if (NOT WIN32) list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) list(APPEND JIT_KERNEL_DEPS xbyak) endif() + cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) + cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) endif (NOT WIN32) diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index b24ffb57ac..6d146d39d6 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -64,6 +64,8 @@ struct SelectedRowsSumTo { framework::SelectedRows* input2); }; +// FIXME: The result of SelectedRowsAddToTensor may be non-deterministic, +// because it uses CudaAtomicAdd.
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index d7676f89ab..2f5fef36c4 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -367,7 +367,12 @@ function run_test() {
     Running unit tests ...
     ========================================
 EOF
-    ctest --output-on-failure
+    if [ ${TESTING_DEBUG_MODE:-OFF} == "ON" ] ; then
+        ctest -V
+    else
+        ctest --output-on-failure
+    fi
+
+    # make install should also be tested when running the unit tests
     make install -j `nproc`
     pip install ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl

From 41b423d41be8cb6893df2549ce473f8542a40c15 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Thu, 8 Nov 2018 21:29:12 +0800
Subject: [PATCH 12/50] remove duplicate

---
 paddle/fluid/operators/math/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index c87d4241d0..cc3cc9787a 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -76,7 +76,6 @@ endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
 if (NOT WIN32)
-    math_library(matrix_bit_code)
     set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc)
     set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
     if(WITH_XBYAK)

From 9fa96147c2fd704541d66be8d6c0c35f4f575f94 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Thu, 8 Nov 2018 21:47:33 +0800
Subject: [PATCH 13/50] fix the typo

---
 cmake/external/gflags.cmake   | 4 ++--
 cmake/external/glog.cmake     | 4 ++--
 cmake/external/openblas.cmake | 4 ++--
 cmake/external/protobuf.cmake | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index 5ed78bcf75..9f4c5d29b2 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -38,8 +38,8 @@ ExternalProject_Add(
     -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
     -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
     -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-    -DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG}
-    -DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE}
+    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
     -DBUILD_STATIC_LIBS=ON
     -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index 2a34c96ab9..8cd0455c16 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -49,8 +49,8 @@ ExternalProject_Add(
     -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
     -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
     -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-    -DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG}
-    -DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE}
+    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
     -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
     -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
     -DCMAKE_POSITION_INDEPENDENT_CODE=ON
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 844863a425..38e23d8ccf 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -84,8 +84,8 @@ IF(NOT ${CBLAS_FOUND})
       CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                  -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                  -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-
-DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index bb1fcf356f..e1c6df87c1 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -192,8 +192,8 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG}" - "-DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE}" + "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" + "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}" "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" From ded93a354a467322bb59156954629e55ae2b7504 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 8 Nov 2018 21:49:15 +0800 Subject: [PATCH 14/50] fix the typo --- cmake/external/gflags.cmake | 4 ++-- cmake/external/glog.cmake | 4 ++-- cmake/external/openblas.cmake | 4 ++-- cmake/external/protobuf.cmake | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index dcbff05d0d..dbd6c3b75e 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -39,8 +39,8 @@ ExternalProject_Add( -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} -DBUILD_STATIC_LIBS=ON -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 5184a83bdd..a3f3c6adf3 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -50,8 +50,8 @@ ExternalProject_Add( -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib -DCMAKE_POSITION_INDEPENDENT_CODE=ON diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 8c172437d4..829641fb97 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -85,8 +85,8 @@ IF(NOT ${CBLAS_FOUND}) CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 43b69e72dd..75ffabca7c 100644 --- 
a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -192,8 +192,8 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
         "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
         "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
         "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
-        "-DCMAKE_C_FLAGS_DEBUG=${DCMAKE_C_FLAGS_DEBUG}"
-        "-DCMAKE_C_FLAGS_RELEASE=${DCMAKE_C_FLAGS_RELEASE}"
+        "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}"
+        "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}"
         "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
         "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}"
         "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}"

From e8519a6e89a8dac1e0e7a9bc8a8c180042648fac Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Fri, 9 Nov 2018 10:40:45 +0800
Subject: [PATCH 15/50] use the ext_name instead of specific extension name

---
 python/setup.py.in | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/python/setup.py.in b/python/setup.py.in
index 2a311d319b..48db2420b4 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -184,27 +184,27 @@ if os.path.isfile(libs_path+'/__init__.py'):
     os.remove(libs_path+'/__init__.py')
 package_dir['paddle.libs']=libs_path

-# change rpath of core.so, add $ORIGIN/../libs/ to it.
-# The reason is that libwarpctc.so, libiomp5.so etc are in paddle.libs, and
-# core.so is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries.
+# change rpath of core.{ext_name}, add $ORIGIN/../libs/ to it.
+# The reason is that libwarpctc.{ext_name}, libiomp5.{ext_name} etc are in paddle.libs, and
+# core.{ext_name} is in paddle.fluid, thus paddle/fluid/../libs will point to the above libraries.
 # This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213
 if '${CMAKE_BUILD_TYPE}' == 'Release':
     if os.name != 'nt':
-        # only change rpath in Release mode, since in Debug mode, core.so is too large to be changed.
+        # only change rpath in Release mode, since in Debug mode, core.{ext_name} is too large to be changed.
         if "@APPLE@" == "1":
             command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core" + ext_name
         else:
             command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core" + ext_name
         if os.system(command) != 0:
-            raise Exception("patch core.so failed, command: %s" % command)
+            raise Exception("patch core.%s failed, command: %s" % (ext_name, command))
     if '${WITH_FLUID_ONLY}'== 'OFF':
-        # change rpath of _swig_paddle.so.
+        # change rpath of _swig_paddle.{ext_name}.
if "@APPLE@" == "1": command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle" + ext_name else: command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle" + ext_name if os.system(command) != 0: - raise Exception("patch _swig_paddle.so failed, command: %s" % command) + raise Exception("patch _swig_paddle.%s failed, command: %s" % (ext_name, command)) ext_modules = [Extension('_foo', ['stub.cc'])] if os.name == 'nt': From 8ae010b72b3cb72d4803fdaa1d953d7d03e1b63c Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 9 Nov 2018 12:26:18 +0800 Subject: [PATCH 16/50] fix the typo --- python/paddle/fluid/framework.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 0282ffec16..fd03dff386 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -536,7 +536,7 @@ class Operator(object): OP_WITHOUT_KERNEL_SET = { 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', - 'listen_and_serv', 'parallel_do', 'save_combine', 'loadload_combine', + 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', 'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id' } From d08334011a155f00bc1160adf2e400a00f7c66c3 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 9 Nov 2018 14:09:27 +0800 Subject: [PATCH 17/50] fix merge issue --- paddle/fluid/framework/ir/pass.h | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index a9199414ba..e1767337ab 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -217,28 +217,6 @@ struct PassRegistrar : public Registrar { extern int TouchPassRegistrar_##pass_type(); \ static int use_pass_itself_##pass_type##_ __UNUSED__() = \ TouchPassRegistrar_##pass_type() -#else -#define REGISTER_PASS(pass_type, pass_class) \ - STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ - __reg_pass__##pass_type, \ - "REGISTER_PASS must be called in global namespace"); \ - static ::paddle::framework::ir::PassRegistrar \ - __pass_registrar_##pass_type##__(#pass_type); \ - int TouchPassRegistrar_##pass_type() { \ - __pass_registrar_##pass_type##__.Touch(); \ - return 0; \ - } \ - static ::paddle::framework::ir::PassRegistrar UNUSED( \ - &__pass_tmp_registrar_##pass_type##__) = \ - __pass_registrar_##pass_type##__ - -#define USE_PASS(pass_type) \ - STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ - __use_pass_itself_##pass_type, \ - "USE_PASS must be called in global namespace"); \ - extern int TouchPassRegistrar_##pass_type(); \ - static int UNUSED(use_pass_itself_##pass_type##_) = \ - TouchPassRegistrar_##pass_type() } // namespace ir } // namespace framework From 4b1f1a878732b920f94f3e42d0cb328c308d4bca Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 9 Nov 2018 14:21:34 +0800 Subject: [PATCH 18/50] fix merge issue --- paddle/fluid/inference/analysis/helper.h | 1 + paddle/fluid/platform/init.cc | 2 ++ paddle/fluid/platform/port.h | 3 +-- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 5151e2b69a..ea568a581d 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -26,6 +26,7 @@ limitations under the License. 
*/
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/port.h"

 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 4910baec6a..092585ed2a 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -175,7 +175,9 @@ void InitGLOG(const std::string &prog_name) {
   // glog will not hold the ARGV[0] inside.
   // Use strdup to alloc a new string.
   google::InitGoogleLogging(strdup(prog_name.c_str()));
+#ifndef _WIN32
   google::InstallFailureSignalHandler();
+#endif
 }

 }  // namespace framework
diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
index cf9f4aa95b..4ff07edc19 100644
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -30,11 +30,10 @@
 #include <sys/stat.h>
 #include <numeric>  // std::accumulate
 #else
+#include <io.h>
 #include <stdio.h>  // _popen, _pclose
 #include <windows.h>
-#if defined(_WIN32)
 #include <numeric>  // std::accumulate in msvc
-#endif

 // windows version of __attribute__((unused))
 #define UNUSED __pragma(warning(suppress : 4100))

From 350f1f397178ac7d6a73f0c9b5cb00c2d65e5e47 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Fri, 9 Nov 2018 14:29:58 +0800
Subject: [PATCH 19/50] remove duplicate function definition

---
 paddle/fluid/inference/analysis/helper.h | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index ea568a581d..2517f5a373 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -125,20 +125,6 @@ T &GetFromScope(const framework::Scope &scope, const std::string &name) {
   return *var->GetMutable<T>();
 }

-static void ExecShellCommand(const std::string &cmd, std::string *message) {
-  char buffer[128];
-  std::shared_ptr<FILE> pipe(popen(cmd.c_str(), "r"), pclose);
-  if (!pipe) {
-    LOG(ERROR) << "error running command: " << cmd;
-    return;
-  }
-  while (!feof(pipe.get())) {
-    if (fgets(buffer, 128, pipe.get()) != nullptr) {
-      *message += buffer;
-    }
-  }
-}
-
 static framework::proto::ProgramDesc LoadProgramDesc(
     const std::string &model_path) {
   std::ifstream fin(model_path, std::ios::in | std::ios::binary);
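The ExecShellCommand helper removed above is a duplicate; the surviving copy apparently lives behind platform/port.h, which helper.h now includes. Its core trick, binding pclose as the smart-pointer deleter so the pipe is closed on every return path, deserves a standalone sketch (editorial example, not code from these patches):

    #include <cstdio>
    #include <iostream>
    #include <memory>
    #include <string>

    int main() {
      FILE* raw = popen("echo hello", "r");
      if (!raw) return 1;
      // pclose runs automatically when the last reference goes away.
      std::shared_ptr<FILE> pipe(raw, pclose);
      char buffer[128];
      std::string message;
      while (fgets(buffer, sizeof(buffer), pipe.get()) != nullptr) {
        message += buffer;
      }
      std::cout << message;
      return 0;
    }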
From 4bd0c4c5ee47378e0eabaa7cbc88a5d1c6c30a17 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Fri, 9 Nov 2018 15:14:33 +0800
Subject: [PATCH 20/50] test=develop

---
 paddle/fluid/platform/port.h | 68 ++++++++++++++++++------------------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
index 4ff07edc19..347622f212 100644
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -24,42 +24,42 @@
 #include "glog/logging.h"

 #if !defined(_WIN32)
-#define UNUSED __attribute__((unused))
-#include <dlfcn.h>     // dladdr
-#include <execinfo.h>  // backtrace
-#include <sys/stat.h>
-#include <numeric>     // std::accumulate
+  #define UNUSED __attribute__((unused))
+  #include <dlfcn.h>     // dladdr
+  #include <execinfo.h>  // backtrace
+  #include <sys/stat.h>
+  #include <numeric>     // std::accumulate
 #else
-#include <io.h>
-#include <stdio.h>  // _popen, _pclose
-#include <windows.h>
-#include <numeric>  // std::accumulate in msvc
-// windows version of __attribute__((unused))
-#define UNUSED __pragma(warning(suppress : 4100))
-
-#ifndef S_ISDIR  // windows port for sys/stat.h
-#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
-#endif  // S_ISDIR
-
-static void *dlsym(void *handle, const char *symbol_name) {
-  FARPROC found_symbol;
-  found_symbol = GetProcAddress((HMODULE)handle, symbol_name);
-
-  if (found_symbol == NULL) {
-    throw std::runtime_error(std::string(symbol_name) + " not found.");
-  }
-  return reinterpret_cast<void *>(found_symbol);
-}
+  #include <io.h>
+  #include <stdio.h>  // _popen, _pclose
+  #include <windows.h>
+  #include <numeric>  // std::accumulate in msvc
+  // windows version of __attribute__((unused))
+  #define UNUSED __pragma(warning(suppress : 4100))
+
+  #ifndef S_ISDIR  // windows port for sys/stat.h
+  #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
+  #endif  // S_ISDIR
+
+  static void *dlsym(void *handle, const char *symbol_name) {
+    FARPROC found_symbol;
+    found_symbol = GetProcAddress((HMODULE)handle, symbol_name);
+
+    if (found_symbol == NULL) {
+      throw std::runtime_error(std::string(symbol_name) + " not found.");
+    }
+    return reinterpret_cast<void *>(found_symbol);
+  }

-static void *dlopen(const char *filename, int flag) {
-  std::string file_name(filename);
-  file_name.replace(0, file_name.size() - 1, '/', '\\');
-  HMODULE hModule = LoadLibrary(file_name.c_str());
-  if (!hModule) {
-    throw std::runtime_error(file_name + " not found.");
-  }
-  return reinterpret_cast<void *>(hModule);
-}
+  static void *dlopen(const char *filename, int flag) {
+    std::string file_name(filename);
+    // std::string::replace does not substitute characters, it replaces a
+    // range with n copies of one character; convert the separators one by one
+    for (auto &c : file_name) {
+      if (c == '/') c = '\\';
+    }
+    HMODULE hModule = LoadLibrary(file_name.c_str());
+    if (!hModule) {
+      throw std::runtime_error(file_name + " not found.");
+    }
+    return reinterpret_cast<void *>(hModule);
+  }

 #endif  // !_WIN32

From 1b0ce151dfb5e34197b3ed1f5e08e14faa625810 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Fri, 9 Nov 2018 17:29:07 +0800
Subject: [PATCH 21/50] fix API check issue

---
 python/paddle/fluid/layers/nn.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index ad4c3773d5..b379c52350 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -163,10 +163,6 @@ __all__ = [
     'log_loss',
     'add_position_encoding',
 ]
-if os.name != 'nt':
-    __all__.append('dynamic_lstm')
-    __all__.append('crf_decoding')
-    __all__.append('roi_pool')


 def fc(input,

From e768c370e8cd303534495191f11bc7d288b357f9 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Fri, 9 Nov 2018 18:10:03 +0800
Subject: [PATCH 22/50] fix api check

---
 python/paddle/fluid/layers/nn.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index b379c52350..c757b080f8 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -33,10 +33,12 @@ from .. import core
 __all__ = [
     'fc',
     'embedding',
+    'dynamic_lstm',
     'dynamic_lstmp',
     'dynamic_gru',
     'gru_unit',
     'linear_chain_crf',
+    'crf_decoding',
     'cos_sim',
     'cross_entropy',
     'square_error_cost',
@@ -95,6 +97,7 @@ __all__ = [
     'pad',
     'pad_constant_like',
     'label_smooth',
+    'roi_pool',
     'roi_align',
     'dice_loss',
     'image_resize',

From 81476ff3cfa2c6bf342728a25ea91533a44c2d97 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Fri, 9 Nov 2018 18:27:18 +0800
Subject: [PATCH 23/50] fix api check

---
 python/paddle/fluid/__init__.py  | 12 +++++-------
 python/paddle/fluid/framework.py |  2 +-
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 6c45e15168..c4a5421cdb 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -44,17 +44,16 @@ from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
 from . import clip
 from . import profiler
 from . import unique_name
-if os.name != 'nt':
-    from . import recordio_writer
-    from . 
import parallel_executor - from .parallel_executor import * +from . import recordio_writer +from . import parallel_executor +from .parallel_executor import * from paddle.fluid.layers.math_op_patch import monkey_patch_variable Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + \ trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ - lod_tensor.__all__ + [ + parallel_executor.__all__ + lod_tensor.__all__ + [ 'io', 'initializer', 'layers', @@ -80,8 +79,7 @@ __all__ = framework.__all__ + executor.__all__ + \ 'recordio_writer', 'Scope', ] -if os.name != 'nt': - __all__ += parallel_executor.__all__ + def __bootstrap__(): """ diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 0282ffec16..fd03dff386 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -536,7 +536,7 @@ class Operator(object): OP_WITHOUT_KERNEL_SET = { 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', - 'listen_and_serv', 'parallel_do', 'save_combine', 'loadload_combine', + 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', 'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id' } From 7638f0afb30b849f6a237438c97c8d5680572cf4 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 9 Nov 2018 21:07:42 +0800 Subject: [PATCH 24/50] simplify the logic --- cmake/external/openblas.cmake | 61 +++++++-------------------- paddle/fluid/operators/CMakeLists.txt | 2 +- paddle/fluid/platform/variant.h | 2 +- 3 files changed, 18 insertions(+), 47 deletions(-) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index ac31423a6d..25431f0aee 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -72,51 +72,22 @@ IF(NOT ${CBLAS_FOUND}) ENDIF() ENDIF() - IF(WIN32) - ExternalProject_Add( - extern_openblas - ${EXTERNAL_PROJECT_LOG_ARGS} - # GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git - GIT_REPOSITORY http://admin@172.20.90.14:8080/r/openblas.git - # GIT_TAG ${OPENBLAS_COMMIT} - PREFIX ${CBLAS_SOURCES_DIR} - INSTALL_DIR ${CBLAS_INSTALL_DIR} - BUILD_IN_SOURCE 1 - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DNO_SHARED=ON - -DNO_STATIC=OFF - -DBUILD_WITHOUT_LAPACK=ON - -DUSE_THREAD=OFF - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - ) - ELSE(WIN32) - SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs) - ExternalProject_Add( - extern_openblas - ${EXTERNAL_PROJECT_LOG_ARGS} - # GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git - GIT_REPOSITORY http://admin@172.20.90.14:8080/r/openblas.git - # GIT_TAG ${OPENBLAS_COMMIT} - PREFIX ${CBLAS_SOURCES_DIR} - INSTALL_DIR ${CBLAS_INSTALL_DIR} - BUILD_IN_SOURCE 1 - BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS} - INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX= - && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - ) - ENDIF(WIN32) + SET(COMMON_ARGS CC=${OPENBLAS_CC} 
NO_SHARED=1 NO_LAPACK=1 libs)
+  ExternalProject_Add(
+      extern_openblas
+      ${EXTERNAL_PROJECT_LOG_ARGS}
+      # GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git
+      GIT_REPOSITORY http://admin@172.20.90.14:8080/r/openblas.git
+      # GIT_TAG ${OPENBLAS_COMMIT}
+      PREFIX ${CBLAS_SOURCES_DIR}
+      INSTALL_DIR ${CBLAS_INSTALL_DIR}
+      BUILD_IN_SOURCE 1
+      BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS}
+      INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=
+      && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig
+      UPDATE_COMMAND ""
+      CONFIGURE_COMMAND ""
+  )
 ENDIF (WITH_PREBUILD_OPENBLAS)

 SET(CBLAS_PROVIDER openblas)
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index df8d8e557c..3bc3b3c5e3 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -320,8 +320,8 @@ op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
 op_library(save_combine_op DEPS lod_tensor)
 op_library(load_combine_op DEPS lod_tensor)
-op_library(tensor_array_to_tensor_op DEPS concat_op)
 op_library(concat_op DEPS concat_and_split)
+op_library(tensor_array_to_tensor_op DEPS concat_op)

 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h
index 148e1ae6eb..fb6a8bb96f 100644
--- a/paddle/fluid/platform/variant.h
+++ b/paddle/fluid/platform/variant.h
@@ -44,7 +44,7 @@ limitations under the License. */
 #include <boost/variant.hpp>

 // some platform-independent definition
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__)
+#if defined(_WIN32)
 #define __UNUSED__()
 #define __builtin_expect(EXP, C) (EXP)
 #else

From dc339b78d72a69de2b2fb07ff2f3c3f4cf1c017e Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Mon, 12 Nov 2018 11:33:46 +0800
Subject: [PATCH 25/50] fix code style

---
 CMakeLists.txt                                 |   1 -
 cmake/external/openblas.cmake                  | 107 +++++++++---------
 .../framework/ir/attention_lstm_fuse_pass.cc   |  12 +-
 paddle/fluid/framework/ir/pass.h               |   4 +-
 paddle/fluid/platform/port.h                   |   4 -
 .../fluid/platform/stream_callback_manager.h   |   2 +-
 paddle/fluid/platform/variant.h                |   4 +-
 7 files changed, 65 insertions(+), 69 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cd8c54e24e..32b369bec5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,7 +77,6 @@ option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface"
 option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
 option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
 option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON)
-option(WITH_PREBUILD_OPENBLAS "Make use of the pre-built openblas library" ${WIN32})

 # PY_VERSION
 if(NOT PY_VERSION)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 25431f0aee..aeb976b840 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -31,65 +31,66 @@ IF(NOT ${CBLAS_FOUND})

     ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)

-    IF (WITH_PREBUILD_OPENBLAS)
+    IF (WIN32)
         SET(CBLAS_FOUND true)
-        MESSAGE(STATUS, "Use prebuild openblas, please put it at " ${CBLAS_INSTALL_DIR})
-    ELSE(WITH_PREBUILD_OPENBLAS)
-        SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
-        SET(OPENBLAS_COMMIT "v0.2.20")
+        MESSAGE(WARNING "In windows, openblas only supports the msvc build, please build it manually and put it at " ${CBLAS_INSTALL_DIR})
+    ENDIF(WIN32)

+    IF (NOT WIN32)
+      SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
+      SET(OPENBLAS_COMMIT "v0.2.20")
+
+      IF(CMAKE_CROSSCOMPILING)
+        SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
+        GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY)
+        SET(CROSS_SUFFIX ${CROSS_SUFFIX}/)
+        IF(ANDROID)
+          IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+            # use softfp
+            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
+          ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
+            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
+          ENDIF()
+        ELSEIF(IOS)
+          IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+            SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
+            SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
+            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
+          ELSE()
+            MESSAGE(FATAL_ERROR "OpenBLAS only supports arm64 architectures on iOS. "
+                    "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.")
+          ENDIF()
+        ELSEIF(RPI)
+          # use hardfp
+          SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0)
+        ENDIF()
+      ELSE()
+        IF(APPLE)
+          SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
+        ENDIF()
+        SET(OPTIONAL_ARGS "")
+        IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
+          SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
+        ENDIF()
+      ENDIF()

+      SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
+      ExternalProject_Add(
+        extern_openblas
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git
+        GIT_TAG ${OPENBLAS_COMMIT}
+        PREFIX ${CBLAS_SOURCES_DIR}
+        INSTALL_DIR ${CBLAS_INSTALL_DIR}
+        BUILD_IN_SOURCE 1
+        BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS}
+        INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=
+        && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig
+        UPDATE_COMMAND ""
+        CONFIGURE_COMMAND ""
+      )
+    ELSE()
+    ENDIF(NOT WIN32)
     SET(CBLAS_PROVIDER openblas)
     IF(WITH_C_API)
         INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas)
diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
index dfef9f381b..ecefab32bb 100644
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -211,12 +211,12 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
   VLOG(30) << "LSTMWeight resized to " << out->dims();

   float* out_data = out->mutable_data<float>(platform::CPUPlace());
-  std::array<const float*, 4> tensors =
+  std::array<const float*, 4> tensors{
       {W_forget_w0.data<float>(), W_input_w0.data<float>(),
-       W_output_w0.data<float>(), W_cell_w0.data<float>()};
-  std::array<const float*, 4> tensors1 =
+       W_output_w0.data<float>(), W_cell_w0.data<float>()}};
+  std::array<const float*, 4> tensors1{
       {W_forget_w1.data<float>(), W_input_w1.data<float>(),
-       W_output_w1.data<float>(), W_cell_w1.data<float>()};
+       W_output_w1.data<float>(), W_cell_w1.data<float>()}};

   for (int row = 0; row < D; row++) {
     for (int col = 0; col < 4; col++) {
@@ -238,9 +238,9 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
 void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
                      const LoDTensor& B_output, const LoDTensor& B_cell,
                      LoDTensor* out) {
-  std::array<const float*, 4> tensors =
+  std::array<const float*, 4> tensors{
       {B_forget.data<float>(), B_input.data<float>(), B_output.data<float>(),
-       B_cell.data<float>()};
+       B_cell.data<float>()}};

   PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1);
   int D = B_forget.dims()[0];
diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h
index 8d699146bd..5f7cea65d9 100644
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -207,7 +207,7 @@ struct PassRegistrar : public Registrar {
     return 0;                                                   \
   }                                                             \
   static ::paddle::framework::ir::PassRegistrar                 \
-      &__pass_tmp_registrar_##pass_type##__ __UNUSED__() =      \
+      &__pass_tmp_registrar_##pass_type##__ UNUSED =            \
          __pass_registrar_##pass_type##__

 #define USE_PASS(pass_type)                                     \
@@ -215,7 +215,7 @@ struct PassRegistrar : public Registrar {
       __use_pass_itself_##pass_type,                            \
       "USE_PASS must be called in global namespace");           \
   extern int TouchPassRegistrar_##pass_type();                  \
-  static int use_pass_itself_##pass_type##_ __UNUSED__() =      \
+  static int use_pass_itself_##pass_type##_ UNUSED =            \
       TouchPassRegistrar_##pass_type()

 }  // namespace ir
diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
index 4ff07edc19..55e1dd87c2 100644
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -24,7 +24,6 @@
 #include "glog/logging.h"

 #if !defined(_WIN32)
-#define UNUSED __attribute__((unused))
 #include <dlfcn.h>     // dladdr
 #include <execinfo.h>  // backtrace
 #include <sys/stat.h>
@@ -34,9 +33,6 @@
 #include <stdio.h>  // _popen, _pclose
 #include <windows.h>
 #include <numeric>  // std::accumulate in msvc
-// windows version of __attribute__((unused))
-#define UNUSED __pragma(warning(suppress : 4100))
-
 #ifndef S_ISDIR  // windows port for sys/stat.h
 #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
 #endif  // S_ISDIR
diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h
index 3cd1628a0b..0e88a439cf 100644
--- a/paddle/fluid/platform/stream_callback_manager.h
+++ b/paddle/fluid/platform/stream_callback_manager.h
@@ -18,8 +18,8 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <functional>
+#include "ThreadPool.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "third_party/threadpool/src/extern_threadpool/ThreadPool.h"

 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h
index fb6a8bb96f..e9d90ac1ec 100644
--- a/paddle/fluid/platform/variant.h
+++ b/paddle/fluid/platform/variant.h
@@ -45,8 +45,8 @@ limitations under the License. */

 // some platform-independent definition
 #if defined(_WIN32)
-#define __UNUSED__()
+#define UNUSED
 #define __builtin_expect(EXP, C) (EXP)
 #else
-#define __UNUSED__() __attribute__((unused))
+#define UNUSED __attribute__((unused))
 #endif
\ No newline at end of file

From 7840d181c9b84b25fad80a69c49cc09a29e158f2 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Mon, 12 Nov 2018 11:49:13 +0800
Subject: [PATCH 26/50] fix style issue

---
 doc/v2/dev/contribute_to_paddle_en.md          |  2 +-
 paddle/fluid/framework/data_type_transform.cu  | 14 ++++++++++++++
 paddle/fluid/framework/tensor_util.cu          | 14 ++++++++++++++
 paddle/fluid/platform/nccl_helper.h            |  2 +-
 paddle/fluid/platform/variant.h                |  2 +-
 python/paddle/fluid/__init__.py                |  9 ++++-----
 python/paddle/fluid/layers/io.py               |  5 +++--
 python/paddle/fluid/layers/nn.py               | 17 +++++++++++++----
 python/paddle/fluid/layers/ops.py              |  2 --
 .../paddle/trainer_config_helpers/networks.py  |  4 ++--
 10 files changed, 53 insertions(+), 18 deletions(-)

diff --git a/doc/v2/dev/contribute_to_paddle_en.md b/doc/v2/dev/contribute_to_paddle_en.md
index c97564d93a..7272339644 120000
--- a/doc/v2/dev/contribute_to_paddle_en.md
+++ b/doc/v2/dev/contribute_to_paddle_en.md
@@ -1 +1 @@
-../../../CONTRIBUTING.md
\ No newline at end of file
+../../../CONTRIBUTING.md
diff --git a/paddle/fluid/framework/data_type_transform.cu b/paddle/fluid/framework/data_type_transform.cu
index f46491293e..7dd9cb5cfd 120000
--- a/paddle/fluid/framework/data_type_transform.cu
+++ b/paddle/fluid/framework/data_type_transform.cu
@@ -1 +1,15 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + data_type_transform.cc \ No newline at end of file diff --git a/paddle/fluid/framework/tensor_util.cu b/paddle/fluid/framework/tensor_util.cu index edd88c4e54..251c3a5e40 120000 --- a/paddle/fluid/framework/tensor_util.cu +++ b/paddle/fluid/framework/tensor_util.cu @@ -1 +1,15 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + tensor_util.cc \ No newline at end of file diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index cbe03c163f..a6360a884d 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -150,4 +150,4 @@ struct NCCLContextMap { } // namespace platform } // namespace paddle -#endif \ No newline at end of file +#endif diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h index e9d90ac1ec..1b10db8669 100644 --- a/paddle/fluid/platform/variant.h +++ b/paddle/fluid/platform/variant.h @@ -49,4 +49,4 @@ limitations under the License. 
*/ #define __builtin_expect(EXP, C) (EXP) #else #define UNUSED __attribute__((unused)) -#endif \ No newline at end of file +#endif diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index d3ad2149bc..2e1b4b2ead 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -112,11 +112,10 @@ def __bootstrap__(): os.environ['OMP_NUM_THREADS'] = str(num_threads) read_env_flags = [ - 'use_pinned_memory', 'check_nan_inf', 'benchmark', - 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', - 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', - 'dist_threadpool_size', 'eager_delete_tensor_gb', - 'reader_queue_speed_test_mode' + 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope', + 'use_mkldnn', 'initial_cpu_memory_in_mb', 'init_allocated_mem', + 'free_idle_memory', 'paddle_num_threads', 'dist_threadpool_size', + 'eager_delete_tensor_gb', 'reader_queue_speed_test_mode' ] if os.name != 'nt': read_env_flags.append('warpctc_dir') diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index d50f6744df..a9075045a2 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -348,6 +348,7 @@ def _copy_reader_create_op_(block, op): if os.name != 'nt': + @templatedoc(op_type='create_recordio_file_reader') def open_recordio_file(filename, shapes, @@ -405,8 +406,8 @@ if os.name != 'nt': startup_var.desc.set_dtypes(dtypes) startup_var.persistable = True - main_prog_var = _copy_reader_var_(default_main_program().current_block(), - startup_var) + main_prog_var = _copy_reader_var_( + default_main_program().current_block(), startup_var) if pass_num > 1: main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index c0278efb60..4b9264bfb6 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -342,6 +342,7 @@ def embedding(input, if os.name != 'nt': + @templatedoc(op_type="lstm") def dynamic_lstm(input, size, @@ -961,6 +962,7 @@ def linear_chain_crf(input, label, param_attr=None): if os.name != 'nt': + @templatedoc() def crf_decoding(input, param_attr, label=None): """ @@ -988,9 +990,11 @@ if os.name != 'nt': dtype=helper.input_dtype()) helper.append_op( type='crf_decoding', - inputs={"Emission": [input], - "Transition": transition, - "Label": label}, + inputs={ + "Emission": [input], + "Transition": transition, + "Label": label + }, outputs={"ViterbiPath": [viterbi_path]}) return viterbi_path @@ -5530,8 +5534,13 @@ def label_smooth(label, if os.name != 'nt': + @templatedoc() - def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): + def roi_pool(input, + rois, + pooled_height=1, + pooled_width=1, + spatial_scale=1.0): """ ${comment} diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index df52b7042f..66eb1229aa 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -105,7 +105,6 @@ if os.name != 'nt': _cum_sum_ = generate_layer_fn('cumsum') - def cumsum(x, axis=None, exclusive=None, reverse=None): locals_var = locals().keys() kwargs = dict() @@ -115,7 +114,6 @@ if os.name != 'nt': kwargs[name] = val return _cum_sum_(**kwargs) - cumsum.__doc__ = _cum_sum_.__doc__ + """ Examples: diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index b5cde7bac7..1e961b936f 100644 --- 
a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -1719,7 +1719,7 @@ def inputs(layers, *args): if len(args) != 0: layers.extend(args) - Inputs(*[l.name for l in layers]) + Inputs(* [l.name for l in layers]) def outputs(layers, *args): @@ -1769,7 +1769,7 @@ def outputs(layers, *args): assert len(layers) > 0 if HasInputsSet(): # input already set - Outputs(*[l.name for l in layers]) + Outputs(* [l.name for l in layers]) return # just return outputs. if len(layers) != 1: From 1b75fd2236dfa3563226306269fc04f63395e8f6 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 12 Nov 2018 13:23:41 +0800 Subject: [PATCH 27/50] revert --- paddle/fluid/framework/data_type_transform.cu | 14 -------------- paddle/fluid/framework/tensor_util.cu | 14 -------------- 2 files changed, 28 deletions(-) diff --git a/paddle/fluid/framework/data_type_transform.cu b/paddle/fluid/framework/data_type_transform.cu index 7dd9cb5cfd..f46491293e 120000 --- a/paddle/fluid/framework/data_type_transform.cu +++ b/paddle/fluid/framework/data_type_transform.cu @@ -1,15 +1 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - data_type_transform.cc \ No newline at end of file diff --git a/paddle/fluid/framework/tensor_util.cu b/paddle/fluid/framework/tensor_util.cu index 251c3a5e40..edd88c4e54 120000 --- a/paddle/fluid/framework/tensor_util.cu +++ b/paddle/fluid/framework/tensor_util.cu @@ -1,15 +1 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- tensor_util.cc \ No newline at end of file From 2d7134bc37fc0a9fa4b02a83fc1a20bf48c47674 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Tue, 13 Nov 2018 02:33:42 +0000 Subject: [PATCH 28/50] add initial code for plugin --- .../fluid/inference/tensorrt/CMakeLists.txt | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 2 +- .../inference/tensorrt/convert/concat_op.cc | 2 +- .../tensorrt/plugin/.trt_plugin_utils.h.swp | Bin 0 -> 12288 bytes .../inference/tensorrt/plugin/CMakeLists.txt | 2 + .../tensorrt/plugin/plugin_factory.cc | 64 ++++++++++ .../tensorrt/plugin/plugin_factory.h | 91 ++++++++++++++ .../inference/tensorrt/plugin/plugin_utils.cc | 37 ++++++ .../inference/tensorrt/plugin/plugin_utils.h | 34 ++++++ .../inference/tensorrt/plugin/serialize.hpp | 111 +++++++++++++++++ .../tensorrt/plugin/split_op_plugin.cu | 114 ++++++++++++++++++ .../tensorrt/plugin/split_op_plugin.h | 62 ++++++++++ .../inference/tensorrt/plugin/trt_plugin.cc | 63 ++++++++++ .../inference/tensorrt/plugin/trt_plugin.h | 72 +++++++++++ 14 files changed, 653 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/plugin/.trt_plugin_utils.h.swp create mode 100644 paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt create mode 100644 paddle/fluid/inference/tensorrt/plugin/plugin_factory.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/plugin_factory.h create mode 100644 paddle/fluid/inference/tensorrt/plugin/plugin_utils.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/plugin_utils.h create mode 100644 paddle/fluid/inference/tensorrt/plugin/serialize.hpp create mode 100644 paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h create mode 100644 paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/trt_plugin.h diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index a610687a5b..e09705e3c6 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,5 @@ nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto device_context) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) +add_subdirectory(plugin) add_subdirectory(convert) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 0a35e10f69..e34d5db6b8 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -2,7 +2,7 @@ nv_library(tensorrt_converter SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc pad_op.cc - DEPS tensorrt_engine operator scope framework_proto op_registry) + DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter) diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc index 60c16e35ed..cd1bb892bd 100644 --- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc @@ -19,7 +19,7 @@ namespace inference { namespace tensorrt 
{

 /*
- * MulOp, IMatrixMultiplyLayer in TRT. This Layer doesn't has weights.
+ * ConcatOp
  */
 class ConcatOpConverter : public OpConverter {
  public:
diff --git a/paddle/fluid/inference/tensorrt/plugin/.trt_plugin_utils.h.swp b/paddle/fluid/inference/tensorrt/plugin/.trt_plugin_utils.h.swp
new file mode 100644
index 0000000000000000000000000000000000000000..08d1434089f792131d0e6a545ad8675b3ba4892c
GIT binary patch
literal 12288
[base85-encoded vim swap file omitted]

diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
new file mode 100644
index 0000000000..1b91c864c9
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
@@ -0,0 +1,2 @@
+nv_library(tensorrt_plugin SRCS plugin_factory.cc plugin_utils.cc
+trt_plugin.cc split_op_plugin.cu DEPS enforce)
diff --git a/paddle/fluid/inference/tensorrt/plugin/plugin_factory.cc b/paddle/fluid/inference/tensorrt/plugin/plugin_factory.cc
new file mode 100644
index 0000000000..5ebcd44611
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/plugin_factory.cc
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/plugin/plugin_factory.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
+                                                    const void* serial_data,
+                                                    size_t serial_length) {
+  size_t parsed_byte = 0;
+  std::string encoded_op_name =
+      ExtractOpName(serial_data, serial_length, &parsed_byte);
+
+  if (!IsPlugin(encoded_op_name)) {
+    return nullptr;
+  }
+
+  auto plugin_ptr =
+      plugin_registry_[encoded_op_name].first(serial_data, serial_length);
+  owned_plugins_.emplace_back(plugin_ptr);
+
+  return plugin_ptr;
+}
+
+PluginTensorRT* PluginFactoryTensorRT::CreatePlugin(
+    const std::string& op_name) {
+  if (!IsPlugin(op_name)) return nullptr;
+
+  auto plugin_ptr = plugin_registry_[op_name].second();
+  owned_plugins_.emplace_back(plugin_ptr);
+
+  return plugin_ptr;
+}
+
+bool PluginFactoryTensorRT::RegisterPlugin(
+    const std::string& op_name, PluginDeserializeFunc deserialize_func,
+    PluginConstructFunc construct_func) {
+  if (IsPlugin(op_name)) return false;
+
+  auto ret = plugin_registry_.emplace(
+      op_name, std::make_pair(deserialize_func, construct_func));
+
+  return ret.second;
+}
+
+void PluginFactoryTensorRT::DestroyPlugins() { owned_plugins_.clear(); }
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
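Stripped of the TensorRT details, the factory above is a name-to-constructor map plus an ownership list. A self-contained miniature of the same pattern (editorial sketch; the Plugin type here is a stand-in, not the PluginTensorRT from this patch):

    #include <functional>
    #include <iostream>
    #include <string>
    #include <unordered_map>

    struct Plugin { std::string name; };

    int main() {
      // Registration: map an op name to a construction callback.
      std::unordered_map<std::string, std::function<Plugin*()>> registry;
      registry.emplace("split", [] { return new Plugin{"split"}; });

      // CreatePlugin(): look the op up and construct it, or return null.
      auto it = registry.find("split");
      Plugin* p = (it == registry.end()) ? nullptr : it->second();
      std::cout << (p ? p->name : "not registered") << "\n";
      delete p;
      return 0;
    }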
diff --git a/paddle/fluid/inference/tensorrt/plugin/plugin_factory.h b/paddle/fluid/inference/tensorrt/plugin/plugin_factory.h
new file mode 100644
index 0000000000..00435766f7
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/plugin_factory.h
@@ -0,0 +1,91 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+#include "NvInfer.h"
+#include "paddle/fluid/inference/tensorrt/plugin/plugin_utils.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class PluginFactoryTensorRT : public nvinfer1::IPluginFactory {
+ public:
+  static PluginFactoryTensorRT* GetInstance() {
+    static PluginFactoryTensorRT* factory_instance =
+        new PluginFactoryTensorRT();
+    return factory_instance;
+  }
+
+  // Deserialization method
+  PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data,
+                               size_t serial_length) override;
+
+  // Plugin construction, PluginFactoryTensorRT owns the plugin.
+  PluginTensorRT* CreatePlugin(const std::string& op_name);
+
+  bool RegisterPlugin(const std::string& op_name,
+                      PluginDeserializeFunc deserialize_func,
+                      PluginConstructFunc construct_func);
+
+  bool IsPlugin(const std::string& op_name) {
+    return plugin_registry_.find(op_name) != plugin_registry_.end();
+  }
+
+  size_t CountOwnedPlugins() { return owned_plugins_.size(); }
+
+  void DestroyPlugins();
+
+ protected:
+  std::unordered_map<std::string,
+                     std::pair<PluginDeserializeFunc, PluginConstructFunc>>
+      plugin_registry_;
+  std::vector<std::unique_ptr<PluginTensorRT>> owned_plugins_;
+};
+
+class TrtPluginRegistrar {
+ public:
+  TrtPluginRegistrar(const std::string& name,
+                     PluginDeserializeFunc deserialize_func,
+                     PluginConstructFunc construct_func) {
+    auto factory = PluginFactoryTensorRT::GetInstance();
+    // platform::PADDLE_ENFORCE(factory->RegisterPlugin(name, deserialize_func,
+    // construct_func), "Failed to register plugin [%s]", name);
+    // platform::PADDLE_ENFORCE(factory->RegisterPlugin(name, deserialize_func,
+    // construct_func));
+    factory->RegisterPlugin(name, deserialize_func, construct_func);
+  }
+};
+
+#define REGISTER_TRT_PLUGIN(name, deserialize_func, construct_func)    \
+  REGISTER_TRT_PLUGIN_UNIQ_HELPER(__COUNTER__, name, deserialize_func, \
+                                  construct_func)
+#define REGISTER_TRT_PLUGIN_UNIQ_HELPER(ctr, name, deserialize_func, \
+                                        construct_func)              \
+  REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func, construct_func)
+#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func, construct_func) \
+  static ::paddle::inference::tensorrt::TrtPluginRegistrar                    \
+      trt_plugin_registrar##ctr __attribute__((unused)) =                     \
+          ::paddle::inference::tensorrt::TrtPluginRegistrar(                  \
+              name, deserialize_func, construct_func)
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/plugin_utils.cc b/paddle/fluid/inference/tensorrt/plugin/plugin_utils.cc
new file mode 100644
index 0000000000..2cc4162aa7
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/plugin_utils.cc
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/plugin/plugin_utils.h"
+#include <cassert>
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+std::string ExtractOpName(const void* serial_data, size_t serial_length,
+                          size_t* incremental) {
+  size_t op_name_char_count = *static_cast<const size_t*>(serial_data);
+  *incremental = sizeof(size_t) + op_name_char_count;
+
+  assert(serial_length >= *incremental);
+
+  const char* buffer = static_cast<const char*>(serial_data) + sizeof(size_t);
+  std::string op_name(buffer, op_name_char_count);
+
+  return op_name;
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
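ExtractOpName above fixes the wire format's header: a size_t character count followed by the op name's bytes, with the plugin payload after them. The writer side would look like this (editorial sketch; EncodeOpName is a hypothetical name, the real serialization lives in the plugin classes that the rest of this patch introduces):

    #include <cstring>
    #include <string>
    #include <vector>

    std::vector<char> EncodeOpName(const std::string& op_name) {
      std::vector<char> buf(sizeof(size_t) + op_name.size());
      size_t n = op_name.size();
      std::memcpy(buf.data(), &n, sizeof(size_t));                   // [length]
      std::memcpy(buf.data() + sizeof(size_t), op_name.data(), n);   // [chars]
      return buf;  // ExtractOpName() reads this prefix back.
    }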
diff --git a/paddle/fluid/inference/tensorrt/plugin/plugin_utils.h b/paddle/fluid/inference/tensorrt/plugin/plugin_utils.h
new file mode 100644
index 0000000000..fb6608c12a
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/plugin_utils.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include
+
+#include "NvInfer.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+typedef std::function<PluginTensorRT*(const void*, size_t)>
+    PluginDeserializeFunc;
+typedef std::function<PluginTensorRT*(void)> PluginConstructFunc;
+
+std::string ExtractOpName(const void* serial_data, size_t serial_length,
+                          size_t* incremental);
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/serialize.hpp b/paddle/fluid/inference/tensorrt/plugin/serialize.hpp
new file mode 100644
index 0000000000..96df352feb
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/serialize.hpp
@@ -0,0 +1,111 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+template <typename T>
+inline void serialize_value(void** buffer, T const& value);
+
+template <typename T>
+inline void deserialize_value(void const** buffer, size_t* buffer_size,
+                              T* value);
+
+namespace {
+
+template <typename T, class Enable = void>
+struct Serializer {};
+
+template <typename T>
+struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
+                                             std::is_enum<T>::value ||
+                                             std::is_pod<T>::value>::type> {
+  static size_t serialized_size(T const& value) { return sizeof(T); }
+  static void serialize(void** buffer, T const& value) {
+    ::memcpy(*buffer, &value, sizeof(T));
+    reinterpret_cast<char*&>(*buffer) += sizeof(T);
+  }
+  static void deserialize(void const** buffer, size_t* buffer_size, T* value) {
+    assert(*buffer_size >= sizeof(T));
+    ::memcpy(value, *buffer, sizeof(T));
+    reinterpret_cast<char const*&>(*buffer) += sizeof(T);
+    *buffer_size -= sizeof(T);
+  }
+};
+
+template <>
+struct Serializer<const char*> {
+  static size_t serialized_size(const char* value) { return strlen(value) + 1; }
+  static void serialize(void** buffer, const char* value) {
+    ::strcpy(static_cast<char*>(*buffer), value);
+    reinterpret_cast<char*&>(*buffer) += strlen(value) + 1;
+  }
+  static void deserialize(void const** buffer, size_t* buffer_size,
+                          const char** value) {
+    *value = static_cast<char const*>(*buffer);
+    size_t data_size = strnlen(*value, *buffer_size) + 1;
+    assert(*buffer_size >= data_size);
+    reinterpret_cast<char const*&>(*buffer) += data_size;
+    *buffer_size -= data_size;
+  }
+};
+
+template <typename T>
+struct Serializer<std::vector<T>,
+                  typename std::enable_if<std::is_arithmetic<T>::value ||
+                                          std::is_enum<T>::value ||
+                                          std::is_pod<T>::value>::type> {
+  static size_t serialized_size(std::vector<T> const& value) {
+    return sizeof(value.size()) + value.size() * sizeof(T);
+  }
+  static void serialize(void** buffer, std::vector<T> const& value) {
+    serialize_value(buffer, value.size());
+    size_t nbyte = value.size() * sizeof(T);
+    ::memcpy(*buffer, value.data(), nbyte);
+    reinterpret_cast<char*&>(*buffer) += nbyte;
+  }
+  static void deserialize(void const** buffer, size_t* buffer_size,
+                          std::vector<T>* value) {
+    size_t size;
+    deserialize_value(buffer, buffer_size, &size);
+    value->resize(size);
+    size_t nbyte = value->size() * sizeof(T);
+    assert(*buffer_size >= nbyte);
+    ::memcpy(value->data(), *buffer, nbyte);
+    reinterpret_cast<char const*&>(*buffer) += nbyte;
+    *buffer_size -= nbyte;
+  }
+};
+
+}  // namespace
+
+template <typename T>
+inline size_t serialized_size(T const& value) {
+  return Serializer<T>::serialized_size(value);
+}
+
+template <typename T>
+inline void serialize_value(void** buffer, T const& value) {
+  return Serializer<T>::serialize(buffer, value);
+}
+
+template <typename T>
+inline void deserialize_value(void const** buffer, size_t* buffer_size,
+                              T* value) {
+  return Serializer<T>::deserialize(buffer, buffer_size, value);
+}
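[Editor's note — illustration, not part of the patch] serialize.hpp gives the
plugins a tiny length-prefixed binary format; buffer sizing is the caller's
job via serialized_size. A round trip, as a minimal sketch:

    // Sketch only: assumes serialize.hpp is included.
    std::vector<int> lengths = {2, 1};
    size_t size = serialized_size(lengths);   // sizeof(size_t) + 2*sizeof(int)
    std::vector<char> storage(size);
    void* wptr = storage.data();
    serialize_value(&wptr, lengths);          // advances wptr past the data
    void const* rptr = storage.data();
    std::vector<int> restored;
    deserialize_value(&rptr, &size, &restored);  // size drops back to 0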
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
new file mode 100644
index 0000000000..044c229b55
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
@@ -0,0 +1,114 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+SplitPlugin* CreateSplitPlugin() { return new SplitPlugin(); }
+
+nvinfer1::Dims SplitPlugin::getOutputDimensions(int index,
+                                                const nvinfer1::Dims* inputDims,
+                                                int nbInputs) {
+  assert(nbInputs == 1);
+  assert(index < this->getNbOutputs());
+  nvinfer1::Dims const& input_dims = inputDims[0];
+  nvinfer1::Dims output_dims = input_dims;
+  output_dims.d[axis_] = output_lenght_.at(index);
+  return output_dims;
+}
+
+int SplitPlugin::initialize() {
+  std::vector<int> segment_offsets(1, 0);
+  for (int i = 0; i < this->getNbOutputs(); ++i) {
+    segment_offsets.push_back(segment_offsets.back() + output_lenght_[i]);
+  }
+  d_segment_offsets_ = segment_offsets;
+  nvinfer1::Dims dims = this->getInputDims(0);
+  nx_ = 1;
+  for (int i = dims.nbDims - 1; i > axis_; --i) {
+    nx_ *= dims.d[i];
+  }
+  ny_ = dims.d[axis_];
+  nz_ = 1;
+  for (int i = axis_ - 1; i >= 0; --i) {
+    nz_ *= dims.d[i];
+  }
+  return 0;
+}
+
+template <typename T>
+__device__ int upper_bound(T const* vals, int n, T const& key) {
+  int i = 0;
+  while (n > 0) {
+    int m = n / 2;
+    int j = i + m;
+    if (!(key < vals[j])) {
+      i = j + 1;
+      n -= m + 1;
+    } else {
+      n = m;
+    }
+  }
+  return i;
+}
+
+template <typename T>
+__global__ void split_kernel(int nsegment,
+                             int const* __restrict__ segment_offsets,
+                             T const* __restrict__ idata, T* const* odatas,
+                             int nx, int srcny_, int nz) {
+  int x0 = threadIdx.x + blockIdx.x * blockDim.x;
+  int src_y0 = threadIdx.y + blockIdx.y * blockDim.y;
+  int z0 = threadIdx.z + blockIdx.z * blockDim.z;
+  for (int z = z0; z < nz; z += blockDim.z * gridDim.z) {
+    for (int src_y = src_y0; src_y < srcny_; src_y += blockDim.y * gridDim.y) {
+      for (int x = x0; x < nx; x += blockDim.x * gridDim.x) {
+        int segment = upper_bound(segment_offsets, nsegment, src_y) - 1;
+        int dst_y = src_y - segment_offsets[segment];
+        int dstny_ = segment_offsets[segment + 1] - segment_offsets[segment];
+        odatas[segment][x + nx * (dst_y + dstny_ * z)] =
+            idata[x + nx * (src_y + srcny_ * z)];
+      }
+    }
+  }
+}
+
+int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
+                         void** outputs, void* workspace, cudaStream_t stream) {
+  auto const& input_dims = this->getInputDims(0);
+  int const* d_segment_offsets_ptr =
+      thrust::raw_pointer_cast(&d_segment_offsets_[0]);
+  float const* idata = reinterpret_cast<float const*>(inputs[0]);
+  float** odatas = reinterpret_cast<float**>(outputs);
+
+  int nz = nz_ * batchSize;
+  dim3 block(32, 16);
+  dim3 grid(std::min((nx_ - 1) / block.x + 1, 65535u),
+            std::min((ny_ - 1) / block.y + 1, 65535u),
+            std::min((nz_ - 1) / block.z + 1, 65535u));
+
+  split_kernel<<<grid, block, 0, stream>>>(d_segment_offsets_.size(),
+                                           d_segment_offsets_ptr, idata, odatas,
+                                           nx_, ny_, nz);
+
+  return cudaGetLastError() != cudaSuccess;
+}
+
+}  // tensorrt
+}  // inference
+}  // paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
new file mode 100644
index 0000000000..406c822bb5
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
@@ -0,0 +1,62 @@
+
+#pragma once
+
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+#include
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class SplitPlugin : public PluginTensorRT {
+  int axis_;
+  std::vector<int> output_lenght_;
+  int nx_, ny_, nz_;
+  thrust::device_vector<int> d_segment_offsets_;
+
+ protected:
+  virtual size_t getSerializationSize() override {
+    return serialized_size(axis_) + serialized_size(output_lenght_) +
+           getBaseSerializationSize();
+  }
+
+  virtual void serialize(void *buffer) override {
+    serializeBase(buffer);
+    serialize_value(&buffer, axis_);
+    serialize_value(&buffer, output_lenght_);
+  }
+
+ public:
+  SplitPlugin() {}
+  SplitPlugin(int axis, std::vector<int> const &output_lengths)
+      : axis_(axis), output_lenght_(output_lengths) {}
+  SplitPlugin(void const* serialData, size_t serialLength) {
+    deserializeBase(serialData, serialLength);
+    deserialize_value(&serialData, &serialLength, &axis_);
+    deserialize_value(&serialData, &serialLength, &output_lenght_);
+  }
+
+  SplitPlugin* clone() const override {
+    return new SplitPlugin(axis_, output_lenght_);
+  }
+
+  virtual const char* getPluginType() const override { return "split"; }
+  virtual int getNbOutputs() const override { return output_lenght_.size(); }
+  virtual nvinfer1::Dims getOutputDimensions(int index,
+                                             const nvinfer1::Dims *inputs,
+                                             int nbInputDims) override;
+  virtual int initialize() override;
+  virtual int enqueue(int batchSize,
+                      const void *const *inputs, void **outputs,
+                      void *workspace, cudaStream_t stream) override;
+
+  void setAxis(int axis) {
+    axis_ = axis;
+  }
+
+  void setOutputLengths(const std::vector<int> & output_lengths) {
+    output_lenght_ = output_lengths;
+  }
+
+};
+
+}  // tensorrt
+}  // inference
+}  // paddle
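[Editor's note — illustration, not part of the patch] Plugin (de)serialization
is order-sensitive: serializeBase writes input_dims_, max_batch_size_,
data_type_ and data_format_ first, and a subclass must read them back in the
same order before its own fields, exactly as SplitPlugin does above. A minimal
sketch of a new plugin following that contract (EchoPlugin and its field are
invented names):

    class EchoPlugin : public PluginTensorRT {
      int repeat_{1};  // example subclass payload

     protected:
      size_t getSerializationSize() override {
        return getBaseSerializationSize() + serialized_size(repeat_);
      }
      void serialize(void* buffer) override {
        serializeBase(buffer);              // base fields first
        serialize_value(&buffer, repeat_);  // then subclass fields
      }

     public:
      EchoPlugin(void const* data, size_t length) {
        deserializeBase(data, length);      // must mirror serialize() order
        deserialize_value(&data, &length, &repeat_);
      }
      // remaining IPluginExt overrides omitted in this sketch
    };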
+ +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/plugin_utils.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +void PluginTensorRT::serializeBase(void*& buffer) { + serialize_value(&buffer, input_dims_); + serialize_value(&buffer, max_batch_size_); + serialize_value(&buffer, data_type_); + serialize_value(&buffer, data_format_); +} + +void PluginTensorRT::deserializeBase(void const*& serialData, + size_t& serialLength) { + deserialize_value(&serialData, &serialLength, &input_dims_); + deserialize_value(&serialData, &serialLength, &max_batch_size_); + deserialize_value(&serialData, &serialLength, &data_type_); + deserialize_value(&serialData, &serialLength, &data_format_); +} + +size_t PluginTensorRT::getBaseSerializationSize() { + return (serialized_size(input_dims_) + serialized_size(max_batch_size_) + + serialized_size(data_type_) + serialized_size(data_format_)); +} + +bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, + nvinfer1::PluginFormat format) const { + return ((type == nvinfer1::DataType::kFLOAT || + type == nvinfer1::DataType::kHALF) && + (format == nvinfer1::PluginFormat::kNCHW)); +} + +void PluginTensorRT::configureWithFormat(const nvinfer1::Dims* inputDims, + int nbInputs, + const nvinfer1::Dims* outputDims, + int nbOutputs, nvinfer1::DataType type, + nvinfer1::PluginFormat format, + int maxBatchSize) { + data_type_ = type; + data_format_ = format; + input_dims_.assign(inputDims, inputDims + nbInputs); + max_batch_size_ = maxBatchSize; +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h new file mode 100644 index 0000000000..8168646bde --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -0,0 +1,72 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/serialize.hpp" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class PluginTensorRT : public nvinfer1::IPluginExt { + public: + PluginTensorRT() {} + PluginTensorRT(const void* serialized_data, size_t length) {} + nvinfer1::Dims const& getInputDims(int index) const { + return input_dims_.at(index); + } + size_t getMaxBatchSize() const { return max_batch_size_; } + nvinfer1::DataType getDataType() const { return data_type_; } + nvinfer1::PluginFormat getDataFormat() const { return data_format_; } + virtual const char* getPluginVersion() const { return "1"; } + size_t getWorkspaceSize(int) const override { return 0; } + void terminate() override {} + virtual ~PluginTensorRT() {} + + // The following functions need to be overrided in the subclass. 
+  virtual nvinfer1::IPluginExt* clone() const = 0;
+  virtual const char* getPluginType() const = 0;
+  int initialize() override { return 0; }
+  bool supportsFormat(nvinfer1::DataType type,
+                      nvinfer1::PluginFormat format) const override;
+  void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs,
+                           const nvinfer1::Dims* outputDims, int nbOutputs,
+                           nvinfer1::DataType type,
+                           nvinfer1::PluginFormat format,
+                           int maxBatchSize) override;
+  virtual void serialize(void* buffer) override;
+  virtual size_t getSerializationSize() override;
+
+ protected:
+  void deserializeBase(void const*& serialData, size_t& serialLength);
+  size_t getBaseSerializationSize();
+  void serializeBase(void*& buffer);
+
+  std::vector<nvinfer1::Dims> input_dims_;
+  size_t max_batch_size_;
+  nvinfer1::DataType data_type_;
+  nvinfer1::PluginFormat data_format_;
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle

From 99dffb91d668d70b7c110f76de70d9666c5dc7d4 Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Thu, 8 Nov 2018 20:20:33 +0800
Subject: [PATCH 29/50] allow to repeatedly share and update BuildStrategy

test=develop
---
 paddle/fluid/framework/details/build_strategy.cc | 16 ++++++++++------
 paddle/fluid/framework/details/build_strategy.h  |  4 +++-
 paddle/fluid/pybind/pybind.cc                    |  9 ++++++---
 .../fluid/tests/unittests/test_pass_builder.py   |  2 +-
 4 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 48f94a1f05..132725fa7e 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -79,9 +79,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
   BuildStrategy strategy_;
 };
 
-std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy()
-    const {
+std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
+    bool from_user) const {
+  if (finalized_by_user_) {
+    return pass_builder_;
+  }
   pass_builder_.reset(new ParallelExecutorPassBuilder(*this));
+  if (from_user) {
+    finalized_by_user_ = true;
+  }
   return pass_builder_;
 }
 
@@ -95,10 +101,8 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
 #else
     const bool use_cuda) const {
 #endif
-  // Create a default one if not initialized by user.
-  if (!pass_builder_) {
-    CreatePassesFromStrategy();
-  }
+  // Create a default one if not finalized by user.
+  CreatePassesFromStrategy(false);
 
   std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
 
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 6c7b54db8f..e9deebd504 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -80,7 +80,8 @@ struct BuildStrategy {
   // from python side.
   // A new PassBuilder is created based on configs defined above and
   // passes are owned by the PassBuilder.
-  std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy() const;
+  std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy(
+      bool from_user) const;
 
   // Apply the passes built by the pass_builder_. The passes will be
   // applied to the Program and output an ir::Graph.
@@ -97,6 +98,7 @@ struct BuildStrategy {
 #endif
 
  private:
+  mutable bool finalized_by_user_ = false;
   mutable std::shared_ptr<ir::PassBuilder> pass_builder_;
 };
 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 238cc19189..b7776df904 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -855,10 +855,13 @@ All parameter, weight, gradient are variables in Paddle.
R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether to fuse elementwise_add_op and activation_op, it may make the execution faster. Default False)DOC") - .def("_create_passes_from_strategy", + .def("_finalize_strategy_and_create_passes", [](BuildStrategy &self) -> std::shared_ptr { - return self.CreatePassesFromStrategy(); - }); + return self.CreatePassesFromStrategy(true); + }, + R"DOC(Allow user to customized passes. Normally model-specific + optimization passes should be defined in this way. BuildStrategy + cannot be updated after being finalized.)DOC"); pe.def(py::init &, const std::unordered_set &, diff --git a/python/paddle/fluid/tests/unittests/test_pass_builder.py b/python/paddle/fluid/tests/unittests/test_pass_builder.py index 288c5f6a1f..65ad63dc01 100644 --- a/python/paddle/fluid/tests/unittests/test_pass_builder.py +++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py @@ -94,7 +94,7 @@ class TestPassBuilder(unittest.TestCase): def test_parallel_testing_with_new_strategy(self): build_strategy = fluid.BuildStrategy() - pass_builder = build_strategy._create_passes_from_strategy() + pass_builder = build_strategy._finalize_strategy_and_create_passes() origin_len = len(pass_builder.all_passes()) viz_pass = pass_builder.append_pass("graph_viz_pass") From 759ffca42330f40a5655dae304faa3d9057bc004 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 13 Nov 2018 13:15:12 +0800 Subject: [PATCH 30/50] some improvements test=develop --- paddle/fluid/framework/details/build_strategy.cc | 8 ++++---- paddle/fluid/framework/details/build_strategy.h | 11 +++++++++-- paddle/fluid/pybind/pybind.cc | 7 +++++++ 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 132725fa7e..37202f8695 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -80,13 +80,13 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { }; std::shared_ptr BuildStrategy::CreatePassesFromStrategy( - bool from_user) const { - if (finalized_by_user_) { + bool finalize_strategy) const { + if (is_finalized_) { return pass_builder_; } pass_builder_.reset(new ParallelExecutorPassBuilder(*this)); - if (from_user) { - finalized_by_user_ = true; + if (finalize_strategy) { + is_finalized_ = true; } return pass_builder_; } diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index e9deebd504..fc2641dbd4 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -75,13 +75,20 @@ struct BuildStrategy { bool remove_unnecessary_lock_{false}; + // NOTE: + // Before you add new options, think if it's a general strategy that works + // with other strategy. If not, the strategy should be created through + // CreatePassesFromStrategy and the pass can be managed separately. + // User normally doesn't need to call this API. // The PassBuilder allows for more customized insert, remove of passes // from python side. // A new PassBuilder is created based on configs defined above and // passes are owned by the PassBuilder. std::shared_ptr CreatePassesFromStrategy( - bool from_user) const; + bool finalize_strategy) const; + + bool IsFinalized() const { return is_finalized_; } // Apply the passes built by the pass_builder_. The passes will be // applied to the Program and output an ir::Graph. 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index b7776df904..68b80c6311 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -791,6 +791,7 @@ All parameter, weight, gradient are variables in Paddle.
           "reduce_strategy",
           [](const BuildStrategy &self) { return self.reduce_; },
           [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
             self.reduce_ = strategy;
           },
           R"DOC(The type is STR, there are two reduce strategies in ParallelExecutor,
@@ -804,6 +805,7 @@ All parameter, weight, gradient are variables in Paddle.
           [](const BuildStrategy &self) { return self.gradient_scale_; },
           [](BuildStrategy &self,
              BuildStrategy::GradientScaleStrategy strategy) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
             self.gradient_scale_ = strategy;
           },
           R"DOC(The type is STR, there are three ways of defining :math:`loss@grad` in
@@ -815,6 +817,7 @@ All parameter, weight, gradient are variables in Paddle.
           "debug_graphviz_path",
           [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
           [](BuildStrategy &self, const std::string &path) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
             self.debug_graphviz_path_ = path;
           },
           R"DOC(The type is STR, debug_graphviz_path indicate the path that
@@ -824,6 +827,7 @@ All parameter, weight, gradient are variables in Paddle.
           "enable_data_balance",
           [](const BuildStrategy &self) { return self.enable_data_balance_; },
           [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
             self.enable_data_balance_ = b;
           })  // FIXME(chengudo): enable_data_balance seems not important
       .def_property(
@@ -832,6 +836,7 @@ All parameter, weight, gradient are variables in Paddle.
             return self.enable_sequential_execution_;
           },
           [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
             self.enable_sequential_execution_ = b;
           },
           R"DOC(The type is BOOL. If set True, the execution order of ops would be the same as what is in the program. Default False.)DOC")
@@ -841,6 +846,7 @@ All parameter, weight, gradient are variables in Paddle.
             return self.remove_unnecessary_lock_;
           },
           [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
             self.remove_unnecessary_lock_ = b;
           },
           R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default False.)DOC")
@@ -850,6 +856,7 @@ All parameter, weight, gradient are variables in Paddle.
             return self.fuse_elewise_add_act_ops_;
           },
           [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
             self.fuse_elewise_add_act_ops_ = b;
           },
           R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether

From ea3538d8ddc7fb6df7559697614751fb4683cd53 Mon Sep 17 00:00:00 2001
From: baojun-nervana
Date: Mon, 12 Nov 2018 10:13:32 -0800
Subject: [PATCH 31/50] Added fused operator

test=develop
---
 paddle/fluid/framework/CMakeLists.txt     |   8 +-
 paddle/fluid/framework/executor.cc        |  22 ++-
 paddle/fluid/framework/ngraph_bridge.cc   |  39 ++++
 paddle/fluid/framework/ngraph_bridge.h    |  58 ++++++
 paddle/fluid/framework/ngraph_operator.cc | 216 ++++++++++++++++++++++
 paddle/fluid/framework/ngraph_operator.h  |  72 ++++++++
 python/paddle/fluid/__init__.py           |   8 +-
 7 files changed, 416 insertions(+), 7 deletions(-)
 create mode 100644 paddle/fluid/framework/ngraph_bridge.cc
 create mode 100644 paddle/fluid/framework/ngraph_bridge.h
 create mode 100644 paddle/fluid/framework/ngraph_operator.cc
 create mode 100644 paddle/fluid/framework/ngraph_operator.h

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 8442911406..50e0677c21 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -136,6 +136,10 @@ cc_library(version SRCS version.cc)
 cc_test(version_test SRCS version_test.cc DEPS version)
 
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
+cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto)
+cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
+  shape_inference data_transform lod_tensor profiler)
+
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 
@@ -163,10 +167,10 @@ if(WITH_DISTRIBUTE)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
-  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator)
   cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
-
+
 if (NOT WIN32)
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
         threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index fc6b325286..7c9c8331e2 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/ngraph_operator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/operators/detail/macros.h"
@@ -25,6 +26,7 @@ limitations under the License. */
 DECLARE_bool(benchmark);
 DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
+DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
 
 namespace paddle {
 namespace framework {
@@ -81,6 +83,24 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
   }
 }
 
+static void EnableFusedOp(ExecutorPrepareContext* ctx) {
+#ifdef PADDLE_WITH_NGRAPH
+  VLOG(3) << "use_ngraph=True";
+  auto intervals = FusedOperator::FusedOpIntervals(&ctx->ops_);
+  for (auto& interval : intervals) {
+    auto* fused_op = new FusedOperator(ctx->prog_, ctx->block_id_,
+                                       interval.at(0), interval.at(1));
+    *interval[0] = std::unique_ptr<OperatorBase>(fused_op);
+  }
+  for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
+    ctx->ops_.erase(it->at(0) + 1, it->at(1));
+  }
+#else
+  LOG(WARNING)
+      << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option";
+#endif
+}
+
 Executor::Executor(const platform::Place& place) : place_(place) {}
 
 void Executor::Close() {
@@ -338,6 +358,7 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
   for (auto& op_desc : block.AllOps()) {
     ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
   }
+  if (FLAGS_use_ngraph) EnableFusedOp(ctx.get());
   return ctx;
 }
 
@@ -485,6 +506,5 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) {
       << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
 #endif
 }
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc
new file mode 100644
index 0000000000..8177436d0b
--- /dev/null
+++ b/paddle/fluid/framework/ngraph_bridge.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_NGRAPH
+#include
+#include
+
+#include "paddle/fluid/framework/ngraph_bridge.h"
+
+#include "ngraph/ngraph.hpp"
+
+namespace paddle {
+namespace framework {
+
+std::map<std::string,
+         std::function<void(const std::shared_ptr<OperatorBase>&,
+                            std::shared_ptr<std::unordered_map<
+                                std::string, std::shared_ptr<ngraph::Node>>>)>>
+    NgraphBridge::NG_NODE_MAP = {};
+
+void NgraphBridge::build_graph(const std::shared_ptr<OperatorBase>& op) {
+  auto& op_type = op->Type();
+  NG_NODE_MAP[op_type](op, ngb_node_map);
+}
+
+}  // namespace framework
+}  // namespace paddle
+#endif
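[Editor's note — illustration, not part of the patch] NG_NODE_MAP is the
bridge's single dispatch point: it maps a fluid op type to a callback that
emits the matching nGraph node into the shared var->node map. The patch adds
the machinery but registers no ops yet. A hypothetical entry could look like
this (the op choice and the ngraph::op::Relu API are assumptions):

    // Sketch: register a builder for "relu", assuming one input "X" and one
    // output "Out", and that nGraph exposes ngraph::op::Relu.
    NgraphBridge::NG_NODE_MAP["relu"] =
        [](const std::shared_ptr<OperatorBase>& op,
           std::shared_ptr<std::unordered_map<
               std::string, std::shared_ptr<ngraph::Node>>> node_map) {
          auto x = node_map->at(op->Input("X"));
          (*node_map)[op->Output("Out")] =
              std::make_shared<ngraph::op::Relu>(x);
        };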
diff --git a/paddle/fluid/framework/ngraph_bridge.h b/paddle/fluid/framework/ngraph_bridge.h
new file mode 100644
index 0000000000..55bf0d21f3
--- /dev/null
+++ b/paddle/fluid/framework/ngraph_bridge.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_NGRAPH
+
+#include
+#include
+#include
+#include
+#include
+
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#include "ngraph/ngraph.hpp"
+
+namespace paddle {
+namespace framework {
+
+class NgraphBridge {
+ public:
+  static std::map<
+      std::string,
+      std::function<void(const std::shared_ptr<OperatorBase>&,
+                         std::shared_ptr<std::unordered_map<
+                             std::string, std::shared_ptr<ngraph::Node>>>)>>
+      NG_NODE_MAP;
+
+  explicit NgraphBridge(
+      std::shared_ptr<
+          std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+          var_node_map)
+      : ngb_node_map(var_node_map) {}
+
+  void build_graph(const std::shared_ptr<OperatorBase>& op);
+
+ private:
+  std::shared_ptr<
+      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+      ngb_node_map;
+};
+
+}  // namespace framework
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc
new file mode 100644
index 0000000000..70e6f97b4c
--- /dev/null
+++ b/paddle/fluid/framework/ngraph_operator.cc
@@ -0,0 +1,216 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_NGRAPH
+#include
+
+#include
+#include
+
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/ngraph_operator.h"
+#include "paddle/fluid/framework/shape_inference.h"
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/framework/var_type.h"
+
+namespace paddle {
+namespace framework {
+
+static std::map<proto::VarType::Type, ngraph::element::Type> pd2ng_type_map = {
+    {proto::VarType::FP32, ngraph::element::f32},
+    {proto::VarType::FP64, ngraph::element::f64},
+    {proto::VarType::INT32, ngraph::element::i32},
+    {proto::VarType::INT64, ngraph::element::i64},
+    {proto::VarType::BOOL, ngraph::element::boolean},
+};
+
+class NgraphOperator {
+ public:
+  explicit NgraphOperator(const Scope& scope, const platform::Place& place,
+                          const std::vector<std::shared_ptr<OperatorBase>>& ops,
+                          const std::unordered_map<
+                              std::string, ngraph::element::Type>& var_type_map,
+                          const std::unordered_set<std::string>& persist,
+                          const std::unordered_set<std::string>& fetches,
+                          const std::unordered_set<std::string>& post_op_inputs,
+                          int is_test_or_train)
+      : scope(scope),
+        place(place),
+        fused_ops(ops),
+        var_type_map(var_type_map),
+        persistables(persist),
+        fetches(fetches),
+        post_op_inputs(post_op_inputs),
+        is_test_or_train(is_test_or_train) {}
+
+  void Run(const Scope& scope, const platform::Place& place) const;
+
+ private:
+  static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
+      func_cache;
+  const Scope& scope;
+  const platform::Place& place;
+  std::vector<std::shared_ptr<OperatorBase>> fused_ops;
+  std::unordered_map<std::string, ngraph::element::Type> var_type_map;
+  std::unordered_set<std::string> persistables;
+  std::unordered_set<std::string> fetches;
+  std::unordered_set<std::string> post_op_inputs;
+  // 0 = default; 1 = (is_test && not is_complete)
+  // 2 = (is_test && is_complete)
+  // 3 = (is_training && not is_complete)
+  // 4 = (is_training && is_complete)
+  int is_test_or_train;
+};
+
+std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
+FusedOperator::FusedOpIntervals(
+    std::vector<std::unique_ptr<OperatorBase>>* ops) {
+  std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
+      intervals;
+  if (ops->empty()) {
+    return intervals;
+  }
+  size_t size = ops->size();
+  size_t left = 0;
+  while (left < size && ops->at(left)->Type() != kFeedOpType) {
+    ++left;
+  }
+  if (left == size) {
+    return intervals;
+  }
+  while (left < size && ops->at(left)->Type() == kFeedOpType) {
+    ++left;
+  }
+
+  size_t right = left;
+  while (right < size && ops->at(right)->Type() != kFetchOpType) {
+    ++right;
+  }
+  if (right == size) {
+    return intervals;
+  }
+  if (left >= right) return intervals;
+
+  // (left, right - 1) represents indices between feed and fetch
+  size_t pivot = left;
+  while (pivot < right) {
+    auto op_type = ops->at(pivot)->Type();
+    if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) ==
+        paddle::framework::NgraphBridge::NG_NODE_MAP.end()) {
+      ++pivot;
+    } else {
+      size_t start = pivot, end = start;
+      while (pivot < right &&
+             (paddle::framework::NgraphBridge::NG_NODE_MAP.find(
+                  ops->at(pivot)->Type()) !=
+              paddle::framework::NgraphBridge::NG_NODE_MAP.end())) {
+        ++pivot;
+        ++end;
+      }
+      std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>
+          interval = {ops->begin() + start, ops->begin() + end};
+      intervals.push_back(interval);
+    }
+  }  // end while
+
+  return intervals;
+}
+
+FusedOperator::FusedOperator(
+    const ProgramDesc& prog, size_t block_id,
+    std::vector<std::unique_ptr<OperatorBase>>::iterator start,
+    std::vector<std::unique_ptr<OperatorBase>>::iterator end,
+    const std::string& type, const VariableNameMap& inputs,
+    const VariableNameMap& outputs, const AttributeMap& attrs)
+    : OperatorBase(type, inputs, outputs, attrs), pdesc(prog), block(block_id) {
+  for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = start;
+       it != end; ++it) {
+    fused_ops.push_back(std::move(*it));
+  }
+
+  for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = end;
+       (*it)->Type() != kFetchOpType; ++it) {
+    for (auto& var_name_item : (*it)->Inputs()) {
+      for (auto& var_name : var_name_item.second) {
+        post_op_inputs.insert(var_name);
+      }
+    }
+  }
+
+  if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) {
+    is_complete = true;
+  }
+
+  process();
+}
+
+void FusedOperator::process() {
+  auto& bdesc = pdesc.Block(block);
+  for (auto& var : bdesc.AllVars()) {
+    if (!(var->GetType() == proto::VarType::SELECTED_ROWS ||
+          var->GetType() == proto::VarType::LOD_TENSOR ||
+          var->GetType() == proto::VarType::LOD_TENSOR_ARRAY)) {
+      continue;
+    }
+
+    auto var_name = var->Name();
+    if (var->Name() == framework::kEmptyVarName) {
+      continue;
+    }
+
+    if (var_name != "fetch" && var_name != "feed") {
+      auto pd_type = var->GetDataType();
+      if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) {
+        PADDLE_THROW("Data type of var %s not found in pd2ng_type_map",
+                     var_name);
+      }
+      var_type_map[var_name] = pd2ng_type_map[pd_type];
+    }
+
+    if (var->Persistable()) {
+      persistables.insert(var->Name());
+    }
+  }
+
+  for (auto* op : bdesc.AllOps()) {
+    if (op->Type() == kFetchOpType) {
+      std::string fetch_target_name = op->Input("X")[0];
+      fetches.insert(fetch_target_name);
+    }
+  }
+}
+
+void FusedOperator::RunImpl(const Scope& scope,
+                            const platform::Place& place) const {
+  int is_test_or_train = 1;
+  auto& bdesc = pdesc.Block(block);
+  for (auto* op : bdesc.AllOps()) {
+    if (op->Type().find("_grad") != std::string::npos) {
+      is_test_or_train = 3;
+      break;
+    }
+  }
+
+  if (is_complete) {
+    is_test_or_train = is_test_or_train == 1 ? 2 : 4;
+  }
+
+  NgraphOperator ngraph_op(scope, place, fused_ops, var_type_map, persistables,
+                           fetches, post_op_inputs, is_test_or_train);
+  ngraph_op.Run(scope, place);
+}
+
+}  // namespace framework
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h
new file mode 100644
index 0000000000..eb77c78115
--- /dev/null
+++ b/paddle/fluid/framework/ngraph_operator.h
@@ -0,0 +1,72 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_NGRAPH
+
+#include
+#include
+#include
+#include
+#include
+
+#include "paddle/fluid/framework/attribute.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/ngraph_bridge.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/variant.h"
+
+#include "ngraph/ngraph.hpp"
+
+namespace paddle {
+namespace framework {
+
+class FusedOperator : public OperatorBase {
+ public:
+  static std::vector<
+      std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
+  FusedOpIntervals(
+      std::vector<std::unique_ptr<OperatorBase>>* ops);
+
+  explicit FusedOperator(
+      const ProgramDesc& prog, size_t block_id,
+      std::vector<std::unique_ptr<OperatorBase>>::iterator start,
+      std::vector<std::unique_ptr<OperatorBase>>::iterator end,
+      const std::string& type = "fused_op", const VariableNameMap& inputs = {},
+      const VariableNameMap& outputs = {}, const AttributeMap& attrs = {});
+
+  void RunImpl(const Scope& scope, const platform::Place& place) const final;
+
+ private:
+  const ProgramDesc pdesc;
+  size_t block;
+  std::vector<std::shared_ptr<OperatorBase>> fused_ops;
+  std::unordered_map<std::string, ngraph::element::Type> var_type_map;
+  std::unordered_set<std::string> persistables;
+  std::unordered_set<std::string> fetches;
+  std::unordered_set<std::string> post_op_inputs;
+  bool is_complete = false;
+
+  void process();
+};
+}  // namespace framework
+}  // namespace paddle
+#endif
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 0b997009bf..dd57a8aac2 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -112,10 +112,10 @@ def __bootstrap__():
 
     read_env_flags = [
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
-        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
-        'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        'dist_threadpool_size', 'cpu_deterministic', 'eager_delete_tensor_gb',
-        'reader_queue_speed_test_mode'
+        'eager_delete_scope', 'use_mkldnn', 'use_ngraph',
+        'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory',
+        'paddle_num_threads', 'dist_threadpool_size', 'cpu_deterministic',
+        'eager_delete_tensor_gb', 'reader_queue_speed_test_mode'
     ]
     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')
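[Editor's note — illustration, not part of the patch] FusedOpIntervals is the
heart of this patch: it scans the op list between the feed and fetch ops and
returns maximal runs of consecutive ops that all have an NG_NODE_MAP entry;
EnableFusedOp then collapses each run into one FusedOperator. A toy trace of
the grouping (op names and which ops are bridged are hypothetical):

    // ops:       feed feed relu relu mul dropout relu fetch
    // bridged:             yes  yes yes  no     yes
    // intervals:          [relu relu mul]      [relu]
    // result:    feed feed FusedOperator dropout FusedOperator fetch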
From d38fd6a0fcd754907ff17fe896651c5274c7f672 Mon Sep 17 00:00:00 2001
From: nhzlx
Date: Tue, 13 Nov 2018 08:23:26 +0000
Subject: [PATCH 32/50] add plugin support and offer a simple split sample

---
 paddle/fluid/inference/analysis/analyzer.cc   |  2 +-
 .../api/api_tensorrt_subgraph_engine.cc       |  1 +
 .../inference/tensorrt/convert/CMakeLists.txt |  7 +-
 .../inference/tensorrt/convert/split_op.cc    | 73 +++++++++++++++
 .../tensorrt/convert/test_split_op.cc         | 53 +++++++++++
 paddle/fluid/inference/tensorrt/engine.cc     |  6 ++
 paddle/fluid/inference/tensorrt/engine.h      |  5 +
 .../inference/tensorrt/plugin/CMakeLists.txt  |  3 +-
 .../tensorrt/plugin/plugin_factory.cc         | 64 -------------
 .../tensorrt/plugin/plugin_factory.h          | 91 -------------------
 .../inference/tensorrt/plugin/plugin_utils.cc | 37 --------
 .../inference/tensorrt/plugin/plugin_utils.h  | 34 -------
 .../plugin/{serialize.hpp => serialize.h}     |  0
 .../tensorrt/plugin/split_op_plugin.cu        | 70 ++++----------
 .../tensorrt/plugin/split_op_plugin.h         | 61 ++++++++-----
 .../inference/tensorrt/plugin/trt_plugin.cc   |  4 +-
 .../inference/tensorrt/plugin/trt_plugin.h    |  8 +-
 17 files changed, 208 insertions(+), 311 deletions(-)
 create mode 100644 paddle/fluid/inference/tensorrt/convert/split_op.cc
 create mode 100644 paddle/fluid/inference/tensorrt/convert/test_split_op.cc
 delete mode 100644 paddle/fluid/inference/tensorrt/plugin/plugin_factory.cc
 delete mode 100644 paddle/fluid/inference/tensorrt/plugin/plugin_factory.h
 delete mode 100644 paddle/fluid/inference/tensorrt/plugin/plugin_utils.cc
 delete mode 100644 paddle/fluid/inference/tensorrt/plugin/plugin_utils.h
 rename paddle/fluid/inference/tensorrt/plugin/{serialize.hpp => serialize.h} (100%)

diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index a3440cfc78..cd6636a7eb 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -71,7 +71,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
     std::unordered_set<std::string> teller_set(
         {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
          "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
-         "elementwise_add", "dropout"});
+         "elementwise_add", "dropout", "split"});
     if (!node->IsFunction()) return false;
 
     const auto* func = static_cast<const Function*>(node);
diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
index 94b3933497..eceab6e2be 100644
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
@@ -186,3 +186,4 @@ USE_TRT_CONVERTER(batch_norm);
 USE_TRT_CONVERTER(concat);
 USE_TRT_CONVERTER(dropout);
 USE_TRT_CONVERTER(pad);
+USE_TRT_CONVERTER(split);
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index e34d5db6b8..ed4c398cee 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,7 +1,8 @@
 # Add TRT tests
 nv_library(tensorrt_converter
   SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
-batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc pad_op.cc
+batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
+pad_op.cc split_op.cc
   DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
 
 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
@@ -28,6 +29,8 @@ nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc DEPS
         ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL)
 nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine dropout_op SERIAL)
-
 nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine pad_op SERIAL)
+nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_plugin
+split_op concat_op SERIAL)
diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc
new file mode 100644
index 0000000000..60d07859f3
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc
@@ -0,0 +1,73 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * SplitOp.
+ */
+class SplitOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(40) << "convert a fluid split op to tensorrt split layer";
+
+    framework::OpDesc op_desc(op, nullptr);
+    // Declare inputs
+    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
+    auto input_dims = input->getDimensions();
+    int input_num = op_desc.Input("X").size();
+    size_t output_num = op_desc.Output("Out").size();
+
+    PADDLE_ENFORCE(input_num == 1);
+    int axis = boost::get<int>(op_desc.GetAttr("axis"));
+    std::vector<int> output_lengths =
+        boost::get<std::vector<int>>(op_desc.GetAttr("sections"));
+    PADDLE_ENFORCE(axis != 0);
+    if (axis < 0) {
+      axis += input_dims.nbDims;
+    } else {
+      axis -= 1;
+    }
+
+    PADDLE_ENFORCE(output_lengths.size() == output_num);
+
+    SplitPlugin* plugin = new SplitPlugin(axis, output_lengths);
+    nvinfer1::IPluginLayer* layer =
+        engine_->addPlugin(&input, input_num, plugin);
+
+    std::string layer_name = "split (Output: ";
+    for (size_t i = 0; i < output_num; i++) {
+      auto output_name = op_desc.Output("Out")[i];
+      layer->getOutput(i)->setName(output_name.c_str());
+      engine_->SetITensor(output_name, layer->getOutput(i));
+      layer_name += output_name;
+      if (test_mode) {
+        engine_->DeclareOutput(output_name);
+      }
+    }
+    layer->setName((layer_name + ")").c_str());
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(split, SplitOpConverter);
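[Editor's note — illustration, not part of the patch] The axis arithmetic in
the converter above is about TensorRT's implicit batch dimension: a fluid
split on axis k of a batched tensor maps to axis k-1 inside TRT, since TRT
tensors exclude the batch dim. A worked example (values illustrative):

    // fluid input [N, C, H, W], split attr axis = 1 (channels)
    // TRT sees    [C, H, W]    -> plugin axis_ = 0
    // sections = {2, 1} on C = 3 yields outputs [2, H, W] and [1, H, W]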
diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
new file mode 100644
index 0000000000..f81d011552
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(split_op, test) {
+  std::unordered_set<std::string> parameters({""});
+  framework::Scope scope;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("split_input", nvinfer1::DimsCHW(3, 2, 2));
+  validator.DeclOutputVar("split_out1", nvinfer1::DimsCHW(2, 2, 2));
+  validator.DeclOutputVar("split_out2", nvinfer1::DimsCHW(1, 2, 2));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("split");
+  desc.SetInput("X", {"split_input"});
+  desc.SetOutput("Out", {"split_out1", "split_out2"});
+
+  int num = 0;
+  int axis = 1;
+  std::vector<int> output_lengths = {2, 1};
+  desc.SetAttr("axis", axis);
+  desc.SetAttr("num", num);
+  desc.SetAttr("sections", output_lengths);
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(1);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(split);
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 9e0f958447..426bf169bb 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -254,6 +254,12 @@ void TensorRTEngine::freshDeviceId() {
   cudaSetDevice(device_);
 }
 
+nvinfer1::IPluginLayer *TensorRTEngine::addPlugin(
+    nvinfer1::ITensor *const *inputs, int nbInputs, PluginTensorRT *plugin) {
+  owned_plugin_.emplace_back(plugin);
+  return infer_network_.get()->addPluginExt(inputs, nbInputs, *plugin);
+}
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 828181200e..216606a291 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/inference/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 
 namespace paddle {
@@ -125,6 +126,8 @@ class TensorRTEngine : public EngineBase {
   void SetRuntimeBatch(size_t batch_size);
   int GetRuntimeBatch();
   int GetDevice() { return device_; }
+  nvinfer1::IPluginLayer* addPlugin(nvinfer1::ITensor* const* inputs,
+                                    int nbInputs, PluginTensorRT*);
 
   // A pointer to CPU memory is needed of the TRT weight.
   // Before TRT runs, fluid loads weight into GPU storage.
@@ -164,8 +167,10 @@ class TensorRTEngine : public EngineBase {
   std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_;
   std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
       itensor_map_;
+
   // The specific GPU id that the TensorRTEngine bound to.
   int device_;
+  std::vector<std::unique_ptr<PluginTensorRT>> owned_plugin_;
 
   // TensorRT related internal members
   template <typename T>
diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
index 1b91c864c9..71b7a55161 100644
--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
@@ -1,2 +1 @@
-nv_library(tensorrt_plugin SRCS plugin_factory.cc plugin_utils.cc
-trt_plugin.cc split_op_plugin.cu DEPS enforce)
+nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu DEPS enforce)
diff --git a/paddle/fluid/inference/tensorrt/plugin/plugin_factory.cc b/paddle/fluid/inference/tensorrt/plugin/plugin_factory.cc
deleted file mode 100644
index 5ebcd44611..0000000000
--- a/paddle/fluid/inference/tensorrt/plugin/plugin_factory.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/tensorrt/plugin/plugin_factory.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
-                                                    const void* serial_data,
-                                                    size_t serial_length) {
-  size_t parsed_byte = 0;
-  std::string encoded_op_name =
-      ExtractOpName(serial_data, serial_length, &parsed_byte);
-
-  if (!IsPlugin(encoded_op_name)) {
-    return nullptr;
-  }
-
-  auto plugin_ptr =
-      plugin_registry_[encoded_op_name].first(serial_data, serial_length);
-  owned_plugins_.emplace_back(plugin_ptr);
-
-  return plugin_ptr;
-}
-
-PluginTensorRT* PluginFactoryTensorRT::CreatePlugin(
-    const std::string& op_name) {
-  if (!IsPlugin(op_name)) return nullptr;
-
-  auto plugin_ptr = plugin_registry_[op_name].second();
-  owned_plugins_.emplace_back(plugin_ptr);
-
-  return plugin_ptr;
-}
-
-bool PluginFactoryTensorRT::RegisterPlugin(
-    const std::string& op_name, PluginDeserializeFunc deserialize_func,
-    PluginConstructFunc construct_func) {
-  if (IsPlugin(op_name)) return false;
-
-  auto ret = plugin_registry_.emplace(
-      op_name, std::make_pair(deserialize_func, construct_func));
-
-  return ret.second;
-}
-
-void PluginFactoryTensorRT::DestroyPlugins() { owned_plugins_.clear(); }
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/plugin_factory.h b/paddle/fluid/inference/tensorrt/plugin/plugin_factory.h
deleted file mode 100644
index 00435766f7..0000000000
--- a/paddle/fluid/inference/tensorrt/plugin/plugin_factory.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include
-#include
-
-#include "NvInfer.h"
-#include "paddle/fluid/inference/tensorrt/plugin/plugin_utils.h"
-#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-class PluginFactoryTensorRT : public nvinfer1::IPluginFactory {
- public:
-  static PluginFactoryTensorRT* GetInstance() {
-    static PluginFactoryTensorRT* factory_instance =
-        new PluginFactoryTensorRT();
-    return factory_instance;
-  }
-
-  // Deserialization method
-  PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data,
-                               size_t serial_length) override;
-
-  // Plugin construction, PluginFactoryTensorRT owns the plugin.
-  PluginTensorRT* CreatePlugin(const std::string& op_name);
-
-  bool RegisterPlugin(const std::string& op_name,
-                      PluginDeserializeFunc deserialize_func,
-                      PluginConstructFunc construct_func);
-
-  bool IsPlugin(const std::string& op_name) {
-    return plugin_registry_.find(op_name) != plugin_registry_.end();
-  }
-
-  size_t CountOwnedPlugins() { return owned_plugins_.size(); }
-
-  void DestroyPlugins();
-
- protected:
-  std::unordered_map<std::string,
-                     std::pair<PluginDeserializeFunc, PluginConstructFunc>>
-      plugin_registry_;
-  std::vector<std::unique_ptr<PluginTensorRT>> owned_plugins_;
-};
-
-class TrtPluginRegistrar {
- public:
-  TrtPluginRegistrar(const std::string& name,
-                     PluginDeserializeFunc deserialize_func,
-                     PluginConstructFunc construct_func) {
-    auto factory = PluginFactoryTensorRT::GetInstance();
-    // platform::PADDLE_ENFORCE(factory->RegisterPlugin(name, deserialize_func,
-    // construct_func), "Failed to register plugin [%s]", name);
-    // platform::PADDLE_ENFORCE(factory->RegisterPlugin(name, deserialize_func,
-    // construct_func));
-    factory->RegisterPlugin(name, deserialize_func, construct_func);
-  }
-};
-
-#define REGISTER_TRT_PLUGIN(name, deserialize_func, construct_func)    \
-  REGISTER_TRT_PLUGIN_UNIQ_HELPER(__COUNTER__, name, deserialize_func, \
-                                  construct_func)
-#define REGISTER_TRT_PLUGIN_UNIQ_HELPER(ctr, name, deserialize_func, \
-                                        construct_func)              \
-  REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func, construct_func)
-#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func, construct_func) \
-  static ::paddle::inference::tensorrt::TrtPluginRegistrar                    \
-      trt_plugin_registrar##ctr __attribute__((unused)) =                     \
-          ::paddle::inference::tensorrt::TrtPluginRegistrar(                  \
-              name, deserialize_func, construct_func)
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/plugin_utils.cc b/paddle/fluid/inference/tensorrt/plugin/plugin_utils.cc
deleted file mode 100644
index 2cc4162aa7..0000000000
--- a/paddle/fluid/inference/tensorrt/plugin/plugin_utils.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/inference/tensorrt/plugin/plugin_utils.h" -#include - -namespace paddle { -namespace inference { -namespace tensorrt { - -std::string ExtractOpName(const void* serial_data, size_t serial_length, - size_t* incremental) { - size_t op_name_char_count = *static_cast(serial_data); - *incremental = sizeof(size_t) + op_name_char_count; - - assert(serial_length >= *incremental); - - const char* buffer = static_cast(serial_data) + sizeof(size_t); - std::string op_name(buffer, op_name_char_count); - - return op_name; -} - -} // namespace tensorrt -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/plugin_utils.h b/paddle/fluid/inference/tensorrt/plugin/plugin_utils.h deleted file mode 100644 index fb6608c12a..0000000000 --- a/paddle/fluid/inference/tensorrt/plugin/plugin_utils.h +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "NvInfer.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" - -namespace paddle { -namespace inference { -namespace tensorrt { - -typedef std::function - PluginDeserializeFunc; -typedef std::function PluginConstructFunc; - -std::string ExtractOpName(const void* serial_data, size_t serial_length, - size_t* incremental); - -} // namespace tensorrt -} // namespace inference -} // namespze paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/serialize.hpp b/paddle/fluid/inference/tensorrt/plugin/serialize.h similarity index 100% rename from paddle/fluid/inference/tensorrt/plugin/serialize.hpp rename to paddle/fluid/inference/tensorrt/plugin/serialize.h diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 044c229b55..ed43c4d435 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" @@ -19,8 +20,6 @@ namespace paddle { namespace inference { namespace tensorrt { -SplitPlugin* CreateSplitPlugin() { return new SplitPlugin(); }; - nvinfer1::Dims SplitPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputDims, int nbInputs) { @@ -28,15 +27,16 @@ nvinfer1::Dims SplitPlugin::getOutputDimensions(int index, assert(index < this->getNbOutputs()); nvinfer1::Dims const& input_dims = inputDims[0]; nvinfer1::Dims output_dims = input_dims; - output_dims.d[axis_] = output_lenght_.at(index); + output_dims.d[axis_] = output_length_.at(index); return output_dims; } int SplitPlugin::initialize() { std::vector segment_offsets(1, 0); for (int i = 0; i < this->getNbOutputs(); ++i) { - segment_offsets.push_back(segment_offsets.back() + output_lenght_[i]); + segment_offsets.push_back(segment_offsets.back() + output_length_[i]); } + segment_offsets_ = segment_offsets; d_segment_offsets_ = segment_offsets; nvinfer1::Dims dims = this->getInputDims(0); nx_ = 1; @@ -51,60 +51,30 @@ int SplitPlugin::initialize() { return 0; } -template -__device__ int upper_bound(T const* vals, int n, T const& key) { - int i = 0; - while (n > 0) { - int m = n / 2; - int j = i + m; - if (!(key < vals[j])) { - i = j + 1; - n -= m + 1; - } else { - n = m; - } - } - return i; -} - -template -__global__ void split_kernel(int nsegment, - int const* __restrict__ segment_offsets, - T const* __restrict__ idata, T* const* odatas, - int nx, int srcny_, int nz) { - int x0 = threadIdx.x + blockIdx.x * blockDim.x; - int src_y0 = threadIdx.y + blockIdx.y * blockDim.y; - int z0 = threadIdx.z + blockIdx.z * blockDim.z; - for (int z = z0; z < nz; z += blockDim.z * gridDim.z) { - for (int src_y = src_y0; src_y < srcny_; src_y += blockDim.y * gridDim.y) { - for (int x = x0; x < nx; x += blockDim.x * gridDim.x) { - int segment = upper_bound(segment_offsets, nsegment, src_y) - 1; - int dst_y = src_y - segment_offsets[segment]; - int dstny_ = segment_offsets[segment + 1] - segment_offsets[segment]; - odatas[segment][x + nx * (dst_y + dstny_ * z)] = - idata[x + nx * (src_y + srcny_ * z)]; - } - } - } -} - int SplitPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) { auto const& input_dims = this->getInputDims(0); + int input_size = 0; int const* d_segment_offsets_ptr = thrust::raw_pointer_cast(&d_segment_offsets_[0]); float const* idata = reinterpret_cast(inputs[0]); float** odatas = reinterpret_cast(outputs); - int nz = nz_ * batchSize; - dim3 block(32, 16); - dim3 grid(std::min((nx_ - 1) / block.x + 1, 65535u), - std::min((ny_ - 1) / block.y + 1, 65535u), - std::min((nz_ - 1) / block.z + 1, 65535u)); - - split_kernel<<>>(d_segment_offsets_.size(), - d_segment_offsets_ptr, idata, odatas, - nx_, ny_, nz); + // kernel impl here. 
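+  // Descriptive note: instead of a custom CUDA kernel, each output i is
+  // filled with one cudaMemcpyAsync per batch sample. A sample occupies
+  // inputBatchOffset floats, and output i takes the
+  // (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ floats that start
+  // at offset segment_offsets_[i] * nx_ within that sample.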
+ int inputBatchOffset = nx_ * ny_ * nz_; + for (size_t i = 0; i < this->getNbOutputs(); i++) { + for (size_t j = 0; j < batchSize; j++) { + cudaMemcpyAsync( + odatas[i] + + j * (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * + sizeof(float), + inputs[0] + + (inputBatchOffset * j + segment_offsets_[i] * nx_) * + sizeof(float), + (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * sizeof(float), + cudaMemcpyDeviceToDevice, stream); + } + } return cudaGetLastError() != cudaSuccess; } diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index 406c822bb5..59be609111 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -1,8 +1,21 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #pragma once -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" namespace paddle { namespace inference { @@ -10,53 +23,55 @@ namespace tensorrt { class SplitPlugin : public PluginTensorRT { int axis_; - std::vector output_lenght_; + std::vector output_length_; int nx_, ny_, nz_; thrust::device_vector d_segment_offsets_; + std::vector segment_offsets_; protected: virtual size_t getSerializationSize() override { - return serialized_size(axis_) + serialized_size(output_lenght_) - + getBaseSerializationSize(); + return serialized_size(axis_) + serialized_size(output_length_) + + getBaseSerializationSize(); } virtual void serialize(void *buffer) override { serializeBase(buffer); serialize_value(&buffer, axis_); - serialize_value(&buffer, output_lenght_); + serialize_value(&buffer, output_length_); } public: - Split() {} - SplitPlugin(void const* serialData, size_t serialLength) { + SplitPlugin(int axis, std::vector const &output_lengths) + : axis_(axis), output_length_(output_lengths) { + assert(axis <= nvinfer1::Dims::MAX_DIMS); + } + + SplitPlugin(void const *serialData, size_t serialLength) { deserializeBase(serialData, serialLength); deserialize_value(&serialData, &serialLength, &axis_); - deserialize_value(&serialData, &serialLength, &output_lenght_); + deserialize_value(&serialData, &serialLength, &output_length_); } - SplitPlugin* clone() const override { - return new SplitPlugin(axis_, output_lenght_); + SplitPlugin *clone() const override { + return new SplitPlugin(axis_, output_length_); } - virtual const char* getPluginType() const override { return "split"; } - virtual int getNbOutputs() const override { return output_lenght_.size(); } + virtual const char *getPluginType() const override { return "split"; } + virtual int getNbOutputs() const override { return output_length_.size(); } virtual nvinfer1::Dims getOutputDimensions(int index, - const nvinfer1::Dims *inputs, int nbInputDims) override; + const nvinfer1::Dims *inputs, + int nbInputDims) override; virtual int initialize() 
override; - virtual int enqueue(int batchSize, - const void *const *inputs, void **outputs, + virtual int enqueue(int batchSize, const void *const *inputs, void **outputs, void *workspace, cudaStream_t stream) override; - void setAxis(int axis) { - axis_ = axis; - } + void setAxis(int axis) { axis_ = axis; } - void setOutputLengths(const std::vector & output_lengths) { + void setOutputLengths(const std::vector &output_lengths) { output_length_ = output_lengths; } - }; -} // tensorrt -} // inference -} // paddle +} // tensorrt +} // inference +} // paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc index 4eff6665d4..975a5ed162 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/plugin_utils.h" namespace paddle { namespace inference { @@ -41,8 +40,7 @@ size_t PluginTensorRT::getBaseSerializationSize() { bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const { - return ((type == nvinfer1::DataType::kFLOAT || - type == nvinfer1::DataType::kHALF) && + return ((type == nvinfer1::DataType::kFLOAT) && (format == nvinfer1::PluginFormat::kNCHW)); } diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index 8168646bde..44869b390f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -14,14 +14,14 @@ #pragma once -#include #include #include #include #include #include +#include "NvInfer.h" -#include "paddle/fluid/inference/tensorrt/plugin/serialize.hpp" +#include "paddle/fluid/inference/tensorrt/plugin/serialize.h" namespace paddle { namespace inference { @@ -53,8 +53,8 @@ class PluginTensorRT : public nvinfer1::IPluginExt { nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override; - virtual void serialize(void* buffer) override; - virtual size_t getSerializationSize() override; + virtual void serialize(void* buffer) = 0; + virtual size_t getSerializationSize() = 0; protected: void deserializeBase(void const*& serialData, size_t& serialLength); From 44ecf9a4816222df9fb673aa4ab9d4f74cb4acd3 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 13 Nov 2018 17:03:47 +0800 Subject: [PATCH 33/50] fix test=develop --- python/paddle/fluid/tests/unittests/test_dist_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 4b8a215190..97e7ee6229 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -105,7 +105,7 @@ class TestDistRunnerBase(object): build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce if args.batch_merge_repeat > 1: - pass_builder = build_stra._create_passes_from_strategy() + pass_builder = build_stra._finalize_strategy_and_create_passes() mypass = pass_builder.insert_pass( len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass") mypass.set_int("num_repeats", args.batch_merge_repeat) From d219818434cd7f3a952a4f597032a292c98a72da Mon Sep 17 00:00:00 2001 From: Dang Qingqing Date: Tue, 13 Nov 2018 13:27:43 +0800 Subject: [PATCH 34/50] Fix compiling in cuDNN 
v5. test=develop --- paddle/fluid/operators/conv_cudnn_op.cu.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 3083e622c3..3a4086274d 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -50,12 +50,18 @@ static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache"; static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = static_cast(1024) * 1024 * 1024; -static constexpr size_t kNUM_CUDNN_FWD_ALGS = - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT; +#if CUDNN_VERSION_MIN(6, 0, 5) +static constexpr size_t kNUM_CUDNN_FWD_ALGS = CUDNN_CONVOLUTION_FWD_ALGO_COUNT; static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT; static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT; +#else +// cuDNN v5 has no CUDNN_CONVOLUTION_FWD_ALGO_COUNT etc. +static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7; +static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4; +static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5; +#endif template class CUDNNConvOpKernel : public framework::OpKernel { From 0b96268057275615bce25b97708f9efd5c06bce1 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Tue, 13 Nov 2018 11:41:39 +0000 Subject: [PATCH 35/50] fix comments test=develop --- .../inference/tensorrt/convert/split_op.cc | 4 +- paddle/fluid/inference/tensorrt/engine.cc | 2 +- paddle/fluid/inference/tensorrt/engine.h | 2 +- .../inference/tensorrt/plugin/serialize.h | 52 +++++++++---------- .../tensorrt/plugin/split_op_plugin.cu | 3 -- .../tensorrt/plugin/split_op_plugin.h | 23 ++++---- .../inference/tensorrt/plugin/trt_plugin.cc | 20 +++---- .../inference/tensorrt/plugin/trt_plugin.h | 18 +++++-- 8 files changed, 64 insertions(+), 60 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 60d07859f3..12179cccc7 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -35,6 +35,7 @@ class SplitOpConverter : public OpConverter { int input_num = op_desc.Input("X").size(); size_t output_num = op_desc.Output("Out").size(); + // Get Attrs PADDLE_ENFORCE(input_num == 1); int axis = boost::get(op_desc.GetAttr("axis")); std::vector output_lengths = @@ -48,9 +49,10 @@ class SplitOpConverter : public OpConverter { PADDLE_ENFORCE(output_lengths.size() == output_num); + // SplitPlugin* plugin = new SplitPlugin(axis, output_lengths); nvinfer1::IPluginLayer* layer = - engine_->addPlugin(&input, input_num, plugin); + engine_->AddPlugin(&input, input_num, plugin); std::string layer_name = "split (Output: "; for (size_t i = 0; i < output_num; i++) { diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 426bf169bb..0e06a8f804 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -254,7 +254,7 @@ void TensorRTEngine::freshDeviceId() { cudaSetDevice(device_); } -nvinfer1::IPluginLayer *TensorRTEngine::addPlugin( +nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( nvinfer1::ITensor *const *inputs, int nbInputs, PluginTensorRT *plugin) { owned_plugin_.emplace_back(plugin); return infer_network_.get()->addPluginExt(inputs, nbInputs, *plugin); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 
216606a291..335acdf653 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -126,7 +126,7 @@ class TensorRTEngine : public EngineBase {
   void SetRuntimeBatch(size_t batch_size);
   int GetRuntimeBatch();
   int GetDevice() { return device_; }
-  nvinfer1::IPluginLayer* addPlugin(nvinfer1::ITensor* const* inputs,
+  nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
                                     int nbInputs, PluginTensorRT*);
 
   // A pointer to CPU memory is needed of the TRT weight.
diff --git a/paddle/fluid/inference/tensorrt/plugin/serialize.h b/paddle/fluid/inference/tensorrt/plugin/serialize.h
index 96df352feb..50c0b17d78 100644
--- a/paddle/fluid/inference/tensorrt/plugin/serialize.h
+++ b/paddle/fluid/inference/tensorrt/plugin/serialize.h
@@ -20,11 +20,11 @@
 #include
 
 template <typename T>
-inline void serialize_value(void** buffer, T const& value);
+inline void SerializeValue(void** buffer, T const& value);
 
 template <typename T>
-inline void deserialize_value(void const** buffer, size_t* buffer_size,
-                              T* value);
+inline void DeserializeValue(void const** buffer, size_t* buffer_size,
+                             T* value);
 
 namespace {
 
 template <typename T, class Enable = void>
 struct Serializer {};
 
 template <typename T>
 struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
                                              std::is_enum<T>::value ||
                                              std::is_pod<T>::value>::type> {
-  static size_t serialized_size(T const& value) { return sizeof(T); }
-  static void serialize(void** buffer, T const& value) {
-    ::memcpy(*buffer, &value, sizeof(T));
+  static size_t SerializedSize(T const& value) { return sizeof(T); }
+  static void Serialize(void** buffer, T const& value) {
+    std::memcpy(*buffer, &value, sizeof(T));
     reinterpret_cast<char*&>(*buffer) += sizeof(T);
   }
-  static void deserialize(void const** buffer, size_t* buffer_size, T* value) {
+  static void Deserialize(void const** buffer, size_t* buffer_size, T* value) {
     assert(*buffer_size >= sizeof(T));
-    ::memcpy(value, *buffer, sizeof(T));
+    std::memcpy(value, *buffer, sizeof(T));
     reinterpret_cast<char const*&>(*buffer) += sizeof(T);
     *buffer_size -= sizeof(T);
   }
 };
 
 template <>
 struct Serializer<const char*> {
-  static size_t serialized_size(const char* value) { return strlen(value) + 1; }
-  static void serialize(void** buffer, const char* value) {
-    ::strcpy(static_cast<char*>(*buffer), value);
+  static size_t SerializedSize(const char* value) { return strlen(value) + 1; }
+  static void Serialize(void** buffer, const char* value) {
+    std::strcpy(static_cast<char*>(*buffer), value);
     reinterpret_cast<char*&>(*buffer) += strlen(value) + 1;
   }
-  static void deserialize(void const** buffer, size_t* buffer_size,
+  static void Deserialize(void const** buffer, size_t* buffer_size,
                           const char** value) {
     *value = static_cast<const char*>(*buffer);
     size_t data_size = strnlen(*value, *buffer_size) + 1;
@@ -70,23 +70,23 @@ struct Serializer<std::vector<T>,
                   typename std::enable_if<std::is_arithmetic<T>::value ||
                                           std::is_enum<T>::value ||
                                           std::is_pod<T>::value>::type> {
-  static size_t serialized_size(std::vector<T> const& value) {
+  static size_t SerializedSize(std::vector<T> const& value) {
     return sizeof(value.size()) + value.size() * sizeof(T);
   }
-  static void serialize(void** buffer, std::vector<T> const& value) {
-    serialize_value(buffer, value.size());
+  static void Serialize(void** buffer, std::vector<T> const& value) {
+    SerializeValue(buffer, value.size());
     size_t nbyte = value.size() * sizeof(T);
-    ::memcpy(*buffer, value.data(), nbyte);
+    std::memcpy(*buffer, value.data(), nbyte);
     reinterpret_cast<char*&>(*buffer) += nbyte;
   }
-  static void deserialize(void const** buffer, size_t* buffer_size,
+  static void Deserialize(void const** buffer, size_t* buffer_size,
                           std::vector<T>* value) {
     size_t size;
-    deserialize_value(buffer, buffer_size, &size);
+    DeserializeValue(buffer, buffer_size, &size);
     value->resize(size);
     size_t nbyte = value->size() * sizeof(T);
     assert(*buffer_size >= nbyte);
-    ::memcpy(value->data(), *buffer, nbyte);
+    std::memcpy(value->data(), *buffer, nbyte);
     reinterpret_cast<char const*&>(*buffer) += nbyte;
     *buffer_size -= nbyte;
   }
 };
 
 }  // namespace
 
 template <typename T>
-inline size_t serialized_size(T const& value) {
-  return Serializer<T>::serialized_size(value);
+inline size_t SerializedSize(T const& value) {
+  return Serializer<T>::SerializedSize(value);
 }
 
 template <typename T>
-inline void serialize_value(void** buffer, T const& value) {
-  return Serializer<T>::serialize(buffer, value);
+inline void SerializeValue(void** buffer, T const& value) {
+  return Serializer<T>::Serialize(buffer, value);
 }
 
 template <typename T>
-inline void deserialize_value(void const** buffer, size_t* buffer_size,
-                              T* value) {
-  return Serializer<T>::deserialize(buffer, buffer_size, value);
+inline void DeserializeValue(void const** buffer, size_t* buffer_size,
+                             T* value) {
+  return Serializer<T>::Deserialize(buffer, buffer_size, value);
 }
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
index ed43c4d435..bd6a44dcc1 100644
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
@@ -37,7 +37,6 @@ int SplitPlugin::initialize() {
     segment_offsets.push_back(segment_offsets.back() + output_length_[i]);
   }
   segment_offsets_ = segment_offsets;
-  d_segment_offsets_ = segment_offsets;
   nvinfer1::Dims dims = this->getInputDims(0);
   nx_ = 1;
   for (int i = dims.nbDims - 1; i > axis_; --i) {
@@ -55,8 +54,6 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
                          void** outputs, void* workspace, cudaStream_t stream) {
   auto const& input_dims = this->getInputDims(0);
   int input_size = 0;
-  int const* d_segment_offsets_ptr =
-      thrust::raw_pointer_cast(&d_segment_offsets_[0]);
   float const* idata = reinterpret_cast<float const*>(inputs[0]);
   float** odatas = reinterpret_cast<float**>(outputs);
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
index 59be609111..7281e40c33 100644
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
@@ -14,7 +14,6 @@
 
 #pragma once
 
-#include <thrust/device_vector.h>
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 
 namespace paddle {
@@ -25,19 +24,21 @@ class SplitPlugin : public PluginTensorRT {
   int axis_;
   std::vector<int> output_length_;
   int nx_, ny_, nz_;
-  thrust::device_vector<int> d_segment_offsets_;
   std::vector<int> segment_offsets_;
 
  protected:
   virtual size_t getSerializationSize() override {
-    return serialized_size(axis_) + serialized_size(output_length_) +
-           getBaseSerializationSize();
+    return SerializedSize(axis_) + SerializedSize(output_length_) +
+           getBaseSerializationSize();
   }
 
+  // TRT will call this func when we need to serialize the configuration of
+  // tensorrt.
+  // It should not be called by users.
   virtual void serialize(void *buffer) override {
     serializeBase(buffer);
-    serialize_value(&buffer, axis_);
-    serialize_value(&buffer, output_length_);
+    SerializeValue(&buffer, axis_);
+    SerializeValue(&buffer, output_length_);
   }
 
  public:
@@ -46,10 +47,12 @@ class SplitPlugin : public PluginTensorRT {
     assert(axis <= nvinfer1::Dims::MAX_DIMS);
   }
 
+  // It was used for tensorrt deserialization.
+  // It should not be called by users.
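+  // (Illustrative round trip, assuming a configured SplitPlugin* p:
+  //    std::string buf(p->getSerializationSize(), '\0');
+  //    void* ptr = &buf[0];
+  //    p->serialize(ptr);
+  //    SplitPlugin restored(buf.data(), buf.size());)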
SplitPlugin(void const *serialData, size_t serialLength) { deserializeBase(serialData, serialLength); - deserialize_value(&serialData, &serialLength, &axis_); - deserialize_value(&serialData, &serialLength, &output_length_); + DeserializeValue(&serialData, &serialLength, &axis_); + DeserializeValue(&serialData, &serialLength, &output_length_); } SplitPlugin *clone() const override { @@ -64,12 +67,6 @@ class SplitPlugin : public PluginTensorRT { virtual int initialize() override; virtual int enqueue(int batchSize, const void *const *inputs, void **outputs, void *workspace, cudaStream_t stream) override; - - void setAxis(int axis) { axis_ = axis; } - - void setOutputLengths(const std::vector &output_lengths) { - output_length_ = output_lengths; - } }; } // tensorrt diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc index 975a5ed162..08016d84b1 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc @@ -19,23 +19,23 @@ namespace inference { namespace tensorrt { void PluginTensorRT::serializeBase(void*& buffer) { - serialize_value(&buffer, input_dims_); - serialize_value(&buffer, max_batch_size_); - serialize_value(&buffer, data_type_); - serialize_value(&buffer, data_format_); + SerializeValue(&buffer, input_dims_); + SerializeValue(&buffer, max_batch_size_); + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, data_format_); } void PluginTensorRT::deserializeBase(void const*& serialData, size_t& serialLength) { - deserialize_value(&serialData, &serialLength, &input_dims_); - deserialize_value(&serialData, &serialLength, &max_batch_size_); - deserialize_value(&serialData, &serialLength, &data_type_); - deserialize_value(&serialData, &serialLength, &data_format_); + DeserializeValue(&serialData, &serialLength, &input_dims_); + DeserializeValue(&serialData, &serialLength, &max_batch_size_); + DeserializeValue(&serialData, &serialLength, &data_type_); + DeserializeValue(&serialData, &serialLength, &data_format_); } size_t PluginTensorRT::getBaseSerializationSize() { - return (serialized_size(input_dims_) + serialized_size(max_batch_size_) + - serialized_size(data_type_) + serialized_size(data_format_)); + return (SerializedSize(input_dims_) + SerializedSize(max_batch_size_) + + SerializedSize(data_type_) + SerializedSize(data_format_)); } bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index 44869b390f..4d85e955a4 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -41,11 +41,7 @@ class PluginTensorRT : public nvinfer1::IPluginExt { size_t getWorkspaceSize(int) const override { return 0; } void terminate() override {} virtual ~PluginTensorRT() {} - - // The following functions need to be overrided in the subclass. - virtual nvinfer1::IPluginExt* clone() const = 0; - virtual const char* getPluginType() const = 0; - int initialize() override { return 0; } + // Check format support. The default is FLOAT32 and NCHW. 
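+  // (With the kHALF case removed from trt_plugin.cc above, only the
+  // kFLOAT / kNCHW combination is accepted for now.)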
bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const override; void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, @@ -53,12 +49,24 @@ class PluginTensorRT : public nvinfer1::IPluginExt { nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override; + + // *NOTE* The following functions need to be overrided in the subclass. + virtual nvinfer1::IPluginExt* clone() const = 0; + virtual const char* getPluginType() const = 0; + // Initialize the layer for execution. This is called when the engine is + // created. + int initialize() override { return 0; } + // Serialize the layer config to buffer. virtual void serialize(void* buffer) = 0; virtual size_t getSerializationSize() = 0; + virtual int enqueue(int batchSize, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) = 0; protected: + // Deserialize input_dims, max_batch_size, data_type, data_format void deserializeBase(void const*& serialData, size_t& serialLength); size_t getBaseSerializationSize(); + // Serialize input_dims, max_batch_size, data_type, data_format void serializeBase(void*& buffer); std::vector input_dims_; From 8e0616ebeed0a74d6efb836fcaff147862095203 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 13 Nov 2018 22:51:32 +0800 Subject: [PATCH 36/50] fix prelu test=develop --- python/paddle/fluid/layers/nn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d3623464e9..bbc2686091 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6811,7 +6811,7 @@ def prelu(x, mode, param_attr=None, name=None): alpha_shape = x.shape dtype = helper.input_dtype(input_param_name='x') alpha = helper.create_parameter( - attr=param_attr, + attr=helper.param_attr, shape=alpha_shape, dtype='float32', is_bias=False, From 51a538e0554ff08ee4cd80e1ecc849f564e3416e Mon Sep 17 00:00:00 2001 From: baojun-nervana Date: Tue, 13 Nov 2018 14:14:24 -0800 Subject: [PATCH 37/50] Fix style and use enum test=develop --- paddle/fluid/framework/ngraph_operator.cc | 80 ++++++++++++----------- paddle/fluid/framework/ngraph_operator.h | 18 ++--- 2 files changed, 51 insertions(+), 47 deletions(-) diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index 70e6f97b4c..d967b2780c 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -35,6 +35,13 @@ static std::map pd2ng_type_map = { {proto::VarType::BOOL, ngraph::element::boolean}, }; +typedef enum { /* nGraph support state on ops */ + FULL_TRAIN, /* Support full ops for train */ + PARTIAL_TRAIN, /* Support partial ops for train */ + FULL_TEST, /* Support full list of ops for test */ + PARTIAL_TEST /* Support partial list of ops for test */ +} op_state; + class NgraphOperator { public: explicit NgraphOperator(const Scope& scope, const platform::Place& place, @@ -44,33 +51,29 @@ class NgraphOperator { const std::unordered_set& persist, const std::unordered_set& fetches, const std::unordered_set& post_op_inputs, - int is_test_or_train) - : scope(scope), - place(place), - fused_ops(ops), - var_type_map(var_type_map), - persistables(persist), - fetches(fetches), - post_op_inputs(post_op_inputs), - is_test_or_train(is_test_or_train) {} + op_state ng_op_state) + : scope_(scope), + place_(place), + fused_ops_(ops), + var_type_map_(var_type_map), + persistables_(persist), + fetches_(fetches), + 
post_op_inputs_(post_op_inputs), + ng_op_state_(ng_op_state) {} void Run(const Scope& scope, const platform::Place& place) const; private: static std::unordered_map> func_cache; - const Scope& scope; - const platform::Place& place; - std::vector> fused_ops; - std::unordered_map var_type_map; - std::unordered_set persistables; - std::unordered_set fetches; - std::unordered_set post_op_inputs; - // 0 = default; 1 = (is_test && not is_complete) - // 2 = (is_test && is_complete) - // 3 = (is_training && not is_complete) - // 4 = (is_training && is_complete) - int is_test_or_train; + const Scope& scope_; + const platform::Place& place_; + std::vector> fused_ops_; + std::unordered_map var_type_map_; + std::unordered_set persistables_; + std::unordered_set fetches_; + std::unordered_set post_op_inputs_; + op_state ng_op_state_; }; std::vector>::iterator>> @@ -131,19 +134,19 @@ FusedOperator::FusedOperator( const ProgramDesc& prog, size_t block_id, std::vector>::iterator start, std::vector>::iterator end, - const std::string& type = "fused_op", const VariableNameMap& inputs = {}, - const VariableNameMap& outputs = {}, const AttributeMap& attrs = {}) + const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs), pdesc(prog), block(block_id) { for (std::vector>::iterator it = start; it != end; ++it) { - fused_ops.push_back(std::move(*it)); + fused_ops_.push_back(std::move(*it)); } for (std::vector>::iterator it = end; (*it)->Type() != kFetchOpType; ++it) { for (auto& var_name_item : (*it)->Inputs()) { for (auto& var_name : var_name_item.second) { - post_op_inputs.insert(var_name); + post_op_inputs_.insert(var_name); } } } @@ -152,11 +155,11 @@ FusedOperator::FusedOperator( is_complete = true; } - process(); + Process(); } -void FusedOperator::process() { - auto& bdesc = pdesc.Block(block); +void FusedOperator::Process() { + auto& bdesc = pdesc_.Block(block_); for (auto& var : bdesc.AllVars()) { if (!(var->GetType() == proto::VarType::SELECTED_ROWS || var->GetType() == proto::VarType::LOD_TENSOR || @@ -175,39 +178,40 @@ void FusedOperator::process() { PADDLE_THROW("Data type of var %s not found in pd2ng_type_map", var_name); } - var_type_map[var_name] = pd2ng_type_map[pd_type]; + var_type_map_[var_name] = pd2ng_type_map[pd_type]; } if (var->Persistable()) { - persistables.insert(var->Name()); + persistables_.insert(var->Name()); } } for (auto* op : bdesc.AllOps()) { if (op->Type() == kFetchOpType) { std::string fetch_target_name = op->Input("X")[0]; - fetches.insert(fetch_target_name); + fetches_.insert(fetch_target_name); } } } void FusedOperator::RunImpl(const Scope& scope, const platform::Place& place) const { - int is_test_or_train = 1; - auto& bdesc = pdesc.Block(block); + op_state ng_op_state = PARTIAL_TEST; + auto& bdesc = pdesc_.Block(block_); for (auto* op : bdesc.AllOps()) { if (op->Type().find("_grad") != std::string::npos) { - is_test_or_train = 3; + ng_op_state = PARTIAL_TRAIN; break; } } - if (is_complete) { - is_test_or_train = is_test_or_train == 1 ? 2 : 4; + if (is_full) { + ng_op_state = ng_op_state == PARTIAL_TEST ? 
FULL_TEST : FULL_TRAIN; } - NgraphOperator ngraph_op(scope, place, fused_ops, var_type_map, persistables, - fetches, post_op_inputs, is_test_or_train); + NgraphOperator ngraph_op(scope, place, fused_ops_, var_type_map_, + persistables_, fetches_, post_op_inputs_, + ng_op_state); ngraph_op.Run(scope, place); } diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h index eb77c78115..0f655cef1d 100644 --- a/paddle/fluid/framework/ngraph_operator.h +++ b/paddle/fluid/framework/ngraph_operator.h @@ -56,16 +56,16 @@ class FusedOperator : public OperatorBase { void RunImpl(const Scope& scope, const platform::Place& place) const final; private: - const ProgramDesc pdesc; - size_t block; - std::vector> fused_ops; - std::unordered_map var_type_map; - std::unordered_set persistables; - std::unordered_set fetches; - std::unordered_set post_op_inputs; - bool is_complete = false; + const ProgramDesc pdesc_; + size_t block_; + std::vector> fused_ops_; + std::unordered_map var_type_map_; + std::unordered_set persistables_; + std::unordered_set fetches_; + std::unordered_set post_op_inputs_; + bool is_full_ = false; - void process(); + void Process(); }; } // namespace framework } // namespace paddle From a61909ff47e559726bd51ad8694779b372e62636 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 14 Nov 2018 10:32:22 +0800 Subject: [PATCH 38/50] test=develop --- .../fluid/framework/ir/attention_lstm_fuse_pass.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index ecefab32bb..d61ff04bc7 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -212,11 +212,11 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0, float* out_data = out->mutable_data(platform::CPUPlace()); std::array tensors{ - {W_forget_w0.data(), W_input_w0.data(), - W_output_w0.data(), W_cell_w0.data()}}; + W_forget_w0.data(), W_input_w0.data(), + W_output_w0.data(), W_cell_w0.data()}; std::array tensors1{ - {W_forget_w1.data(), W_input_w1.data(), - W_output_w1.data(), W_cell_w1.data()}}; + W_forget_w1.data(), W_input_w1.data(), + W_output_w1.data(), W_cell_w1.data()}; for (int row = 0; row < D; row++) { for (int col = 0; col < 4; col++) { @@ -239,8 +239,8 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, const LoDTensor& B_output, const LoDTensor& B_cell, LoDTensor* out) { std::array tensors{ - {B_forget.data(), B_input.data(), B_output.data(), - B_cell.data()}}; + B_forget.data(), B_input.data(), B_output.data(), + B_cell.data()}; PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1); int D = B_forget.dims()[0]; From 447bf7c80b70dafb8369403c751dcb0572f88494 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 14 Nov 2018 11:26:33 +0800 Subject: [PATCH 39/50] test=develop --- cmake/inference_lib.cmake | 1 + paddle/fluid/inference/analysis/CMakeLists.txt | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index dc6906bfb3..729bdcb3dc 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -188,6 +188,7 @@ copy(inference_lib DEPS ${inference_deps} ${src_dir}/${module}/api/paddle_*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} + ) set(module "platform") 
copy(platform_lib DEPS profiler_py_proto diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 07a45ece02..344aecaae5 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -21,10 +21,6 @@ cc_library(analysis SRCS cc_test(test_dot SRCS dot_tester.cc DEPS analysis) -if(WIN32) - target_link_libraries(inference_analyzer shlwapi) -endif(WIN32) - function (inference_analysis_test TARGET) if(WITH_TESTING) set(options "") From 42c48c3a82201b871f8c90341074ebd9791901b0 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 14 Nov 2018 11:37:43 +0800 Subject: [PATCH 40/50] fix --- cmake/inference_lib.cmake | 1 + paddle/fluid/inference/analysis/CMakeLists.txt | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index dc6906bfb3..729bdcb3dc 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -188,6 +188,7 @@ copy(inference_lib DEPS ${inference_deps} ${src_dir}/${module}/api/paddle_*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} + ) set(module "platform") copy(platform_lib DEPS profiler_py_proto diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 07a45ece02..344aecaae5 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -21,10 +21,6 @@ cc_library(analysis SRCS cc_test(test_dot SRCS dot_tester.cc DEPS analysis) -if(WIN32) - target_link_libraries(inference_analyzer shlwapi) -endif(WIN32) - function (inference_analysis_test TARGET) if(WITH_TESTING) set(options "") From 08d1dc84a97f4a40daf82de42006a8e97cd81bcf Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 14 Nov 2018 11:40:05 +0800 Subject: [PATCH 41/50] fix --- paddle/fluid/framework/ir/node.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index 57ee426f73..f34ce62b1e 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -23,7 +23,6 @@ namespace ir { #else const char Node::kControlDepVarName[] = "__control_var"; #endif -int Node::count_ = 0; std::unique_ptr CreateNodeForTest(const std::string& name, Node::Type type) { From bae3659714ac7e033c220bb7c3df9400b6c02992 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 14 Nov 2018 14:47:47 +0800 Subject: [PATCH 42/50] more test test=develop --- paddle/fluid/pybind/pybind.cc | 6 +++--- python/paddle/fluid/tests/unittests/test_pass_builder.py | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 68b80c6311..50b7a08876 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -650,9 +650,9 @@ All parameter, weight, gradient are variables in Paddle. 
[](ir::Pass &self, const std::string &name, const std::string &attr) { self.Set(name, new std::string(attr)); }) - .def("set_int", [](ir::Pass &self, const std::string &name, int val) { - self.Set(name, new int(val)); - }); + .def("set_int", [](ir::Pass &self, const std::string &name, + int val) { self.Set(name, new int(val)); }) + .def("type", &ir::Pass::Type); py::class_> pb( m, "PassBuilder"); diff --git a/python/paddle/fluid/tests/unittests/test_pass_builder.py b/python/paddle/fluid/tests/unittests/test_pass_builder.py index 65ad63dc01..5a3ec8ff01 100644 --- a/python/paddle/fluid/tests/unittests/test_pass_builder.py +++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py @@ -94,7 +94,12 @@ class TestPassBuilder(unittest.TestCase): def test_parallel_testing_with_new_strategy(self): build_strategy = fluid.BuildStrategy() + self.assertFalse(build_strategy.fuse_elewise_add_act_ops) + build_strategy.fuse_elewise_add_act_ops = True pass_builder = build_strategy._finalize_strategy_and_create_passes() + self.assertTrue("fuse_elewise_add_act_pass" in + [p.type() for p in pass_builder.all_passes()]) + origin_len = len(pass_builder.all_passes()) viz_pass = pass_builder.append_pass("graph_viz_pass") From 83ddafb515c664ae0d8e37c1e1ed423c077b829e Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 14 Nov 2018 14:50:31 +0800 Subject: [PATCH 43/50] Splict cicheks jobs and expose anakin options (#14327) * Split cichecks test=develop * feat(Anakin): expose anakin options to paddle cmake option Expose ANAKIN_BUILD_FAT_BIN, ANAKIN_BUILD_CROSS_PLANTFORM to Paddle cmake option test=develop --- CMakeLists.txt | 2 ++ cmake/external/anakin.cmake | 8 +++++--- paddle/scripts/paddle_build.sh | 15 +++++++++++++++ 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 291a960b14..bd53604075 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,6 +66,8 @@ option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF) option(WITH_ANAKIN "Compile with Anakin library" OFF) +option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF) +option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(ON_INFER "Turn on inference optimization." 
OFF) diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index 84354c446e..06fc6061bc 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -58,19 +58,21 @@ ExternalProject_Add( -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER} + -DBUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN} + -DBUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR} ) message(STATUS "Anakin for inference is enabled") message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}") - +add_dependencies(extern_anakin protobuf mklml) add_library(anakin_shared SHARED IMPORTED GLOBAL) set_property(TARGET anakin_shared PROPERTY IMPORTED_LOCATION ${ANAKIN_SHARED_LIB}) -add_dependencies(anakin_shared extern_anakin protobuf mklml) +add_dependencies(anakin_shared extern_anakin) add_library(anakin_saber SHARED IMPORTED GLOBAL) set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB}) -add_dependencies(anakin_saber extern_anakin protobuf mklml) +add_dependencies(anakin_saber extern_anakin) list(APPEND external_project_dependencies anakin_shared anakin_saber) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index a51c9becd4..32f9bca645 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -156,6 +156,8 @@ function cmake_gen() { -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} + -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF} + -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON} -DPY_VERSION=${PY_VERSION:-2.7} -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} ======================================== @@ -188,6 +190,8 @@ EOF -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \ -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ + -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}\ + -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON}\ -DPY_VERSION=${PY_VERSION:-2.7} \ -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} @@ -777,6 +781,17 @@ function main() { test_fluid_lib assert_api_spec_approvals ;; + assert_api) + assert_api_not_changed ${PYTHON_ABI:-""} + ;; + test_inference) + gen_capi_package + gen_fluid_lib + test_fluid_lib + ;; + assert_api_approvals) + assert_api_spec_approvals + ;; maccheck) cmake_gen ${PYTHON_ABI:-""} build_mac From 1a9008c420cf95e38d49959c313705ebf3d3ff8c Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 14 Nov 2018 17:09:21 +0800 Subject: [PATCH 44/50] code style fix test=develop --- .../framework/ir/attention_lstm_fuse_pass.cc | 6 +- paddle/fluid/framework/ir/node.cc | 4 +- paddle/fluid/framework/ir/node.h | 4 +- paddle/fluid/framework/ir/pass.h | 36 ++++++------ paddle/fluid/framework/operator.cc | 6 +- paddle/fluid/inference/api/helper.h | 2 +- .../fluid/operators/elementwise_op_function.h | 4 +- paddle/fluid/operators/grid_sampler_op.h | 4 +- paddle/fluid/platform/init.cc | 6 +- paddle/fluid/platform/port.h | 56 +++++++++---------- paddle/fluid/platform/variant.h | 2 +- paddle/fluid/pybind/pybind.cc | 4 +- 12 files changed, 67 insertions(+), 67 deletions(-) diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index 
64d585c222..c436dd414d 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -213,10 +213,10 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0, float* out_data = out->mutable_data(platform::CPUPlace()); std::array tensors{ W_forget_w0.data(), W_input_w0.data(), - W_output_w0.data(), W_cell_w0.data()}; + W_output_w0.data(), W_cell_w0.data()}; std::array tensors1{ W_forget_w1.data(), W_input_w1.data(), - W_output_w1.data(), W_cell_w1.data()}; + W_output_w1.data(), W_cell_w1.data()}; for (int row = 0; row < D; row++) { for (int col = 0; col < 4; col++) { @@ -240,7 +240,7 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, LoDTensor* out) { std::array tensors{ B_forget.data(), B_input.data(), B_output.data(), - B_cell.data()}; + B_cell.data()}; PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1); int D = B_forget.dims()[0]; diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index f34ce62b1e..50d9113088 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -19,9 +19,9 @@ namespace framework { namespace ir { // msvc15 don't support constexpr in correct way. #if !defined(_WIN32) - constexpr char Node::kControlDepVarName[]; +constexpr char Node::kControlDepVarName[]; #else - const char Node::kControlDepVarName[] = "__control_var"; +const char Node::kControlDepVarName[] = "__control_var"; #endif std::unique_ptr CreateNodeForTest(const std::string& name, diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 21dd43bc1d..d2a393b3f1 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -56,9 +56,9 @@ class Node { enum class Type { kOperation, kVariable }; #if !defined(_WIN32) // msvc not support constexpr correctly. - static constexpr char kControlDepVarName[] = "__control_var"; + static constexpr char kControlDepVarName[] = "__control_var"; #else - static const char kControlDepVarName[]; + static const char kControlDepVarName[]; #endif Type NodeType() const { return type_; } diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index e8dd48a535..615b539695 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -197,26 +197,26 @@ struct PassRegistrar : public Registrar { msg) // Register a new pass that can be applied on the IR. 
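// For illustration, a registration would look like
//   REGISTER_PASS(graph_viz_pass, GraphVizPass);
// at namespace scope, with USE_PASS(graph_viz_pass) in any translation unit
// that needs the pass linked in.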
-#define REGISTER_PASS(pass_type, pass_class) \ - STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ - __reg_pass__##pass_type, \ - "REGISTER_PASS must be called in global namespace"); \ - static ::paddle::framework::ir::PassRegistrar \ - __pass_registrar_##pass_type##__(#pass_type); \ - int TouchPassRegistrar_##pass_type() { \ - __pass_registrar_##pass_type##__.Touch(); \ - return 0; \ - } \ - static ::paddle::framework::ir::PassRegistrar \ - &__pass_tmp_registrar_##pass_type##__ UNUSED = \ +#define REGISTER_PASS(pass_type, pass_class) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __reg_pass__##pass_type, \ + "REGISTER_PASS must be called in global namespace"); \ + static ::paddle::framework::ir::PassRegistrar \ + __pass_registrar_##pass_type##__(#pass_type); \ + int TouchPassRegistrar_##pass_type() { \ + __pass_registrar_##pass_type##__.Touch(); \ + return 0; \ + } \ + static ::paddle::framework::ir::PassRegistrar \ + &__pass_tmp_registrar_##pass_type##__ UNUSED = \ __pass_registrar_##pass_type##__ -#define USE_PASS(pass_type) \ - STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ - __use_pass_itself_##pass_type, \ - "USE_PASS must be called in global namespace"); \ - extern int TouchPassRegistrar_##pass_type(); \ - static int use_pass_itself_##pass_type##_ UNUSED = \ +#define USE_PASS(pass_type) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __use_pass_itself_##pass_type, \ + "USE_PASS must be called in global namespace"); \ + extern int TouchPassRegistrar_##pass_type(); \ + static int use_pass_itself_##pass_type##_ UNUSED = \ TouchPassRegistrar_##pass_type() } // namespace ir diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 36fe5724ea..6bd744edc2 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -150,9 +150,9 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { #endif } - // The profile has a process-wide mutex, results in serious performance issue - // in concurrency scenerio. Here use an `if` to fix this issue. - // Please not remove the `if`, ask @Superjomn if there are any concern. +// The profile has a process-wide mutex, results in serious performance issue +// in concurrency scenerio. Here use an `if` to fix this issue. +// Please not remove the `if`, ask @Superjomn if there are any concern. 
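+// (With profiling disabled, the `if` below lets Run() skip the profiler path
+// and its mutex entirely.)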
#ifndef _WIN32 if (platform::IsProfileEnabled()) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index ba72fba8be..6f9d663121 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -20,9 +20,9 @@ #else #endif -#include #include #include // NOLINT +#include #include #include #include diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index d7444bcfe0..7bb6934e14 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -112,7 +112,7 @@ class RowwiseTransformIterator } RowwiseTransformIterator &operator+(int n) { - while(n-- > 0) { + while (n-- > 0) { ++i_; if (UNLIKELY(i_ == n_)) { i_ = 0; @@ -161,7 +161,7 @@ class MidWiseTransformIterator } MidWiseTransformIterator &operator+(int n) { - while(n-- > 0) { + while (n-- > 0) { ++j_; if (UNLIKELY(j_ == post_)) { ++i_; diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h index 00fba457bb..08a6043eb0 100644 --- a/paddle/fluid/operators/grid_sampler_op.h +++ b/paddle/fluid/operators/grid_sampler_op.h @@ -67,10 +67,10 @@ static void CalcGridLocations(const platform::CPUDeviceContext& ctx, Tensor half_ymax; half_xmax.mutable_data({n, h, w}, ctx.GetPlace()); auto half_xmax_t = - EigenTensor::From(half_xmax).setConstant(0.5 * x_max); + EigenTensor::From(half_xmax).setConstant(0.5 * x_max); half_ymax.mutable_data({n, h, w}, ctx.GetPlace()); auto half_ymax_t = - EigenTensor::From(half_ymax).setConstant(0.5 * y_max); + EigenTensor::From(half_ymax).setConstant(0.5 * y_max); // scale grid to [0, h-1/w-1] auto grid_x_t = EigenTensor::From(grid_x); diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 6156067645..84d1b852cb 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -115,9 +115,9 @@ void InitDevices(bool init_p2p, const std::vector devices) { // windows has no support for openblas multi-thread #ifdef _WIN32 - if (FLAGS_paddle_num_threads > 1) { - FLAGS_paddle_num_threads = 1; - } + if (FLAGS_paddle_num_threads > 1) { + FLAGS_paddle_num_threads = 1; + } #endif #ifndef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index d3a6e28549..8823e97b0b 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -24,38 +24,38 @@ #include "glog/logging.h" #if !defined(_WIN32) - #include // dladdr - #include // backtrace - #include - #include // std::accumulate +#include // dladdr +#include // backtrace +#include +#include // std::accumulate #else - #include - #include // _popen, _pclose - #include - #include // std::accumulate in msvc - #ifndef S_ISDIR // windows port for sys/stat.h - #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) - #endif // S_ISDIR - - static void *dlsym(void *handle, const char *symbol_name) { - FARPROC found_symbol; - found_symbol = GetProcAddress((HMODULE)handle, symbol_name); - - if (found_symbol == NULL) { - throw std::runtime_error(std::string(symbol_name) + " not found."); - } - return reinterpret_cast(found_symbol); +#include // _popen, _pclose +#include +#include +#include // std::accumulate in msvc +#ifndef S_ISDIR // windows port for sys/stat.h +#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) +#endif // S_ISDIR + +static void *dlsym(void *handle, const char *symbol_name) { + FARPROC 
found_symbol; + found_symbol = GetProcAddress((HMODULE)handle, symbol_name); + + if (found_symbol == NULL) { + throw std::runtime_error(std::string(symbol_name) + " not found."); } + return reinterpret_cast(found_symbol); +} - static void *dlopen(const char *filename, int flag) { - std::string file_name(filename); - file_name.replace(0, file_name.size() - 1, '/', '\\'); - HMODULE hModule = LoadLibrary(file_name.c_str()); - if (!hModule) { - throw std::runtime_error(file_name + " not found."); - } - return reinterpret_cast(hModule); +static void *dlopen(const char *filename, int flag) { + std::string file_name(filename); + file_name.replace(0, file_name.size() - 1, '/', '\\'); + HMODULE hModule = LoadLibrary(file_name.c_str()); + if (!hModule) { + throw std::runtime_error(file_name + " not found."); } + return reinterpret_cast(hModule); +} #endif // !_WIN32 diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h index 1b10db8669..42bff087d2 100644 --- a/paddle/fluid/platform/variant.h +++ b/paddle/fluid/platform/variant.h @@ -46,7 +46,7 @@ limitations under the License. */ // some platform-independent defintion #if defined(_WIN32) #define UNUSED -#define __builtin_expect(EXP, C) (EXP) +#define __builtin_expect(EXP, C) (EXP) #else #define UNUSED __attribute__((unused)) #endif diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 6cba3395bf..592c40cf1c 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -352,7 +352,7 @@ All parameter, weight, gradient are variables in Paddle. [](Variable &self) { return self.GetMutable(); }, py::return_value_policy::reference) #if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32)) - .def("get_communicator", + .def("get_communicator", [](Variable &self) -> platform::Communicator * { return self.GetMutable(); }, @@ -364,7 +364,7 @@ All parameter, weight, gradient are variables in Paddle. }, py::return_value_policy::reference) #endif -; + ; #if !defined(_WIN32) py::class_(m, "Reader", "") From e2a1cd19f1602fff49fe5fccf54b96dd99ddcd90 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 14 Nov 2018 17:17:17 +0800 Subject: [PATCH 45/50] code style fix test=develop --- python/paddle/trainer_config_helpers/networks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 1e961b936f..b5cde7bac7 100644 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -1719,7 +1719,7 @@ def inputs(layers, *args): if len(args) != 0: layers.extend(args) - Inputs(* [l.name for l in layers]) + Inputs(*[l.name for l in layers]) def outputs(layers, *args): @@ -1769,7 +1769,7 @@ def outputs(layers, *args): assert len(layers) > 0 if HasInputsSet(): # input already set - Outputs(* [l.name for l in layers]) + Outputs(*[l.name for l in layers]) return # just return outputs. 
From e2a1cd19f1602fff49fe5fccf54b96dd99ddcd90 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Wed, 14 Nov 2018 17:17:17 +0800
Subject: [PATCH 45/50] code style fix test=develop

---
 python/paddle/trainer_config_helpers/networks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index 1e961b936f..b5cde7bac7 100644
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -1719,7 +1719,7 @@ def inputs(layers, *args):
     if len(args) != 0:
         layers.extend(args)

-    Inputs(* [l.name for l in layers])
+    Inputs(*[l.name for l in layers])


 def outputs(layers, *args):
@@ -1769,7 +1769,7 @@ def outputs(layers, *args):
     assert len(layers) > 0

     if HasInputsSet():  # input already set
-        Outputs(* [l.name for l in layers])
+        Outputs(*[l.name for l in layers])
         return  # just return outputs.

     if len(layers) != 1:

From 228e1934b81c1d25555c269c28923aef8d192154 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Wed, 14 Nov 2018 17:17:17 +0800
Subject: [PATCH 46/50] code style fix test=develop

---
 python/paddle/trainer_config_helpers/networks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index 1e961b936f..b5cde7bac7 100644
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -1719,7 +1719,7 @@ def inputs(layers, *args):
     if len(args) != 0:
         layers.extend(args)

-    Inputs(* [l.name for l in layers])
+    Inputs(*[l.name for l in layers])


 def outputs(layers, *args):
@@ -1769,7 +1769,7 @@ def outputs(layers, *args):
     assert len(layers) > 0

     if HasInputsSet():  # input already set
-        Outputs(* [l.name for l in layers])
+        Outputs(*[l.name for l in layers])
         return  # just return outputs.

     if len(layers) != 1:

From 15bdb7ef14f7ce9ff4c72d31a17ab9a1d03204d7 Mon Sep 17 00:00:00 2001
From: nhzlx
Date: Wed, 14 Nov 2018 10:31:17 +0000
Subject: [PATCH 47/50] delete error uploaded files test=develop

---
 .../tensorrt/plugin/.trt_plugin_utils.h.swp      | Bin 12288 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 paddle/fluid/inference/tensorrt/plugin/.trt_plugin_utils.h.swp

diff --git a/paddle/fluid/inference/tensorrt/plugin/.trt_plugin_utils.h.swp b/paddle/fluid/inference/tensorrt/plugin/.trt_plugin_utils.h.swp
deleted file mode 100644
index 08d1434089f792131d0e6a545ad8675b3ba4892c..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 12288
[about 1.5 KB of base85-encoded binary payload for the deleted vim swap file
omitted]

From b361579f09840910fd5ab6c3118b74b67f4939b6 Mon Sep 17 00:00:00 2001
From: Jacek Czaja
Date: Wed, 14 Nov 2018 12:20:36 +0100
Subject: [PATCH 48/50] - Softmax for Inference is enabled when ON_INFER is
 set test=develop

---
 paddle/fluid/operators/math/softmax.cc            |  6 ++-
 paddle/fluid/operators/math/softmax.cu            | 11 +++--
 paddle/fluid/operators/math/softmax.h             |  2 +-
 paddle/fluid/operators/math/softmax_impl.h        | 41 +++++++++++++++++--
 paddle/fluid/operators/softmax_op.h               |  7 +++-
 .../operators/softmax_with_cross_entropy_op.h     |  4 +-
 6 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/operators/math/softmax.cc b/paddle/fluid/operators/math/softmax.cc
index 78c65af24a..fa2018178f 100644
--- a/paddle/fluid/operators/math/softmax.cc
+++ b/paddle/fluid/operators/math/softmax.cc
@@ -19,8 +19,10 @@ namespace paddle {
 namespace operators {
 namespace math {

-template class SoftmaxFunctor<platform::CPUDeviceContext, float>;
-template class SoftmaxFunctor<platform::CPUDeviceContext, double>;
+template class SoftmaxFunctor<platform::CPUDeviceContext, float, true>;
+template class SoftmaxFunctor<platform::CPUDeviceContext, float, false>;
+template class SoftmaxFunctor<platform::CPUDeviceContext, double, true>;
+template class SoftmaxFunctor<platform::CPUDeviceContext, double, false>;
 template class SoftmaxGradFunctor<platform::CPUDeviceContext, float>;
 template class SoftmaxGradFunctor<platform::CPUDeviceContext, double>;

diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu
index ce183ed364..2e9669049e 100644
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -98,9 +98,14 @@ template class SoftmaxGradCUDNNFunctor<float>;
 template class SoftmaxGradCUDNNFunctor<double>;
 template class SoftmaxGradCUDNNFunctor<platform::float16>;

-template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, float>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, double>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16,
+                              false>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16,
+                              true>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, float, false>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, float, true>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, double, false>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, double, true>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext,
                                   platform::float16>;

diff --git a/paddle/fluid/operators/math/softmax.h b/paddle/fluid/operators/math/softmax.h
--- a/paddle/fluid/operators/math/softmax.h
+++ b/paddle/fluid/operators/math/softmax.h
@@ -21,7 +21,7 @@ namespace math {

-template <typename DeviceContext, typename T>
+template <typename DeviceContext, typename T, bool is_test>
 class SoftmaxFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor* X,

diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h
index dd9971ba09..7cf98f2725 100644
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -32,10 +32,10 @@ struct ValueClip {
   }
 };

-template <typename DeviceContext, typename T>
-void SoftmaxFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
-                                                  const framework::Tensor* X,
-                                                  framework::Tensor* Y) {
+template <typename DeviceContext, typename T, bool is_test>
+void SoftmaxFunctor<DeviceContext, T, is_test>::operator()(
+    const DeviceContext& context, const framework::Tensor* X,
+    framework::Tensor* Y) {
   auto logits = EigenMatrix<T>::From(*X);
   auto softmax = EigenMatrix<T>::From(*Y);

@@ -65,6 +65,39 @@ void SoftmaxFunctor<DeviceContext, T, is_test>::operator()(
       .broadcast(one_by_class));
 }

+template <typename DeviceContext, typename T>
+class SoftmaxFunctor<DeviceContext, T, true> {
+  void operator()(const DeviceContext& context, const framework::Tensor* X,
+                  framework::Tensor* Y) {
+    auto logits = EigenMatrix<T>::From(*X);
+    auto softmax = EigenMatrix<T>::From(*Y);
+
+    const int kBatchDim = 0;
+    const int kClassDim = 1;
+
+    const int batch_size = logits.dimension(kBatchDim);
+    const int num_classes = logits.dimension(kClassDim);
+
+    Eigen::DSizes<int, 1> along_class(kClassDim);
+    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+
+    auto shifted_logits = (logits -
+                           logits.maximum(along_class)
+                               .eval()
+                               .reshape(batch_by_one)
+                               .broadcast(one_by_class));
+
+    softmax.device(*context.eigen_device()) = shifted_logits.exp();
+    softmax.device(*context.eigen_device()) = (softmax *
+                                               softmax.sum(along_class)
+                                                   .inverse()
+                                                   .eval()
+                                                   .reshape(batch_by_one)
+                                                   .broadcast(one_by_class));
+  }
+};
+
 template <typename DeviceContext, typename T>
 void SoftmaxGradFunctor<DeviceContext, T>::operator()(
     const DeviceContext& context, const framework::Tensor* y,

diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h
index cf1eeb017d..2fea8a65bc 100644
--- a/paddle/fluid/operators/softmax_op.h
+++ b/paddle/fluid/operators/softmax_op.h
@@ -35,8 +35,13 @@ class SoftmaxKernel : public framework::OpKernel<T> {
     Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1);
     Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1);

-    math::SoftmaxFunctor<DeviceContext, T>()(
+#ifdef ON_INFER
+    math::SoftmaxFunctor<DeviceContext, T, true>()(
         context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
+#else
+    math::SoftmaxFunctor<DeviceContext, T, false>()(
+        context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
+#endif
   }
 };

diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
index e9aba3b37b..c0530e3d8b 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
@@ -42,8 +42,8 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {

     auto& dev_ctx =
         context.template device_context<platform::CPUDeviceContext>();
-    math::SoftmaxFunctor<platform::CPUDeviceContext, T>()(dev_ctx, logits,
-                                                          softmax);
+    math::SoftmaxFunctor<platform::CPUDeviceContext, T, false>()(
+        dev_ctx, logits, softmax);
     math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()(
         dev_ctx, loss, softmax, labels, context.Attr<bool>("soft_label"),
         context.Attr<int>("ignore_index"));
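Patch 48 selects the inference softmax at compile time through a bool template parameter. A self-contained sketch of that dispatch pattern (illustration only; the names and the -64 clip threshold are invented stand-ins for ValueClip, and the input is assumed non-empty):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

template <bool is_test>
struct SoftmaxSketch {
  // Primary template: training path, clips the shifted logit so exp() cannot
  // underflow into denormals (mirrors ValueClip in softmax_impl.h).
  void operator()(const std::vector<float>& x, std::vector<float>* y) const {
    float x_max = *std::max_element(x.begin(), x.end());
    y->resize(x.size());
    float sum = 0.f;
    for (std::size_t i = 0; i < x.size(); ++i) {
      float shifted = std::max(x[i] - x_max, -64.f);  // value clip
      sum += ((*y)[i] = std::exp(shifted));
    }
    for (float& v : *y) v /= sum;
  }
};

template <>
struct SoftmaxSketch<true> {
  // Inference specialization: identical math minus the clamp, one branch
  // fewer per element, chosen entirely at compile time.
  void operator()(const std::vector<float>& x, std::vector<float>* y) const {
    float x_max = *std::max_element(x.begin(), x.end());
    y->resize(x.size());
    float sum = 0.f;
    for (std::size_t i = 0; i < x.size(); ++i) {
      sum += ((*y)[i] = std::exp(x[i] - x_max));
    }
    for (float& v : *y) v /= sum;
  }
};

// Usage mirroring softmax_op.h in the patch:
//   #ifdef ON_INFER
//     SoftmaxSketch<true>()(x, &y);
//   #else
//     SoftmaxSketch<false>()(x, &y);
//   #endif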
From 0ef2a37c0e3675be44e4bb556ca601d7d43c79a7 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Wed, 14 Nov 2018 19:57:44 +0800
Subject: [PATCH 49/50] merge from develop

---
 .../fluid/inference/analysis/CMakeLists.txt | 27 ++++++++-----------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 344aecaae5..eb89fc5e11 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -21,22 +21,17 @@ cc_library(analysis SRCS

 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)

-function (inference_analysis_test TARGET)
-  if(WITH_TESTING)
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS ARGS EXTRA_DEPS)
-    cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    set(mem_opt "")
-    if(WITH_GPU)
-      set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
-    endif()
-    cc_test(${TARGET}
-      SRCS "${analysis_test_SRCS}"
-      DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
-      ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS})
-    set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
-  endif(WITH_TESTING)
+function(inference_analysis_test TARGET)
+  if(WITH_TESTING)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS ARGS EXTRA_DEPS)
+    cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    inference_base_test(${TARGET}
+                        SRCS ${analysis_test_SRCS}
+                        DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
+                        ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR} ${analysis_test_ARGS})
+  endif()
 endfunction(inference_analysis_test)

 inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api)
From 9e6b1c5f974bdb42c8ec5dc323f76d405e8017d8 Mon Sep 17 00:00:00 2001
From: Yiqun Liu
Date: Thu, 15 Nov 2018 10:28:52 +0800
Subject: [PATCH 50/50] Refine tester of TensorRT engine (#14390)

* Refine the tester for MixedRTPredictor.
  test=develop

* Enable the profiler in TensorRT engine.

* Support the use of combined inference model in TensorRT unittest, and print
  the shape of feed targets.

---
 .../api/analysis_predictor_tester.cc          |   2 +-
 .../api/demo_ci/simple_on_word2vec.cc         |   2 +-
 .../api/demo_ci/trt_mobilenet_demo.cc         |   2 +-
 .../inference/api/paddle_analysis_config.h    |   2 +
 .../fluid/inference/api/paddle_pass_builder.h |   4 +-
 .../fluid/inference/tests/api/CMakeLists.txt  |   3 +-
 .../tests/api/analyzer_dam_tester.cc          |   7 +-
 .../tests/api/analyzer_lac_tester.cc          |   6 +-
 .../tests/api/analyzer_ner_tester.cc          |   6 +-
 .../tests/api/analyzer_resnet50_tester.cc     |   6 +-
 .../tests/api/analyzer_rnn1_tester.cc         |  10 +-
 .../tests/api/analyzer_rnn2_tester.cc         |   6 +-
 .../tests/api/analyzer_seq_conv1_tester.cc    |   6 +-
 .../analyzer_text_classification_tester.cc    |   9 +-
 .../tests/api/analyzer_vis_tester.cc          |   6 +-
 .../inference/tests/api/config_printer.h      |  79 ++++++
 .../fluid/inference/tests/api/tester_helper.h |  87 +++++--
 .../inference/tests/api/trt_models_tester.cc  | 245 +++++++++---------
 18 files changed, 315 insertions(+), 173 deletions(-)
 create mode 100644 paddle/fluid/inference/tests/api/config_printer.h

diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index 1e6f75e364..d67305670c 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -15,7 +15,7 @@
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include
 #include
-#include <thread>
+#include <thread>  // NOLINT
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"

diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
index 6ae5198dab..3dd1d3c838 100644
--- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
@@ -23,7 +23,7 @@ limitations under the License. */
 #include
 #include  //NOLINT

-#include "utils.h"
+#include "utils.h"  // NOLINT

 DEFINE_string(dirname, "", "Directory of the inference model.");
 DEFINE_bool(use_gpu, false, "Whether use gpu.");

diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
index 72d20bc59e..0eb620ea51 100644
--- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

-http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,

diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 82c04e9f3f..2ac736df7c 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -49,6 +49,8 @@ struct AnalysisConfig : public NativeConfig {
   void EnableTensorRtEngine(int workspace_size = 1 << 20,
                             int max_batch_size = 1);

+  bool use_tensorrt() const { return use_tensorrt_; }
+
   // NOTE this is just for internal development, please not use it.
   // NOT stable yet.
   void EnableMKLDNN();

diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index 8aad5c5984..80658d3085 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -91,7 +91,7 @@ class CpuPassStrategy : public PassStrategy {

   virtual ~CpuPassStrategy() = default;

-  virtual void EnableMKLDNN() override {
+  void EnableMKLDNN() override {
     // TODO(Superjomn) Consider the way to mix CPU with GPU.
 #ifdef PADDLE_WITH_MKLDNN
     passes_.insert(passes_.begin(), "mkldnn_placement_pass");
@@ -123,7 +123,7 @@ class GpuPassStrategy : public PassStrategy {
   GpuPassStrategy(const GpuPassStrategy &other)
       : PassStrategy(other.AllPasses()) {}

-  virtual void EnableMKLDNN() override;
+  void EnableMKLDNN() override;

   virtual ~GpuPassStrategy() = default;
 };

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index fc3e44ffd7..4915f28f43 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -108,8 +108,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
   if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
     inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
   endif()
-  inference_analysis_test(test_trt_models SRCS trt_models_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor
-    ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
+    ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
 endif()

diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
index ceac5dc7e1..d1adc08667 100644
--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -178,7 +178,8 @@ TEST(Analyzer_dam, profile) {
   std::vector<PaddleTensor> outputs;
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     PADDLE_ENFORCE_GT(outputs.size(), 0);
@@ -216,7 +217,9 @@ TEST(Analyzer_dam, compare) {
   SetInput(&input_slots_all);

   if (FLAGS_use_analysis) {
-    CompareNativeAndAnalysis(cfg, input_slots_all);
+    CompareNativeAndAnalysis(
+        reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+        input_slots_all);
   }
 }

diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
index 5fb551810f..310852e2f7 100644
--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@@ -133,7 +133,8 @@ TEST(Analyzer_LAC, profile) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     // the first inference result
@@ -175,7 +176,8 @@ TEST(Analyzer_LAC, compare) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 } // namespace analysis

diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
index d91f7c314d..3a5f844de3 100644
--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -121,7 +121,8 @@ TEST(Analyzer_Chinese_ner, profile) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     // the first inference result
@@ -160,7 +161,8 @@ TEST(Analyzer_Chinese_ner, compare) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 } // namespace inference

diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
index 5c92096d9d..2b936175ed 100644
--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
@@ -45,7 +45,8 @@ void profile(bool use_mkldnn = false) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);
 }

 TEST(Analyzer_resnet50, profile) { profile(); }
@@ -74,7 +75,8 @@ void compare(bool use_mkldnn = false) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 TEST(Analyzer_resnet50, compare) { compare(); }

diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
index 612ae121b2..1ae2b4b03a 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -233,8 +233,8 @@ TEST(Analyzer_rnn1, profile) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  LOG(INFO) << "to test prediction";
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);
 }

 // Check the fuse status
@@ -261,7 +261,8 @@ TEST(Analyzer_rnn1, compare) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 // Test Multi-Thread.
@@ -272,7 +273,8 @@ TEST(Analyzer_rnn1, multi_thread) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, 4 /* multi_thread */);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, 4 /* multi_thread */);
 }

 // Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing

diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
index e0eb919bd8..e2985006f0 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
@@ -132,7 +132,8 @@ TEST(Analyzer_rnn2, profile) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     // the first inference result
@@ -153,7 +154,8 @@ TEST(Analyzer_rnn2, compare) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 } // namespace inference

diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
index f590ef2796..858191184a 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
@@ -161,7 +161,8 @@ TEST(Analyzer_seq_conv1, profile) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     // the first inference result
@@ -199,7 +200,8 @@ TEST(Analyzer_seq_conv1, compare) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 } // namespace inference

diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
index 05bffede47..34a241f070 100644
--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
@@ -74,7 +74,8 @@ TEST(Analyzer_Text_Classification, profile) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

   if (FLAGS_num_threads == 1) {
     // Get output
@@ -101,7 +102,8 @@ TEST(Analyzer_Text_Classification, compare) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) {
@@ -112,7 +114,8 @@ TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 } // namespace inference
diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
index 8fafd25b78..16e1011dda 100644
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -94,7 +94,8 @@ void profile(bool use_mkldnn = false) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     const float ocr_result_data[] = {
@@ -136,7 +137,8 @@ void compare(bool use_mkldnn = false) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 TEST(Analyzer_vis, compare) { compare(); }

diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h
new file mode 100644
index 0000000000..aa0c4b1d04
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/config_printer.h
@@ -0,0 +1,79 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <sstream>
+#include <string>
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+
+namespace paddle {
+namespace inference {
+
+thread_local int num_spaces = 0;
+
+static std::string GenSpaces(int num_spaces) {
+  std::ostringstream os;
+  for (int i = 0; i < num_spaces; ++i) {
+    os << " ";
+  }
+  return os.str();
+}
+
+std::ostream &operator<<(std::ostream &os,
+                         const PaddlePredictor::Config &config) {
+  os << GenSpaces(num_spaces) << "PaddlePredictor::Config {\n";
+  num_spaces++;
+  os << GenSpaces(num_spaces) << "model_dir: " << config.model_dir << "\n";
+  num_spaces--;
+  os << GenSpaces(num_spaces) << "}\n";
+  return os;
+}
+
+std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
+  os << GenSpaces(num_spaces) << "NativeConfig {\n";
+  num_spaces++;
+  os << *reinterpret_cast<const PaddlePredictor::Config *>(&config);
+  os << GenSpaces(num_spaces) << "use_gpu: " << config.use_gpu << "\n";
+  os << GenSpaces(num_spaces) << "device: " << config.device << "\n";
+  os << GenSpaces(num_spaces)
+     << "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n";
+  os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n";
+  os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
+  os << GenSpaces(num_spaces)
+     << "specify_input_name: " << config.specify_input_name << "\n";
+  num_spaces--;
+  os << GenSpaces(num_spaces) << "}\n";
+  return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+                         const contrib::AnalysisConfig &config) {
+  os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n";
+  num_spaces++;
+  os << *reinterpret_cast<const NativeConfig *>(&config);
+  os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim
+     << "\n";
+  os << GenSpaces(num_spaces)
+     << "use_feed_fetch_ops: " << config.use_feed_fetch_ops << "\n";
+  os << GenSpaces(num_spaces) << "use_tensorrt: " << config.use_tensorrt()
+     << "\n";
+  os << GenSpaces(num_spaces) << "use_mkldnn: " << config.use_mkldnn() << "\n";
+  num_spaces--;
+  os << GenSpaces(num_spaces) << "}\n";
+  return os;
+}
+
+} // namespace inference
+} // namespace paddle
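A quick usage sketch (not part of the patch) for the stream operators config_printer.h just defined. The operators live in paddle::inference, so the call site below sits in that namespace as the testers do; the model path is a placeholder.

#include <iostream>
#include "paddle/fluid/inference/tests/api/config_printer.h"

namespace paddle {
namespace inference {

void DumpConfig() {
  contrib::AnalysisConfig config(/*use_gpu=*/true);
  config.model_dir = "/path/to/model";  // placeholder path
  config.EnableTensorRtEngine(1 << 10, /*max_batch_size=*/1);
  // Each operator<< above first streams the reinterpret_cast'ed base config,
  // so the output nests PaddlePredictor::Config inside NativeConfig inside
  // contrib::AnalysisConfig, indented one level per GenSpaces step.
  std::cout << config;
}

}  // namespace inference
}  // namespace paddle

int main() {
  paddle::inference::DumpConfig();
  return 0;
}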
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index ab4ab20b58..a404691413 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -19,13 +19,16 @@
 #include
 #include  // NOLINT
 #include
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
+
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/tests/api/config_printer.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 #include "paddle/fluid/platform/profiler.h"

@@ -38,10 +41,18 @@ DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
 DEFINE_bool(use_analysis, true,
             "Running the inference program in analysis mode.");

+DECLARE_bool(profile);
+
 namespace paddle {
 namespace inference {

-using contrib::AnalysisConfig;
+void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
+  if (use_analysis) {
+    LOG(INFO) << *reinterpret_cast<const contrib::AnalysisConfig *>(config);
+    return;
+  }
+  LOG(INFO) << *config;
+}

 void CompareResult(const std::vector<PaddleTensor> &outputs,
                    const std::vector<PaddleTensor> &ref_outputs) {
@@ -77,12 +88,13 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
 }

 std::unique_ptr<PaddlePredictor> CreateTestPredictor(
-    const AnalysisConfig &config, bool use_analysis = true) {
+    const PaddlePredictor::Config *config, bool use_analysis = true) {
   if (use_analysis) {
-    return CreatePaddlePredictor<contrib::AnalysisConfig>(config);
-  } else {
-    return CreatePaddlePredictor<NativeConfig>(config);
+    return CreatePaddlePredictor<contrib::AnalysisConfig>(
+        *(reinterpret_cast<const contrib::AnalysisConfig *>(config)));
   }
+  return CreatePaddlePredictor<NativeConfig>(
+      *(reinterpret_cast<const NativeConfig *>(config)));
 }

 size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }
@@ -111,11 +123,23 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
 }

 void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
-                       const std::string &dirname) {
+                       const std::string &dirname, bool is_combined = true,
+                       std::string model_filename = "model",
+                       std::string params_filename = "params") {
   // Set fake_image_data
   PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
-  std::vector<std::vector<int64_t>> feed_target_shapes =
-      GetFeedTargetShapes(dirname, true, "model", "params");
+  std::vector<std::vector<int64_t>> feed_target_shapes = GetFeedTargetShapes(
+      dirname, is_combined, model_filename, params_filename);
+  std::ostringstream os;
+  for (size_t i = 0; i < feed_target_shapes.size(); ++i) {
+    os << "feed target " << i << ": {" << feed_target_shapes[i][0];
+    for (size_t j = 1; j < feed_target_shapes[i].size(); ++j) {
+      os << ", " << feed_target_shapes[i][j];
+    }
+    os << "}\n";
+  }
+  LOG(INFO) << os.str();
+
   int dim1 = feed_target_shapes[0][1];
   int dim2 = feed_target_shapes[0][2];
   int dim3 = feed_target_shapes[0][3];
@@ -139,25 +163,43 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
 }

 void TestOneThreadPrediction(
-    const AnalysisConfig &config,
+    const PaddlePredictor::Config *config,
     const std::vector<std::vector<PaddleTensor>> &inputs,
     std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
   int batch_size = FLAGS_batch_size;
   int num_times = FLAGS_repeat;
   auto predictor = CreateTestPredictor(config, use_analysis);
-  Timer timer;
-  timer.tic();
-  for (int i = 0; i < num_times; i++) {
-    for (size_t j = 0; j < inputs.size(); j++) {
-      predictor->Run(inputs[j], outputs);
+
+  // warmup run
+  LOG(INFO) << "Warm up run...";
+  {
+    Timer warmup_timer;
+    warmup_timer.tic();
+    predictor->Run(inputs[0], outputs, batch_size);
+    PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1);
+#if !defined(_WIN32)
+    if (FLAGS_profile) {
+      paddle::platform::ResetProfiler();
+    }
+#endif
+  }
+
+  LOG(INFO) << "Run " << num_times << " times...";
+  {
+    Timer run_timer;
+    run_timer.tic();
+    for (int i = 0; i < num_times; i++) {
+      for (size_t j = 0; j < inputs.size(); j++) {
+        predictor->Run(inputs[j], outputs, batch_size);
+      }
     }
+    PrintTime(batch_size, num_times, 1, 0, run_timer.toc() / num_times,
+              inputs.size());
   }
-  PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times,
-            inputs.size());
 }

 void TestMultiThreadPrediction(
-    const AnalysisConfig &config,
+    const PaddlePredictor::Config *config,
     const std::vector<std::vector<PaddleTensor>> &inputs,
     std::vector<PaddleTensor> *outputs, int num_threads,
     bool use_analysis = true) {
@@ -200,12 +242,11 @@ void TestMultiThreadPrediction(
   }
 }

-void TestPrediction(const AnalysisConfig &config,
+void TestPrediction(const PaddlePredictor::Config *config,
                     const std::vector<std::vector<PaddleTensor>> &inputs,
                     std::vector<PaddleTensor> *outputs, int num_threads,
                     bool use_analysis = FLAGS_use_analysis) {
-  LOG(INFO) << "use_analysis: " << use_analysis
-            << ", use_mkldnn: " << config.use_mkldnn();
+  PrintConfig(config, use_analysis);
   if (num_threads == 1) {
     TestOneThreadPrediction(config, inputs, outputs, use_analysis);
   } else {
@@ -215,9 +256,9 @@ void TestPrediction(const PaddlePredictor::Config *config,
 }

 void CompareNativeAndAnalysis(
-    const AnalysisConfig &config,
+    const PaddlePredictor::Config *config,
     const std::vector<std::vector<PaddleTensor>> &inputs) {
-  LOG(INFO) << "use_mkldnn: " << config.use_mkldnn();
+  PrintConfig(config, true);
   std::vector<PaddleTensor> native_outputs, analysis_outputs;
   TestOneThreadPrediction(config, inputs, &native_outputs, false);
   TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
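The warm-up-then-measure pattern just added to TestOneThreadPrediction, reduced to a self-contained sketch (illustration only; the workload and iteration counts are invented stand-ins for predictor->Run and FLAGS_repeat). Running the payload once before starting the timer keeps one-time costs such as lazy initialization out of the reported average, which is exactly why the patch resets the profiler after the warm-up run.

#include <chrono>
#include <cmath>
#include <cstdio>

// Stand-in for predictor->Run(...): any deterministic workload will do.
static double Workload(int n) {
  double acc = 0.0;
  for (int i = 1; i <= n; ++i) acc += std::sqrt(static_cast<double>(i));
  return acc;
}

int main() {
  const int num_times = 10;  // stand-in for FLAGS_repeat

  // Warm-up run: not timed, absorbs one-time setup costs.
  Workload(1 << 20);

  auto start = std::chrono::steady_clock::now();
  double sink = 0.0;
  for (int i = 0; i < num_times; ++i) {
    sink += Workload(1 << 20);
  }
  auto stop = std::chrono::steady_clock::now();
  std::chrono::duration<double, std::milli> elapsed = stop - start;
  // Report the average per-iteration latency, as PrintTime does.
  std::printf("avg latency: %.3f ms (sink=%g)\n",
              elapsed.count() / num_times, sink);
  return 0;
}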
diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc
index 71423154f8..922feba10f 100644
--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -1,148 +1,149 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */

 #include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+
 #include "paddle/fluid/inference/tests/api/tester_helper.h"

 namespace paddle {
-using paddle::contrib::AnalysisConfig;
-
-DEFINE_string(dirname, "", "Directory of the inference model.");
-
-NativeConfig GetConfigNative() {
-  NativeConfig config;
-  config.model_dir = FLAGS_dirname;
-  // LOG(INFO) << "dirname  " << config.model_dir;
-  config.fraction_of_gpu_memory = 0.15;
-  config.use_gpu = true;
-  config.device = 0;
-  return config;
-}
-
-void PrepareTRTConfig(AnalysisConfig *config) {
-  config->model_dir = FLAGS_dirname + "/" + "mobilenet";
-  config->fraction_of_gpu_memory = 0.15;
-  config->EnableTensorRtEngine(1 << 10, 5);
-  config->pass_builder()->DeletePass("conv_bn_fuse_pass");
-  config->pass_builder()->DeletePass("fc_fuse_pass");
-  config->pass_builder()->TurnOnDebug();
+namespace inference {
+
+DEFINE_bool(use_tensorrt, true, "Test the performance of TensorRT engine.");
+DEFINE_string(prog_filename, "", "Name of model file.");
+DEFINE_string(param_filename, "", "Name of parameters file.");
+
+template <typename ConfigType>
+void SetConfig(ConfigType* config, std::string model_dir, bool use_gpu,
+               bool use_tensorrt = false, int batch_size = -1) {
+  if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
+    config->prog_file = model_dir + "/" + FLAGS_prog_filename;
+    config->param_file = model_dir + "/" + FLAGS_param_filename;
+  } else {
+    config->model_dir = model_dir;
+  }
+  if (use_gpu) {
+    config->use_gpu = true;
+    config->device = 0;
+    config->fraction_of_gpu_memory = 0.15;
+  }
 }

-void PrepareInputs(std::vector<PaddleTensor> *tensors, int batch_size) {
-  PADDLE_ENFORCE_EQ(tensors->size(), 1UL);
-  auto &tensor = tensors->front();
-  int height = 224;
-  int width = 224;
-  float *data = new float[batch_size * 3 * height * width];
-  memset(data, 0, sizeof(float) * (batch_size * 3 * height * width));
-  data[0] = 1.0f;
-
-  // Prepare inputs
-  tensor.name = "input_0";
-  tensor.shape = std::vector<int>({batch_size, 3, height, width});
-  tensor.data = PaddleBuf(static_cast<void *>(data),
-                          sizeof(float) * (batch_size * 3 * height * width));
-  tensor.dtype = PaddleDType::FLOAT32;
+template <>
+void SetConfig<contrib::AnalysisConfig>(contrib::AnalysisConfig* config,
+                                        std::string model_dir, bool use_gpu,
+                                        bool use_tensorrt, int batch_size) {
+  if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
+    config->prog_file = model_dir + "/" + FLAGS_prog_filename;
+    config->param_file = model_dir + "/" + FLAGS_param_filename;
+  } else {
+    config->model_dir = model_dir;
+  }
+  if (use_gpu) {
+    config->use_gpu = true;
+    config->device = 0;
+    config->fraction_of_gpu_memory = 0.15;
+    if (use_tensorrt) {
+      config->EnableTensorRtEngine(1 << 10, batch_size);
+      config->pass_builder()->DeletePass("conv_bn_fuse_pass");
+      config->pass_builder()->DeletePass("fc_fuse_pass");
+      config->pass_builder()->TurnOnDebug();
+    } else {
+      config->enable_ir_optim = true;
+    }
+  }
 }

-void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
-  auto config0 = GetConfigNative();
-  config0.model_dir = model_dirname;
-
-  AnalysisConfig config1(true);
-  PrepareTRTConfig(&config1);
-  config1.model_dir = model_dirname;
-
-  auto predictor0 = CreatePaddlePredictor(config0);
-  auto predictor1 = CreatePaddlePredictor(config1);
-
-  // Prepare inputs
-  std::vector<PaddleTensor> paddle_tensor_feeds(1);
-  PrepareInputs(&paddle_tensor_feeds, batch_size);
-
-  // Prepare outputs
-  std::vector<PaddleTensor> outputs0;
-  std::vector<PaddleTensor> outputs1;
-  CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0));
-  CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
-
-  const size_t num_elements = outputs0.front().data.length() / sizeof(float);
-  const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
-  EXPECT_EQ(num_elements, num_elements1);
-
-  auto *data0 = static_cast<float *>(outputs0.front().data.data());
-  auto *data1 = static_cast<float *>(outputs1.front().data.data());
-
-  ASSERT_GT(num_elements, 0UL);
-  for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
-    EXPECT_NEAR(data0[i], data1[i], 1e-3);
+void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) {
+  std::vector<std::vector<PaddleTensor>> inputs_all;
+  if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
+    SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename,
+                      FLAGS_param_filename);
+  } else {
+    SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
   }
-}

-TEST(trt_models_test, mobilenet) {
-  CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "mobilenet");
-}
-TEST(trt_models_test, resnet50) {
-  CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "resnet50");
-}
-TEST(trt_models_test, resnext50) {
-  CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "resnext50");
+  std::vector<PaddleTensor> outputs;
+  if (use_analysis || use_tensorrt) {
+    contrib::AnalysisConfig config(true);
+    SetConfig(&config, model_dir, true, use_tensorrt,
+              FLAGS_batch_size);
+    TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&config),
+                   inputs_all, &outputs, FLAGS_num_threads, true);
+  } else {
+    NativeConfig config;
+    SetConfig(&config, model_dir, true, false);
+    TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&config),
+                   inputs_all, &outputs, FLAGS_num_threads, false);
+  }
 }

-TEST(trt_models_test, raw_gpu) {
-  std::string model_dir = FLAGS_dirname + "/" + "mobilenet";
-  auto config0 = GetConfigNative();
-  config0.model_dir = model_dir;
-  int batch_size = 2;
-
-  AnalysisConfig config1(true);
-  config1.fraction_of_gpu_memory = 0.1;
-  config1.enable_ir_optim = true;
-  config1.model_dir = model_dir;
+void compare(std::string model_dir, bool use_tensorrt) {
+  std::vector<std::vector<PaddleTensor>> inputs_all;
+  if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
+    SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename,
+                      FLAGS_param_filename);
+  } else {
+    SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
  }

-  auto predictor0 = CreatePaddlePredictor(config0);
-  auto predictor1 = CreatePaddlePredictor(config1);
+  std::vector<PaddleTensor> native_outputs;
+  NativeConfig native_config;
+  SetConfig(&native_config, model_dir, true, false,
+            FLAGS_batch_size);
+  TestOneThreadPrediction(
+      reinterpret_cast<const PaddlePredictor::Config *>(&native_config),
+      inputs_all, &native_outputs, false);
+
+  std::vector<PaddleTensor> analysis_outputs;
+  contrib::AnalysisConfig analysis_config(true);
+  SetConfig(&analysis_config, model_dir, true,
+            use_tensorrt, FLAGS_batch_size);
+  TestOneThreadPrediction(
+      reinterpret_cast<const PaddlePredictor::Config *>(&analysis_config),
+      inputs_all, &analysis_outputs, true);
+
+  CompareResult(native_outputs, analysis_outputs);
+}

-  // Prepare inputs
-  std::vector<PaddleTensor> paddle_tensor_feeds(1);
-  PrepareInputs(&paddle_tensor_feeds, batch_size);
+TEST(TensorRT_mobilenet, compare) {
+  std::string model_dir = FLAGS_infer_model + "/mobilenet";
+  compare(model_dir, /* use_tensorrt */ true);
+}

-  // Prepare outputs
-  std::vector<PaddleTensor> outputs0;
-  std::vector<PaddleTensor> outputs1;
-  CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0));
-  CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
+TEST(TensorRT_resnet50, compare) {
+  std::string model_dir = FLAGS_infer_model + "/resnet50";
+  compare(model_dir, /* use_tensorrt */ true);
+}

-  const size_t num_elements = outputs0.front().data.length() / sizeof(float);
-  const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
-  EXPECT_EQ(num_elements, num_elements1);
+TEST(TensorRT_resnext50, compare) {
+  std::string model_dir = FLAGS_infer_model + "/resnext50";
+  compare(model_dir, /* use_tensorrt */ true);
+}

-  auto *data0 = static_cast<float *>(outputs0.front().data.data());
-  auto *data1 = static_cast<float *>(outputs1.front().data.data());
+TEST(TensorRT_resnext50, profile) {
+  std::string model_dir = FLAGS_infer_model + "/resnext50";
+  profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt);
+}

-  ASSERT_GT(num_elements, 0UL);
-  for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
-    EXPECT_NEAR(data0[i], data1[i], 1e-3);
-  }
+TEST(TensorRT_mobilenet, analysis) {
+  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
+  compare(model_dir, /* use_tensorrt */ false);
 }

+} // namespace inference
 } // namespace paddle

 USE_PASS(tensorrt_subgraph_pass);
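To close, a self-contained sketch (illustration only, not Paddle code) of the elementwise-tolerance comparison that both the old EXPECT_NEAR loop and the new CompareResult/compare() path rely on: run the same input through a reference engine and an optimized engine, then require agreement within the testers' 1e-3 bound. The float-vs-double "engines" below are invented stand-ins for the native and TensorRT predictors.

#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

// Reference path: accumulates in double (stands in for the native predictor).
std::vector<float> RunReference(const std::vector<float>& in) {
  std::vector<float> out(in.size());
  double acc = 0.0;
  for (size_t i = 0; i < in.size(); ++i) {
    acc += in[i];
    out[i] = static_cast<float>(acc);
  }
  return out;
}

// Optimized path: accumulates in float (stands in for the TensorRT predictor).
std::vector<float> RunOptimized(const std::vector<float>& in) {
  std::vector<float> out(in.size());
  float acc = 0.f;
  for (size_t i = 0; i < in.size(); ++i) {
    acc += in[i];
    out[i] = acc;
  }
  return out;
}

int main() {
  std::vector<float> input(1000, 0.001f);
  std::vector<float> ref = RunReference(input);
  std::vector<float> opt = RunOptimized(input);
  assert(ref.size() == opt.size());
  for (size_t i = 0; i < ref.size(); ++i) {
    // Same spirit as EXPECT_NEAR(data0[i], data1[i], 1e-3) in the old tester
    // and CompareResult in tester_helper.h.
    assert(std::fabs(ref[i] - opt[i]) < 1e-3f);
  }
  std::puts("outputs agree within 1e-3");
  return 0;
}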