diff --git a/CMakeLists.txt b/CMakeLists.txt index e4442d2549..61f5e63098 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,12 +25,18 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") if(WIN32) + set(CMAKE_SUPPRESS_REGENERATION ON) set(CMAKE_STATIC_LIBRARY_PREFIX lib) add_definitions("/DGOOGLE_GLOG_DLL_DECL=") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838) + set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221") + set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") endif(WIN32) find_package(CUDA QUIET) diff --git a/README.md b/README.md index 32a302cc54..68421cf177 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # PaddlePaddle +English | [简体中文](./README_cn.md) [](https://travis-ci.org/PaddlePaddle/Paddle) [](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) @@ -7,7 +8,6 @@ [](https://github.com/PaddlePaddle/Paddle/releases) [](LICENSE) - Welcome to the PaddlePaddle GitHub. PaddlePaddle (PArallel Distributed Deep LEarning) is an easy-to-use, @@ -18,16 +18,6 @@ learning to many products at Baidu. Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. - -欢迎来到 PaddlePaddle GitHub - -PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效灵活、可扩展的深度学习平台,最初由百度科学家和工程师共同开发,目的是将深度学习技术应用到百度的众多产品中。 - -我们的愿景是让每个人都能通过PaddlePaddle接触深度学习 - -跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) - - ### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) ### Install Latest Stable Release: ``` @@ -43,23 +33,6 @@ pip install paddlepaddle-gpu==1.2.0.post85 # For installation on other platform, refer to http://paddlepaddle.org/ ``` - -### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) -### 安装最新稳定版本: -``` -# Linux CPU -pip install paddlepaddle -# Linux GPU cuda9cudnn7 -pip install paddlepaddle-gpu -# Linux GPU cuda8cudnn7 -pip install paddlepaddle-gpu==1.2.0.post87 -# Linux GPU cuda8cudnn5 -pip install paddlepaddle-gpu==1.2.0.post85 - -# 其他平台上的安装指引请参考 http://paddlepaddle.org/ -``` - - ## Features - **Flexibility** @@ -100,38 +73,10 @@ pip install paddlepaddle-gpu==1.2.0.post85 Baidu and it has achieved a significant impact. We hope you can also explore the capability of PaddlePaddle to make an impact on your product. 
-## 特点 - -- **灵活性** - - PaddlePaddle支持丰富的神经网络架构和优化算法。易于配置复杂模型,例如带有注意力机制或复杂记忆连接的神经网络机器翻译模型。 - -- **高效性** - - 为了高效使用异步计算资源,PaddlePaddle对框架的不同层进行优化,包括计算、存储、架构和通信。下面是一些样例: - - - 通过SSE/AVX 内置函数、BLAS库(例如MKL、OpenBLAS、cuBLAS)或定制的CPU/GPU内核优化数学操作。 - - 通过MKL-DNN库优化CNN网络 - - 高度优化循环网络,无需执行 `padding` 操作即可处理 **变长** 序列 - - 针对高维稀疏数据模型,优化了局部和分布式训练。 - - -- **稳定性** - - 有了 PaddlePaddle,使得利用各种CPU/GPU和机器来加速训练变得简单。PaddlePaddle 通过优化通信可以实现巨大吞吐量和快速执行。 - -- **连接产品** - - 另外,PaddlePaddle 的设计也易于部署。在百度,PaddlePaddle 已经部署到含有巨大用户量的产品和服务上,包括广告点击率(CTR)预测、大规模图像分类、光学字符识别(OCR)、搜索排序,计算机病毒检测、推荐系统等等。PaddlePaddle广泛应用于百度产品中,产生了非常重要的影响。我们希望您也能探索 PaddlePaddle 的能力,为您的产品创造新的影响力和效果。 - ## Installation It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website. -## 安装 - -推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) - ## Documentation We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and @@ -153,37 +98,9 @@ We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarte We appreciate your contributions! -## 文档 - -我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)和 -[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) 文档 - -- [深度学习101](https://github.com/PaddlePaddle/book) - - 或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行 - -- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html) - - 可以在MPI集群上运行分布式训练任务 - -- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html) - - 新的API支持代码更少更简洁的程序 - -- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html) - - 欢迎您的贡献! - ## Ask Questions You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues). -## 答疑 - -欢迎您将问题和bug报告以[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)的形式提交 - ## Copyright and License PaddlePaddle is provided under the [Apache-2.0 license](LICENSE). 
- -## 版权和许可证 -PaddlePaddle由[Apache-2.0 license](LICENSE)提供 diff --git a/README_cn.md b/README_cn.md new file mode 100644 index 0000000000..dfb55b17ca --- /dev/null +++ b/README_cn.md @@ -0,0 +1,88 @@ +# PaddlePaddle + +[English](./README.md) | 简体中文 + +[](https://travis-ci.org/PaddlePaddle/Paddle) +[](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) +[](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) +[](https://github.com/PaddlePaddle/Paddle/releases) +[](LICENSE) + +欢迎来到 PaddlePaddle GitHub + +PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效灵活、可扩展的深度学习平台,最初由百度科学家和工程师共同开发,目的是将深度学习技术应用到百度的众多产品中。 + +我们的愿景是让每个人都能通过PaddlePaddle接触深度学习 + +跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) + +### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) +### 安装最新稳定版本: +``` +# Linux CPU +pip install paddlepaddle +# Linux GPU cuda9cudnn7 +pip install paddlepaddle-gpu +# Linux GPU cuda8cudnn7 +pip install paddlepaddle-gpu==1.2.0.post87 +# Linux GPU cuda8cudnn5 +pip install paddlepaddle-gpu==1.2.0.post85 + +# 其他平台上的安装指引请参考 http://paddlepaddle.org/ +``` + +## 特性 + +- **灵活性** + + PaddlePaddle支持丰富的神经网络架构和优化算法。易于配置复杂模型,例如带有注意力机制或复杂记忆连接的神经网络机器翻译模型。 + +- **高效性** + + 为了高效使用异步计算资源,PaddlePaddle对框架的不同层进行优化,包括计算、存储、架构和通信。下面是一些样例: + + - 通过SSE/AVX 内置函数、BLAS库(例如MKL、OpenBLAS、cuBLAS)或定制的CPU/GPU内核优化数学操作。 + - 通过MKL-DNN库优化CNN网络 + - 高度优化循环网络,无需执行 `padding` 操作即可处理 **变长** 序列 + - 针对高维稀疏数据模型,优化了局部和分布式训练。 + + +- **稳定性** + + 有了 PaddlePaddle,使得利用各种CPU/GPU和机器来加速训练变得简单。PaddlePaddle 通过优化通信可以实现巨大吞吐量和快速执行。 + +- **与产品相连** + + 另外,PaddlePaddle 的设计也易于部署。在百度,PaddlePaddle 已经部署到含有巨大用户量的产品和服务上,包括广告点击率(CTR)预测、大规模图像分类、光学字符识别(OCR)、搜索排序,计算机病毒检测、推荐系统等等。PaddlePaddle广泛应用于百度产品中,产生了非常重要的影响。我们希望您也能探索 PaddlePaddle 的能力,为您的产品创造新的影响力和效果。 + +## 安装 + +推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) + +## 文档 + +我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)和 +[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) 文档 + +- [深度学习101](https://github.com/PaddlePaddle/book) + + 或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行 + +- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html) + + 可以在MPI集群上运行分布式训练任务 + +- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html) + + 新的API支持代码更少更简洁的程序 + +- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html) + + 欢迎您的贡献! 
+
+## 答疑
+
+欢迎您将问题和bug报告以[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)的形式提交
+
+## 版权和许可证
+PaddlePaddle由[Apache-2.0 license](LICENSE)提供
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 076e839120..b0f54bf49a 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -152,7 +152,12 @@ endif()
 
 if (WITH_MKLML AND MKLML_IOMP_LIB)
   message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
-  set(OPENMP_FLAGS "-fopenmp")
+  if(WIN32)
+    # OpenMP is not well supported on Windows for now
+    set(OPENMP_FLAGS "")
+  else(WIN32)
+    set(OPENMP_FLAGS "-fopenmp")
+  endif(WIN32)
   set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
   set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index ea46f6418e..ef4192ecc9 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -203,25 +203,26 @@ list(APPEND CUDA_NVCC_FLAGS "-w")
 list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
 
 if (NOT WIN32)
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-  list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
-elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
-  list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
-elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
-  list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
-elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
-  # nvcc 9 does not support -Os. Use Release flags instead
-  list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
-endif()
+  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
+  elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
+    list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
+  elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
+    list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
+  elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
+    # nvcc 9 does not support -Os. Use Release flags instead
+    list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
+  endif()
 else(NOT WIN32)
-list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj")
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-  list(APPEND CUDA_NVCC_FLAGS "-g -G")
-  # match the cl's _ITERATOR_DEBUG_LEVEL
-  list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG")
-elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
-  list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
-else()
+  list(APPEND CUDA_NVCC_FLAGS "-Xcompiler \"/wd 4244 /wd 4267 /wd 4819\"")
+  list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj")
+  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    list(APPEND CUDA_NVCC_FLAGS "-g -G")
+    # match the cl's _ITERATOR_DEBUG_LEVEL
+    list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG")
+  elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
+    list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
+  else()
   message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.")
 endif()
 endif(NOT WIN32)
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index 7a6a452388..d3a4d69d3a 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -20,8 +20,10 @@ SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include dire
 
 IF(WIN32)
   SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE)
+  SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530")
 ELSE(WIN32)
   SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library."
FORCE) + SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) ENDIF(WIN32) INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) @@ -39,7 +41,7 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 6a7be73f09..92fe76d05c 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -49,6 +49,8 @@ IF(NOT WIN32) SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value") SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") +ELSE() + SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc") ENDIF(NOT WIN32) ExternalProject_Add( @@ -61,7 +63,6 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} CMAKE_ARGS -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} CMAKE_ARGS -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index 27d075336d..1e01057aa6 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -20,6 +20,12 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE) +if(WIN32) + SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267") +else() + SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) +endif() + ExternalProject_Add( extern_snappy GIT_REPOSITORY "https://github.com/google/snappy" @@ -31,7 +37,7 @@ ExternalProject_Add( -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 9e6c47f016..36b533aa4f 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -21,7 +21,7 @@ function(CheckCompilerCXX11Flag) if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3) message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.") endif() - endif() + endif() endif() endfunction() @@ -147,12 +147,7 @@ set(GPU_COMMON_FLAGS -Wno-error=unused-function # Warnings in Numpy Header. -Wno-error=array-bounds # Warnings in Eigen::array ) - -else(NOT WIN32) -set(COMMON_FLAGS - "/w") #disable all warnings. 
-set(GPU_COMMON_FLAGS - "/w") #disable all warnings +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64") endif(NOT WIN32) if (APPLE) @@ -193,8 +188,7 @@ safe_set_static_flag() CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) - if(${flag_var} MATCHES "/W3") - string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}") - endif(${flag_var} MATCHES "/W3") + string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}") + set(flag_var "${flag_var} /w") endforeach(flag_var) endif(WIN32) diff --git a/cmake/version.cmake b/cmake/version.cmake index ac10bdf067..dd57d4ab99 100644 --- a/cmake/version.cmake +++ b/cmake/version.cmake @@ -31,8 +31,23 @@ while ("${PADDLE_VERSION}" STREQUAL "") set(tmp_version "${GIT_TAG_NAME}~1") endif() else() - # otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest - set(PADDLE_VERSION "0.0.0") + execute_process( + COMMAND ${GIT_EXECUTABLE} describe --exact-match --tags ${tmp_version} + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} + OUTPUT_VARIABLE GIT_EXACT_TAG_NAME + RESULT_VARIABLE GIT_EXACT_TAG_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + if (NOT ${GIT_EXACT_TAG_NAME}) + # Check if current branch is tag branch + if (${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}") + string(REPLACE "v" "" PADDLE_VERSION ${GIT_EXACT_TAG_NAME}) + else() + set(PADDLE_VERSION "0.0.0") + endif() + else() + # otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest + set(PADDLE_VERSION "0.0.0") + endif() endif() else() set(PADDLE_VERSION "0.0.0") diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 84b4677777..df961be911 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -8,13 +8,13 @@ paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) -paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) -paddle.fluid.name_scope ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.program_guard ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.name_scope ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)) paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None) -paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.scope_guard ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 
'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) @@ -66,7 +66,7 @@ paddle.fluid.initializer.XavierInitializer.__init__ ArgSpec(args=['self', 'unifo paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)) paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None) -paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)) paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')) @@ -229,7 +229,7 @@ paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes', paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)) paddle.fluid.layers.create_py_reader_by_data ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)) paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.Preprocessor.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.layers.Preprocessor.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Preprocessor.outputs ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None) paddle.fluid.layers.load ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,)) @@ -270,7 +270,7 @@ paddle.fluid.layers.IfElse.input ArgSpec(args=['self', 'x'], varargs=None, keywo paddle.fluid.layers.IfElse.output ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None) paddle.fluid.layers.IfElse.true_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.DynamicRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.DynamicRNN.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.layers.DynamicRNN.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.DynamicRNN.memory ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32')) paddle.fluid.layers.DynamicRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None) paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], 
varargs=None, keywords=None, defaults=None) @@ -325,6 +325,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.box_clip ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.multiclass_nms ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) @@ -345,12 +346,12 @@ paddle.fluid.contrib.StateCell.set_state ArgSpec(args=['self', 'state_name', 'st paddle.fluid.contrib.StateCell.state_updater ArgSpec(args=['self', 'updater'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.StateCell.update_states ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.TrainingDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.contrib.TrainingDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.contrib.TrainingDecoder.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.TrainingDecoder.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None) paddle.fluid.contrib.TrainingDecoder.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.TrainingDecoder.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.BeamSearchDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'init_ids', 'init_scores', 'target_dict_dim', 'word_dim', 'input_var_dict', 'topk_size', 'sparse_emb', 'max_len', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=({}, 50, True, 100, 1, 1, None)) -paddle.fluid.contrib.BeamSearchDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.contrib.BeamSearchDecoder.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False)) @@ -455,7 +456,7 @@ paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients ArgSpec(args=['self', ' paddle.fluid.optimizer.AdadeltaOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 
'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)) -paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.optimizer.ModelAverage.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) paddle.fluid.optimizer.ModelAverage.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) @@ -490,14 +491,14 @@ paddle.fluid.clip.ErrorClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], paddle.fluid.clip.GradientClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.clip.GradientClipByNorm.__init__ ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None) paddle.fluid.clip.GradientClipByGlobalNorm.__init__ ArgSpec(args=['self', 'clip_norm', 'group_name'], varargs=None, keywords=None, defaults=('default_group',)) -paddle.fluid.profiler.cuda_profiler ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.profiler.cuda_profiler ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.profiler.reset_profiler ArgSpec(args=[], varargs=None, keywords=None, defaults=None) -paddle.fluid.profiler.profiler ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.profiler.profiler ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')) paddle.fluid.profiler.start_profiler ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None) paddle.fluid.profiler.stop_profiler ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')) paddle.fluid.unique_name.generate ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None) paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.unique_name.guard ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) 
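The rows above that change from `args=[], varargs='args', keywords='kwds'` to explicit parameter lists are the visible effect of making these Python context managers signature-preserving: a function wrapped by a bare `contextlib.contextmanager` exposes only the generic `(*args, **kwds)` signature of the inner helper to `inspect`, which is what the old spec rows recorded. Below is a minimal sketch of the problem and of one common fix built on the third-party `decorator` package; the helper names `wrap_decorator` and `signature_safe_contextmanager` are assumptions for illustration, not taken from this diff.

```python
import inspect
from contextlib import contextmanager

import decorator  # third-party package that builds signature-preserving wrappers


def wrap_decorator(decorator_func):
    # Hypothetical helper: route `decorator_func` through decorator.decorator
    # so the resulting wrapper keeps the wrapped function's real signature.
    @decorator.decorator
    def __impl__(func, *args, **kwargs):
        wrapped_func = decorator_func(func)
        return wrapped_func(*args, **kwargs)

    return __impl__


signature_safe_contextmanager = wrap_decorator(contextmanager)


@contextmanager
def scope_guard_plain(scope):
    yield scope


@signature_safe_contextmanager
def scope_guard(scope):
    yield scope


# The bare contextmanager hides the parameter list:
print(inspect.getfullargspec(scope_guard_plain))  # args=[], varargs='args', varkw='kwds'
# The signature-safe variant reports it, matching the new spec row:
print(inspect.getfullargspec(scope_guard))        # args=['scope'], varargs=None, varkw=None
```

Both variants behave identically inside a `with` statement; only the introspected signature differs, which is what this spec file records.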
paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 66f11dedba..7ddf1ab44f 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -128,7 +128,7 @@ cc_test(version_test SRCS version_test.cc DEPS version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) -cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) +cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc memory_optimize_helper) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto) @@ -158,18 +158,19 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) -if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog - lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper) +if(WITH_NGRAPH) + set(NGRAPH_EXE_DEPS ngraph_engine) +else() + set(NGRAPH_EXE_DEPS) +endif() - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +if(WITH_DISTRIBUTE) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog + lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS}) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() - if (WITH_NGRAPH) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ngraph_engine) - else () - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) - endif() + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS}) cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() @@ -192,6 +193,7 @@ cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry proto_desc) +cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS op_registry proto_desc op_info memory_optimize_helper) cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index d5966ad5a9..e88084424b 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ 
b/paddle/fluid/framework/details/CMakeLists.txt @@ -50,10 +50,10 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) -cc_library(memory_optimize_pass SRCS analysis_var_pass.cc memory_reuse_types.cc DEPS graph graph_helper pass) +cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper) +cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass) +cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info) cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) -cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle - all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass) cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) @@ -65,13 +65,11 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass) +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass) if (WITH_GPU) list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) endif() -cc_test(memory_reuse_types_test SRCS memory_reuse_types_test.cc memory_reuse_types.cc DEPS framework_proto graph) -cc_test(analysis_var_pass_test SRCS analysis_var_pass_test.cc analysis_var_pass.cc memory_reuse_types.cc DEPS framework_proto graph graph_helper op_registry pass) - +cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope diff --git a/paddle/fluid/framework/details/analysis_var_pass.cc b/paddle/fluid/framework/details/analysis_var_pass.cc deleted file mode 100644 index 223b9da3cf..0000000000 --- a/paddle/fluid/framework/details/analysis_var_pass.cc +++ /dev/null @@ -1,656 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/details/analysis_var_pass.h" -#include <algorithm> -#include <atomic> -#include <deque> -#include <fstream> -#include <iostream> -#include <iterator> -#include <memory> -#include <queue> -#include <sstream> -#include <string> -#include <type_traits> -#include <vector> -#include "gflags/gflags.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_helper.h" - -DEFINE_bool(enable_subgraph_optimize, false, - "SubGraph also reuse global graph variables, it will reduce the " - "memory occupation" - "but a higher risk of memory reuse error. default disabled."); -DEFINE_string(memory_optimize_debug, "", - "debug the operator output variable when do the variable reuse." - "memory reuse pass." - "only for debug, default disabled."); - -namespace paddle { -namespace framework { -namespace details { - -static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { - return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && - op1->Outputs() == op2->Outputs(); -} - -template <typename Container, typename Callback> -class FilterVariableImpl { - public: - void operator()(const Container& nodes, Callback callback) { - for (auto* node : nodes) { - callback(node); - } - } -}; - -// filter var node for op->inputs/outputs -template <typename Callback> -class FilterVariableImpl<std::vector<ir::Node*>, Callback> { - public: - void operator()(const std::vector<ir::Node*>& nodes, Callback callback) { - for (auto* var : nodes) { - if (var->IsVar() && !var->IsCtrlVar()) { - callback(var); - } - } - } -}; - -template <typename Container, typename Callback> -void FilterVariables(const Container& nodes, Callback callback) { - FilterVariableImpl<Container, Callback>()(nodes, callback); -} - -std::unique_ptr<ir::Graph> AnalysisVarPass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - auto nodes = graph->Nodes(); - auto subblock_vars = GetSubBlockVars(nodes); - skip_set_.insert(subblock_vars.begin(), subblock_vars.end()); - - cfg_.reset(new details::ControlFlowGraph(*graph)); - cfg_->LiveVariableAnalysis(); - InitSSAGraphNodes(); - - int reuse_id = 0; - for (size_t idx = 0; idx < cfg_->Ops().size(); ++idx) { - auto& op = cfg_->Ops()[idx]; - auto* op_desc = op->Op(); - // some op in graph has no op desc - if (op_desc == nullptr) continue; - if (OpHasSubBlock(op_desc)) { - if (FLAGS_enable_subgraph_optimize) { - SubGraphOptimize(op_desc); - } else { - VLOG(3) << op->Name() - << " has subblock, but disable subgraph optimize. skipped."; - continue; - } - } - - for (auto& var : op->outputs) { - if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) { - ir::Node* cache = pool_.NodeMatch(var); - if (var->Name() == FLAGS_memory_optimize_debug) { - VLOG(3) << "start match var " << DebugString(var) << " of op " - << op->Name(); - VLOG(3) << pool_.ToString(); - VLOG(3) << "matched in pool : " - << ((cache == nullptr) ? "False" : "True"); - } - if (cache != nullptr) { - if (var->Name() == cache->Name()) { - VLOG(3) << "The same cache variable is cascade reused." 
- << var->Name() << " is re-filled to the pool after" - << "the reused op is finished. Current op can not " - << "replace it again. Skip this candidate."; - continue; - } - - int node_idx_in_pool = pool_.GetIndex(cache); - VLOG(3) << string::Sprintf( - "!!! %s, %s => %s, cache idx %d, pool size %d", - std::to_string(reuse_id++), DebugString(var), DebugString(cache), - node_idx_in_pool, static_cast<int>(pool_.size())); - // update CFG Graph on the fly. - // reused var maybe re-fill into the pool - cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx); - // NOTE(dzhwinter): we need to both update the ProgramDesc - // and IR Graph. because op_desc/var_desc is used in CreateOp, - // CreateVar when running happens. But IR Graph - // define the dependence relationship between nodes. - RenameVarInGraphDesc(var->Name(), cache->Name(), idx); - RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get()); - - pool_.Erase(cache); - } - } - } - // fill the pool - for (auto var : cfg_->LiveIn(op)) { - if (cfg_->LiveOut(op).count(var) == 0) { - ir::Node* var_node = cfg_->GetNodeFromVarName(var, op); - if (var_node == nullptr) continue; - if (NodeCanReused(var_node) && !pool_.Has(var_node)) { - pool_.Insert(var_node, op); - } - } - } - } - graph->ResolveHazard(var_nodes_); - - // For early delete pass. use GraphNodePool load the unlived vars. - // 1. find all deps op for each unlived var in memory pool. - for (auto& op : graph->Nodes()) { - for (auto& var : op->inputs) { - if (pool_.Has(var)) { - pool_.Insert(var, op); - } - } - } - // 2. convert ir node based memory pool to graph node - // because Node* maybe released bettwen passes. - auto& graph_pool = graph->Get<GraphNodePool>(kGraphNodePool); - for (auto it = pool_.begin(); it != pool_.end(); ++it) { - std::unordered_set<OpDesc*> descs; - for (auto& op : it->second) { - PADDLE_ENFORCE(op->IsOp()); - descs.insert(op->Op()); - } - graph_pool.push_back(std::make_pair(it->first->Name(), descs)); - } - - return graph; -} - -void AnalysisVarPass::SubGraphOptimize(OpDesc* op_desc) const { - // conditional block, while op and their grad op - auto* sub_block_desc = - AttrReader(op_desc->GetAttrMap()).Get<BlockDesc*>("sub_block"); - - // create a mirror block to construct an IR Graph. - ProgramDesc prog; - auto* copy_block = prog.MutableBlock(0); - for (auto* op : sub_block_desc->AllOps()) { - auto* copy_op = copy_block->AppendOp(); - copy_op->CopyFrom(*op); - copy_op->Flush(); - } - - for (auto* var : sub_block_desc->AllVars()) { - auto* copy_var = copy_block->Var(var->Name()); - copy_var->SetDataType(var->GetDataType()); - // only lod tensor can be reused. So ignore the multiple dims case. - copy_var->SetType(var->GetType()); - copy_var->SetShape(var->GetShape()); - copy_var->SetPersistable(var->Persistable()); - } - - ir::Graph sub_graph(prog); - std::unordered_set<ir::Node*> sub_graph_all_ops; - FilterVariables(sub_graph.Nodes(), [&](ir::Node* var) { - // sub_graph_all_ops.emplace(var); - if (var->IsVar() && !var->IsCtrlVar()) { - sub_graph_all_ops.emplace(var); - } - }); - int sub_reuse_id = 0; - // subgraph nodes is unordered, reuse need to follow the desc order. 
- // find the right op node through the descs - for (auto* sub_op_desc : sub_block_desc->AllOps()) { - ir::Node* sub_op = nullptr; - for (auto* node : sub_graph_all_ops) { - if (node->Op() == sub_op_desc) { - sub_op = node; - break; - } - } - PADDLE_ENFORCE(sub_op != nullptr); - for (auto* var : sub_op->outputs) { - if (NodeCanReused(var)) { - ir::Node* cache = pool_.NodeMatch(var); - if (cache != nullptr) { - if (var->Var()->GetDataType() != cache->Var()->GetDataType()) { - continue; - } - int node_idx_in_pool = pool_.GetIndex(cache); - VLOG(3) << string::Sprintf( - "!!! %s, %s => %s, cache idx %d, pool size %d", - std::to_string(sub_reuse_id++), DebugString(var), - DebugString(cache), node_idx_in_pool, - static_cast<int>(pool_.size())); - // NOTE(dzh): subblock is not in IR graph. Modify the block_desc - // immediately to make the subblock variable reuse strategy take - // effect. Because it is a single op in graph. No need to - // update the ir nodes. - sub_op_desc->Rename(var->Name(), cache->Name()); - if (sub_op_desc->Block()->HasVar(var->Name())) { - sub_op_desc->Block()->RemoveVar(var->Name()); - } - } - } - } - } -} - -std::unordered_set<std::string> AnalysisVarPass::GetSubBlockVars( - const std::unordered_set<ir::Node*>& nodes) const { - std::unordered_set<std::string> vars; - for (auto& op : nodes) { - if (!op->IsOp() || op->Op() == nullptr) continue; - auto* op_desc = op->Op(); - if (OpHasSubBlock(op_desc)) { - auto inputs = op_desc->InputArgumentNames(); - auto outputs = op_desc->OutputArgumentNames(); - vars.insert(inputs.begin(), inputs.end()); - vars.insert(outputs.begin(), outputs.end()); - } - } - return vars; -} - -void AnalysisVarPass::RenameVarInGraphDesc(const std::string& var, - const std::string& cache_var, - size_t idx) const { - for (size_t i = idx; i < cfg_->Ops().size(); ++i) { - auto* op = cfg_->Ops()[i]; - PADDLE_ENFORCE(op->IsOp() && op->Op()); - auto* op_desc = op->Op(); - op_desc->RenameInput(var, cache_var); - op_desc->RenameOutput(var, cache_var); - if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var); - op_desc->Flush(); - } -} - -void AnalysisVarPass::InitSSAGraphNodes() const { - std::unordered_map<std::string, std::unordered_set<ir::Node*>> all_vars; - if (var_nodes_.empty()) { - for (auto* op : cfg_->Ops()) { - for (auto* node : op->inputs) { - if (all_vars[node->Name()].count(node) == 0) { - all_vars[node->Name()].emplace(node); - var_nodes_[node->Name()].emplace_back(node); - } - } - for (auto* node : op->outputs) { - if (all_vars[node->Name()].count(node) == 0) { - all_vars[node->Name()].emplace(node); - var_nodes_[node->Name()].emplace_back(node); - } - } - } - } -} - -void AnalysisVarPass::RenameVarInGraphNode(const std::string& var, - const std::string& cache_var, - size_t idx, ir::Graph* graph) const { - // if replace happens, we need to create a newer version cache_var - // but use the same dims/data_type with var. 
- PADDLE_ENFORCE(var_nodes_[var].size() >= 1 && - var_nodes_[var].at(0)->Var() != nullptr); - std::unique_ptr<VarDesc> var_desc(new VarDesc(*var_nodes_[var].at(0)->Var())); - var_desc->SetName(cache_var); - - for (size_t i = idx; i < cfg_->Ops().size(); ++i) { - auto* op = cfg_->Ops()[i]; - - // redirect the input to the latest version of cache_var - for (auto* node : op->inputs) { - if (node->Name() == var) { - ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); - var_nodes_[cache_var].emplace_back(cache_node); - - // swap node to cache_node - cache_node->outputs.insert(cache_node->outputs.end(), - node->outputs.begin(), node->outputs.end()); - PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp()); - auto* prev_op = node->inputs[0]; - std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node, - cache_node); - cache_node->inputs.emplace_back(prev_op); - for (auto* next_op : node->outputs) { - std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, - cache_node); - } - } - } - - // if we need to rename the output, - // always create a newer version of cache_var - for (auto* node : op->outputs) { - if (node->Name() == var) { - ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); - var_nodes_[cache_var].emplace_back(cache_node); - - // swap node to cache node - cache_node->outputs.insert(cache_node->outputs.end(), - node->outputs.begin(), node->outputs.end()); - cache_node->inputs.emplace_back(op); - std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node); - for (auto* next_op : node->outputs) { - std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, - cache_node); - } - } - } - } - - // release node of unused var in graph - for (auto* node : var_nodes_[var]) { - graph->RemoveNode(node); - } - var_nodes_.at(var).clear(); -} - -bool AnalysisVarPass::NodeCanReused(ir::Node* node) const { - if (!node->IsVar() || node->IsCtrlVar()) return false; - auto* desc = node->Var(); - auto type = desc->GetType(); - if (desc->Persistable() || type != proto::VarType::LOD_TENSOR || - desc->GetShape().empty()) { - return false; - } - // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad - std::string name = node->Name(); - if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@') - return false; - if (skip_set_.count(name)) return false; - for (auto* op : node->inputs) { - if (op->Op()->HasAttr("force_cpu")) { - // op output force generated in cpu, can not be reused. - return framework::AttrReader(op->Op()->GetAttrMap()) - .Get<bool>("force_cpu") == 0; - } - } - return true; -} - -bool AnalysisVarPass::OpHasSubBlock(OpDesc* desc) const { - const AttributeMap& attrs = desc->GetAttrMap(); - for (auto& attr : attrs) { - if (attr.second.type() == typeid(BlockDesc*) || // NOLINT - attr.second.type() == typeid(std::vector<BlockDesc*>)) // NOLINT - return true; - } - return false; -} - -std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) { - PADDLE_ENFORCE(graph.Has(kAllOpDescs), - "Graph has no attribute of kAllOpDescs."); - // 1. get op desc order - auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs); - - // 2. 
topology sort order - auto nodes = graph.Nodes(); - std::deque<ir::Node*> ops; - FilterVariables(nodes, [&](ir::Node* op) { - if (op->IsOp() && op->Op() != nullptr) { - ops.emplace_back(op); - } - }); - std::unordered_map<ir::Node*, size_t> op_deps; - std::list<ir::Node*> ready_ops; - std::unordered_map<ir::Node*, std::unordered_set<ir::Node*>> pending_ops; - - for (auto* op : ops) { - std::unordered_set<ir::Node*> preceding_op; - for (auto* in : op->inputs) { - if (in->inputs.empty()) continue; - PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp()); - preceding_op.emplace(in->inputs[0]); - pending_ops[in->inputs[0]].emplace(op); - } - op_deps[op] = preceding_op.size(); - if (preceding_op.empty()) { - ready_ops.emplace_back(op); - } - } - - // 3. generated op list based desc order and the topology order - std::vector<ir::Node*> ret; - std::list<OpDesc*> op_descs_list(op_descs.begin(), op_descs.end()); - - auto update_by_found_node = [&](ir::Node* found_node) { - for (auto* pending_op : pending_ops[found_node]) { - if (--op_deps[pending_op] == 0) { - ready_ops.emplace_back(pending_op); - } - } - ready_ops.remove(found_node); - ret.emplace_back(found_node); - }; - - while (!ready_ops.empty()) { - bool all_of_ready_op_unmatched = true; - for (auto it = op_descs_list.begin(); it != op_descs_list.end();) { - auto op_desc = *it; - ir::Node* found_node = nullptr; - for (auto* op : ready_ops) { - if (IsSameDesc(op->Op(), op_desc)) { - found_node = op; - break; - } - } - - // 3.1 op desc deleted by other pass - if (found_node == nullptr) { - ++it; - continue; - } else { - all_of_ready_op_unmatched = false; - it = op_descs_list.erase(it); - } - update_by_found_node(found_node); - } - - // 3.2 op descs are added by other pass - // preceding op non empty means some new op descs are - // created, but not contained in return node list. - // these new op desc may depend on each other. - std::list<ir::Node*> prev_ready_ops(ready_ops); - if (all_of_ready_op_unmatched) { - for (auto op : prev_ready_ops) { - update_by_found_node(op); - } - } - } - - PADDLE_ENFORCE(std::all_of( - op_deps.begin(), op_deps.end(), - [&](const std::pair<ir::Node*, size_t>& p) { return p.second == 0; })); - - return ret; -} - -ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) { - ops_ = SortOpLikeDescOrder(graph); - ConnectNodes(); -} - -void ControlFlowGraph::BuildCFGGraph() { - // FIXME(dzh): same effect with ConnectNodes, but use the control - // link to build dependency graph, it goes wrong in transformer. 
- for (ir::Node* op : ops_) { - for (auto& input_var : op->inputs) { - if (!input_var->inputs.empty()) { - PADDLE_ENFORCE( - input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(), - "Preceding Op Node of Var Node must be unique"); - auto* pred_op = input_var->inputs[0]; - if (pred_op->Op() != nullptr) { - predecessors_[op].insert(pred_op); - successors_[pred_op].insert(op); - } - } - if (input_var->IsVar() && !input_var->IsCtrlVar()) { - uses_[op].insert(input_var->Name()); - } - } - for (auto& output_var : op->outputs) { - // output var may be used by many op - for (auto* succ_op : output_var->outputs) { - if (succ_op->Op() != nullptr) { - successors_[op].insert(succ_op); - predecessors_[succ_op].insert(op); - } - } - if (output_var->IsVar() && !output_var->IsCtrlVar()) { - defs_[op].insert(output_var->Name()); - } - } - } -} - -void ControlFlowGraph::ConnectNodes() { - for (size_t i = 0; i < ops_.size(); ++i) { - auto& op = ops_[i]; - try { - auto& next_op = ops_.at(i + 1); - successors_[op].insert(next_op); - predecessors_[next_op].insert(op); - } catch (...) { - // do nothing - } - - FilterVariables(op->inputs, - [&](ir::Node* var) { uses_[op].emplace(var->Name()); }); - - FilterVariables(op->outputs, - [&](ir::Node* var) { defs_[op].emplace(var->Name()); }); - } -} - -void ControlFlowGraph::LiveVariableAnalysis() { - // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm) - // compute the liveness of for each variable though reversed_ops algorithm. - // It iterates the operators from end to begin, compute the live in/live out - // variable set for each op, then the diff between in/out will be used for - // the variable reuse. For detail refer to - // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf - std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend()); - while (!work_list.empty()) { - ir::Node* op = work_list.front(); - work_list.pop_front(); - // get the live_in calculated before. Empty if first. - auto prev_live_in = std::move(live_in_[op]); - for (auto& s : successors_[op]) { - for (auto& var : live_in_[s]) { - live_out_[op].insert(var); - } - } - for (auto& var : uses_[op]) { - live_in_[op].insert(var); - } - for (auto& var : live_out_[op]) { - live_in_[op].insert(var); - } - for (auto& var : defs_[op]) { - live_in_[op].erase(var); - } - - // If the live_in is not changed, then the liveness analysis of - // predecessors is completed. 
- // - // Otherwise, recalculate the predecessors liveness - if (live_in_[op] != prev_live_in) { - for (auto& pre : predecessors_[op]) { - work_list.push_back(pre); - } - } - } -} - -void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, - const std::string& new_node, - int begin_idx) { - // update graph from begin idx to the end - for (size_t i = begin_idx; i != ops_.size(); ++i) { - auto* op = ops_[i]; - if (uses_[op].find(old_node) != uses_[op].end()) { - uses_[op].erase(old_node); - uses_[op].insert(new_node); - } - if (defs_[op].find(old_node) != defs_[op].end()) { - defs_[op].erase(old_node); - defs_[op].insert(new_node); - } - if (live_in_[op].find(old_node) != live_in_[op].end()) { - live_in_[op].erase(old_node); - live_in_[op].insert(new_node); - } - if (live_out_[op].find(old_node) != live_out_[op].end()) { - live_out_[op].erase(old_node); - live_out_[op].insert(new_node); - } - } -} - -const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const { - auto it = live_in_.find(op); - PADDLE_ENFORCE( - it != live_in_.end(), - string::Sprintf("Expect %s in live_in, but Not Found.", op->Name())); - return it->second; -} - -const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const { - auto it = live_out_.find(op); - PADDLE_ENFORCE( - it != live_out_.end(), - string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); - return it->second; -} - -const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const { - auto it = uses_.find(op); - PADDLE_ENFORCE( - it != uses_.end(), - string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); - return it->second; -} - -const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; } - -std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; } - -ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name, - ir::Node* op) const { - // in ssa-graph, different version nodes have same name, - // this function get the latest version var before target op - // It may return nullptr, such as data node. - ir::Node* found_node = nullptr; - for (auto* node : ops_) { - if (node == op) break; - for (auto& output : node->outputs) { - if (output->Name() == name) { - found_node = output; - } - } - } - return found_node; -} - -} // namespace details -} // namespace framework -} // namespace paddle - -REGISTER_PASS(analysis_var_pass, paddle::framework::details::AnalysisVarPass) - .RequireGraphAttr(paddle::framework::details::kGraphNodePool) - .RequireGraphAttr(paddle::framework::details::kAllOpDescs); diff --git a/paddle/fluid/framework/details/analysis_var_pass.h b/paddle/fluid/framework/details/analysis_var_pass.h deleted file mode 100644 index 144204beaf..0000000000 --- a/paddle/fluid/framework/details/analysis_var_pass.h +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
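For orientation while reading this deletion: the pass is not gone, it re-lands as `memory_optimize_pass.cc` plus `memory_optimize_helper.cc` per the `CMakeLists.txt` changes above, and its core is the backward liveness fixed point that the `LiveVariableAnalysis` comment describes. The sketch below restates that iteration on toy data structures (plain dicts of op and variable names, not Paddle's `ir::Graph`); the helper names are illustrative only.

```python
from collections import defaultdict


def live_variable_analysis(ops, uses, defs, succ, pred):
    """Backward dataflow over a toy op graph.

    uses/defs map op -> set of variable names; succ/pred map op -> set of ops.
      live_out[op] = union of live_in over the successors of op
      live_in[op]  = uses[op] | (live_out[op] - defs[op])
    Iterate from the last op backwards until nothing changes.
    """
    live_in = defaultdict(set)
    live_out = defaultdict(set)
    worklist = list(reversed(ops))  # seed in reverse program order
    while worklist:
        op = worklist.pop(0)
        out = set()
        for s in succ[op]:
            out |= live_in[s]
        live_out[op] = out
        new_in = uses[op] | (out - defs[op])
        if new_in != live_in[op]:
            live_in[op] = new_in
            worklist.extend(pred[op])  # predecessors must be revisited
    return live_in, live_out


def dies_at(op, live_in, live_out):
    # Live before `op` but not after it: exactly the variables the pass
    # re-fills into its reuse pool once `op` has run.
    return live_in[op] - live_out[op]


# a = op1(); b = op2(a); c = op3(b): after op2 runs, `a` is dead, so its
# buffer becomes a reuse candidate for a later output such as `c`.
ops = ["op1", "op2", "op3"]
uses = {"op1": set(), "op2": {"a"}, "op3": {"b"}}
defs = {"op1": {"a"}, "op2": {"b"}, "op3": {"c"}}
succ = {"op1": {"op2"}, "op2": {"op3"}, "op3": set()}
pred = {"op1": set(), "op2": {"op1"}, "op3": {"op2"}}

live_in, live_out = live_variable_analysis(ops, uses, defs, succ, pred)
print(dies_at("op2", live_in, live_out))  # {'a'}
```

The real pass additionally filters candidates by shape, dtype and persistability (`NodeCanReused`) and matches them against an ordered pool rather than reusing the first dead variable.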
- -#pragma once - -#include <algorithm> -#include <list> -#include <map> -#include <memory> -#include <set> -#include <string> -#include <unordered_map> -#include <utility> -#include <vector> - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/details/memory_reuse_types.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" - -namespace paddle { -namespace framework { -namespace details { -constexpr char kAllOpDescs[] = "all_op_descs"; - -std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph); -// sort op in bfs order -std::vector<ir::Node*> BFSSortGraphOps(const ir::Graph& graph); - -class ControlFlowGraph; - -class AnalysisVarPass : public ir::Pass { - protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; - - private: - // fill the variable map(var_nodes) by version. - void InitSSAGraphNodes() const; - // update program descs - void RenameVarInGraphDesc(const std::string& var, - const std::string& cache_var, size_t idx) const; - // update ir nodes - void RenameVarInGraphNode(const std::string& var, - const std::string& cache_var, size_t idx, - ir::Graph* graph) const; - - void SubGraphOptimize(OpDesc* op_desc) const; - // valid a tensor can be reuse or not - bool NodeCanReused(ir::Node* node) const; - // scan subblock and collect the output/input variables. - std::unordered_set<std::string> GetSubBlockVars( - const std::unordered_set<ir::Node*>&) const; - // check op has subblock or not - bool OpHasSubBlock(OpDesc* desc) const; - - private: - // Reuse Node Pool, Owned. - mutable OrderedNodePairPool pool_; - // controlflow Graph - mutable std::unique_ptr<ControlFlowGraph> cfg_; - // skip set - mutable std::unordered_set<std::string> skip_set_; - // var nodes - mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_; -}; - -class ControlFlowGraph { - public: - ControlFlowGraph() = default; - // For IR Graph in parallelexecutor - explicit ControlFlowGraph(const ir::Graph& graph); - - void LiveVariableAnalysis(); - - void RenameVarInCFGGraph(const std::string& old_node, - const std::string& new_node, int begin_idx); - - const std::set<std::string> LiveIn(ir::Node* op) const; - const std::set<std::string> LiveOut(ir::Node* op) const; - const std::set<std::string> Use(ir::Node* op) const; - const std::vector<ir::Node*> Ops() const; - std::vector<ir::Node*>& Ops(); - - // for ssa-graph nodes - ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const; - - private: - void BuildCFGGraph(); - void ConnectNodes(); - using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>; - using VarSetMap = std::map<ir::Node*, std::set<std::string>>; - // successors ops use the output variables. - NodeListMap successors_; - // predecessors ops generated input variables. - NodeListMap predecessors_; - // variables lived before run current op. - VarSetMap live_in_; - // variables lived after run current op. - VarSetMap live_out_; - VarSetMap uses_; // op inputs - VarSetMap defs_; // op outputs - - std::vector<ir::Node*> ops_; // op sequence by topology sort -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index ce5731a1f4..f8030c53f7 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include <glog/logging.h> #include <memory> -#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/reduce_op_handle.h" @@ -47,6 +47,22 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { AppendPass("sequential_execution_pass"); } + // Add op fusion. + if (strategy.fuse_relu_depthwise_conv_) { + AppendPass("fuse_relu_depthwise_conv_pass"); + } + + // NOTE(dzhwinter): A note for automatical inplace. + // 1. modify program desc passes should put + // before inplace pass. + // 2. manually configured inplace should put + // before inplace_pass + + // Add automatically inplace. + if (strategy_.enable_inplace_) { + AppendPass("inplace_pass"); + } + // Add a graph viz pass to record a graph. if (!strategy_.debug_graphviz_path_.empty()) { auto viz_pass = AppendPass("graph_viz_pass"); @@ -55,10 +71,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path)); } - // Add op fusion. - if (strategy.fuse_relu_depthwise_conv_) { - AppendPass("fuse_relu_depthwise_conv_pass"); - } if (strategy.fuse_elewise_add_act_ops_) { auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass"); // Add a graph viz pass to record a graph. @@ -88,7 +100,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // A side-effect of that, memory optimize cannot forsee the fetched vars // , so fetchlist should be set persistable before call the Run interface. if (strategy.memory_optimize_) { - auto analysis_var_pass = AppendPass("analysis_var_pass"); + auto memory_optimize_pass = AppendPass("memory_optimize_pass"); } AppendMultiDevPass(strategy); @@ -186,14 +198,14 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( pass->Erase("nccl_ctxs"); pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx); #endif - - } else if (pass->Type() == "analysis_var_pass") { + } else if (pass->Type() == "memory_optimize_pass") { + if (graph->Has(kAllOpDescs)) { + graph->Erase(kAllOpDescs); + } const std::vector<OpDesc *> *all_op_descs = new std::vector<OpDesc *>(main_program.Block(0).AllOps()); graph->Set<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs); // take ownership - graph->Set<GraphNodePool>(kGraphNodePool, - new GraphNodePool); // take ownership pass->Erase(kAllOpDescs); pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs); @@ -214,6 +226,13 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( pass->Set<const std::vector<OpDesc *>>( kAllOpDescs, new std::vector<OpDesc *>(main_program.Block(0).AllOps())); + } else if (pass->Type() == "inplace_pass") { + if (graph->Has(kAllOpDescs)) { + graph->Erase(kAllOpDescs); + } + graph->Set<const std::vector<OpDesc *>>( + kAllOpDescs, + new std::vector<OpDesc *>(main_program.Block(0).AllOps())); } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") { if (!use_cuda) { LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on " @@ -239,9 +258,10 @@ USE_PASS(allreduce_mode_multi_devices_pass); USE_PASS(dist_multi_devices_pass); USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_print_pass); -USE_PASS(analysis_var_pass); +USE_PASS(memory_optimize_pass); USE_PASS(sequential_execution_pass); USE_PASS(all_reduce_deps_pass); USE_PASS(modify_op_lock_and_record_event_pass); 
+USE_PASS(inplace_pass); USE_PASS(lock_free_optimize_pass); USE_PASS(graph_to_program_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index cd24a31759..e62e3edcef 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -77,8 +77,10 @@ struct BuildStrategy { bool fuse_relu_depthwise_conv_{false}; bool memory_optimize_{false}; - - bool memory_early_delete_{false}; + // TODO(dzhwinter): + // make enable_inplace, memory_optimize_ + // memory_early_delete_ true by default + bool enable_inplace_{false}; bool enable_sequential_execution_{false}; diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 601ae4f8c6..1e3dbb1e44 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -26,7 +26,7 @@ namespace paddle { namespace framework { namespace details { -struct ComputationOpHandle : public OpHandleBase { +class ComputationOpHandle : public OpHandleBase { public: ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place, size_t scope_idx); diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc index be0d941c4f..6d53dac5c0 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc @@ -34,8 +34,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { ->Var(details::kLocalExecScopeName) ->GetMutable<Scope*>() = &local_scope; for (size_t j = 0; j < input_scope_idxes.size(); ++j) { - local_scope.Var("out_var" + j); - if (i == j) local_scope.Var("in_var" + j); + local_scope.Var("out_var" + std::to_string(j)); + if (i == j) local_scope.Var("in_var" + std::to_string(j)); } param_scopes_.emplace_back(&local_scope); } @@ -62,20 +62,21 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { for (size_t i = 0; i < input_scope_idxes.size(); ++i) { // add input var handle - nodes_.emplace_back( - ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable)); - VarHandle* in_var_handle = - new VarHandle(nodes_.back().get(), 1, input_scope_idxes[i], - "in_var" + i, place_list_[input_scope_idxes[i]]); + nodes_.emplace_back(ir::CreateNodeForTest("in_node" + std::to_string(i), + ir::Node::Type::kVariable)); + VarHandle* in_var_handle = new VarHandle( + nodes_.back().get(), 1, input_scope_idxes[i], + "in_var" + std::to_string(i), place_list_[input_scope_idxes[i]]); vars_.emplace_back(in_var_handle); op_handle_->AddInput(in_var_handle); // add output var handle for (size_t j = 0; j < place_list_.size(); ++j) { - nodes_.emplace_back( - ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable)); - VarHandle* out_var_handle = new VarHandle( - nodes_.back().get(), 2, j, "out_var" + i, place_list_[j]); + nodes_.emplace_back(ir::CreateNodeForTest( + "out_node" + std::to_string(i), ir::Node::Type::kVariable)); + VarHandle* out_var_handle = + new VarHandle(nodes_.back().get(), 2, j, + "out_var" + std::to_string(i), place_list_[j]); vars_.emplace_back(out_var_handle); op_handle_->AddOutput(out_var_handle); } @@ -86,7 +87,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { std::vector<std::vector<float>> send_vec; f::LoD lod{{0, 10, 20}}; for (size_t i = 0; i < input_scope_idxes.size(); ++i) { - const std::string 
varname("in_var" + i); + const std::string varname("in_var" + std::to_string(i)); float val_scalar = static_cast<float>(i); send_vec.push_back( InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar)); @@ -96,7 +97,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { WaitAll(); for (size_t i = 0; i < input_scope_idxes.size(); ++i) { - const std::string& varname("out_var" + i); + const std::string& varname("out_var" + std::to_string(i)); for (size_t j = 0; j < place_list_.size(); ++j) { LoDTensorEqual(varname, send_vec[i], lod, param_scopes_[j]); } @@ -109,7 +110,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; int height = static_cast<int>(kDims[0] * 2); for (size_t i = 0; i < input_scope_idxes.size(); ++i) { - const std::string varname("in_var" + i); + const std::string varname("in_var" + std::to_string(i)); float val_scalar = static_cast<float>(i); send_vector.push_back(InitSelectedRows(varname, input_scope_idxes[i], rows, height, val_scalar)); @@ -119,7 +120,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { WaitAll(); for (size_t i = 0; i < input_scope_idxes.size(); ++i) { - const std::string& varname("out_var" + i); + const std::string& varname("out_var" + std::to_string(i)); for (size_t j = 0; j < place_list_.size(); ++j) { SelectedRowsEqual(varname, input_scope_idxes[i], send_vector[i], rows, height); diff --git a/paddle/fluid/framework/details/graph_test_base.h b/paddle/fluid/framework/details/graph_test_base.h new file mode 100644 index 0000000000..126959bcd8 --- /dev/null +++ b/paddle/fluid/framework/details/graph_test_base.h @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include <algorithm> +#include <iostream> +#include <iterator> +#include <string> +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +class DummyOp : public OperatorBase { + public: + DummyOp(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const Scope& scope, + const platform::Place& place) const override {} +}; + +class SumOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class AssignOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class SplitOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", ""); + AddOutput("Out", "").AsDuplicable(); + AddComment(""); + } +}; + +class DummyVarTypeInference : public VarTypeInference { + public: + void operator()(const OpDesc& op_desc, BlockDesc* block) const override { + auto& inputs = op_desc.Input("X"); + auto type = block->Var(inputs.front())->GetType(); + auto out_var_name = op_desc.Output("Out").front(); + block->Var(out_var_name)->SetType(type); + } +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc new file mode 100644 index 0000000000..b0c5968499 --- /dev/null +++ b/paddle/fluid/framework/details/inplace_op_pass.cc @@ -0,0 +1,432 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/inplace_op_pass.h" +#include <algorithm> +#include <deque> +#include <iterator> +#include <stack> +#include <string> +#include <unordered_map> +#include <unordered_set> +#include <vector> +#include "paddle/fluid/framework/details/memory_optimize_pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_info.h" + +// NOTE(dzhwinter): inplace means one op output variable reuse the input space. +// By our design, one operator only can read its input(const Variable), +// write its output(non-const Variable). If one operator is inplaced, means +// user have chance to write the space before reading happens. +// Especially when some optimize code writing style is applied. 
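+//
+// A safe kernel, by contrast, finishes reading its input before the output
+// buffer is touched. The sketch below is illustrative only (it assumes the
+// usual Tensor API) and is not taken from any particular operator:
+//
+// /* safe case in operator */
+// const Tensor* in = ctx.Input<Tensor>("In");
+// Tensor* out = ctx.Output<Tensor>("Out");
+// T first = in->data<T>()[0];  // read the input first
+// auto* out_ptr = out->mutable_data<T>(ctx.GetPlace());
+// out_ptr[0] = first;  // safe even if in/out share the same memory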
+//
+//
+// /* wrong case in operator */
+// /* In this case, a larger allocation is made, and the input content is
+//    lost */
+// const Tensor* in = ctx.Input<Tensor>("In");
+// Tensor* out = ctx.Output<Tensor>("Out");
+// auto* out_ptr = out->mutable_data<T>(ctx.GetPlace());
+// out_ptr[0] = 0;  // the input content is overwritten.
+
+// NOTE(dzhwinter):
+// Only for backward compatibility and stability: if
+// enable_inplace_whitelist is turned on, only the ops in the whitelist will
+// use the inplace strategy; otherwise, every op that registered an inplace
+// inference is an inplace candidate.
+DEFINE_bool(
+    enable_inplace_whitelist, false,
+    "If this option is turned on, only the ops in the whitelist can be "
+    "inplaced. If it is turned off, every running op can be a candidate for "
+    "inplacing, such as scale and elementwise_add. "
+    "By default, it is turned off.");
+
+DECLARE_string(memory_optimize_debug);
+
+// clang-format off
+const std::string kInplacedOpWhiteList[] = { // NOLINT
+    "sigmoid",
+    "exp",
+    "relu",
+    "tanh",
+    "sqrt",
+    "ceil",
+    "floor",
+    "reciprocal",
+    "relu6",
+    "soft_relu",
+    "hard_sigmoid",
+    "batch_norm",
+    "batch_norm_grad",
+    "sum",
+    "sum_grad",
+    "scale",
+    "reshape",
+    "elementwise_add",
+    "elementwise_add_grad",
+};
+// clang-format on
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+static inline ir::Node* GetNextCascadeInplacedVar(ir::Node* var) {
+  // if the next op is inplaced, return the output var;
+  // otherwise return nullptr
+  PADDLE_ENFORCE(var && var->IsVar() && !var->IsCtrlVar());
+  ir::Node* inplaced_var = nullptr;
+  for (auto* next_op : var->outputs) {
+    for (auto* output : next_op->outputs) {
+      if (output->IsVar() && !output->IsCtrlVar() &&
+          output->Name() == var->Name()) {
+        inplaced_var = output;
+      }
+    }
+  }
+  return inplaced_var;
+}
+
+static inline ir::Node* GetPrevCascadeInplacedVar(ir::Node* var) {
+  PADDLE_ENFORCE(var && var->IsVar() && !var->IsCtrlVar());
+  if (var->inputs.empty()) return nullptr;
+  auto* prev_op = var->inputs.at(0);
+  auto input_it = std::find_if(prev_op->inputs.begin(), prev_op->inputs.end(),
+                               [&](ir::Node* node) {
+                                 return node->IsVar() && !node->IsCtrlVar() &&
+                                        node->Name() == var->Name();
+                               });
+  return input_it == prev_op->inputs.end() ?
nullptr : *input_it; +} + +InplacePass::InplacePass() : Pass() { + if (FLAGS_enable_inplace_whitelist) { + for (auto& s : kInplacedOpWhiteList) { + whitelist_.emplace(s); + } + } +} + +void InplacePass::InitSSAGraphNodes() const { + std::unordered_map<std::string, std::unordered_set<ir::Node*>> all_vars; + for (auto* op : view_.AllOps()) { + for (auto* node : op->inputs) { + if (!node->IsVar() || node->IsCtrlVar()) continue; + if (all_vars[node->Name()].count(node) == 0) { + all_vars[node->Name()].emplace(node); + var_nodes_[node->Name()].emplace_back(node); + } + } + for (auto* node : op->outputs) { + if (!node->IsVar() || node->IsCtrlVar()) continue; + if (all_vars[node->Name()].count(node) == 0) { + all_vars[node->Name()].emplace(node); + var_nodes_[node->Name()].emplace_back(node); + } + } + } +} + +std::unique_ptr<ir::Graph> InplacePass::ApplyImpl( + std::unique_ptr<ir::Graph> graph) const { + var_nodes_.clear(); + view_.Build(graph.get()); + InitSSAGraphNodes(); + + for (auto* op : view_.AllOps()) { + if (FLAGS_enable_inplace_whitelist && !whitelist_.count(op->Name())) + continue; + TryInplaceOpInputOutput(op, graph.get()); + } + graph->ResolveHazard(var_nodes_); + + return graph; +} + +void InplacePass::InplaceModifyDesc(const std::string& var, + const std::string& cache_var, + const size_t& idx) const { + for (size_t i = idx; i < view_.AllOps().size(); ++i) { + ir::Node* op = view_.AllOps()[i]; + PADDLE_ENFORCE(op->IsOp() && op->Op()); + auto* op_desc = op->Op(); + op_desc->RenameInput(var, cache_var); + op_desc->RenameOutput(var, cache_var); + if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var); + op_desc->Flush(); + } +} + +const NodeSwapQueue InplacePass::TryInplaceModifyVar( + const std::string& var, const std::string& cache_var, const size_t& idx, + ir::Graph* graph) const { + PADDLE_ENFORCE(var_nodes_[var].size() >= 1 && + var_nodes_[var].at(0)->Var() != nullptr); + std::unique_ptr<VarDesc> var_desc(new VarDesc(*var_nodes_[var].at(0)->Var())); + var_desc->SetName(cache_var); + + NodeSwapQueue swap_nodes; + + for (size_t i = idx; i < view_.AllOps().size(); ++i) { + auto* op = view_.AllOps()[i]; + + // redirect the input to the latest version of cache_var + for (auto* node : op->inputs) { + if (node->Name() == var) { + ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); + + // swap node to cache_node + cache_node->outputs.insert(cache_node->outputs.end(), + node->outputs.begin(), node->outputs.end()); + PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp()); + auto* prev_op = node->inputs[0]; + std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node, + cache_node); + cache_node->inputs.emplace_back(prev_op); + for (auto* next_op : node->outputs) { + std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, + cache_node); + } + + swap_nodes.emplace_back(std::make_pair(node, cache_node)); + } + } + + // if we need to rename the output, + // always create a newer version of cache_var + for (auto* node : op->outputs) { + if (node->Name() == var) { + ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); + // swap node to cache node + cache_node->outputs.insert(cache_node->outputs.end(), + node->outputs.begin(), node->outputs.end()); + cache_node->inputs.emplace_back(op); + std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node); + for (auto* next_op : node->outputs) { + std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, + cache_node); + } + + swap_nodes.emplace_back(std::make_pair(node, 
cache_node));
+      }
+    }
+  }
+
+  return swap_nodes;
+}
+
+void InplacePass::CommitModify(const NodeSwapQueue& swap_nodes,
+                               ir::Graph* graph) const {
+  for (auto& pair : swap_nodes) {
+    auto *node = pair.first, *cache_node = pair.second;
+    const std::string var = node->Name(), cache_var = cache_node->Name();
+    var_nodes_[cache_var].emplace_back(cache_node);
+    graph->RemoveNode(node);
+    auto& nodes = var_nodes_.at(var);
+    // release the unused var in the graph. Because the Python-side memory
+    // optimize may reuse vars under the same name, only clear the var nodes
+    // after the current inplaced index.
+    nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
+  }
+}
+
+void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
+                                 ir::Graph* graph) const {
+  for (auto& pair : nodes) {
+    auto *node = pair.first, *cache_node = pair.second;
+    const std::string var = node->Name(), cache_var = cache_node->Name();
+    auto* prev_op = node->inputs[0];
+    std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), cache_node,
+                 node);
+    for (auto* next_op : node->outputs) {
+      std::replace(next_op->inputs.begin(), next_op->inputs.end(), cache_node,
+                   node);
+    }
+    graph->RemoveNode(cache_node);
+  }
+}
+
+void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
+                                          ir::Graph* graph) const {
+  VLOG(4) << "Try to inplace op " << op->Name();
+  PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
+                 "op_desc is nullptr");
+  // several prerequisites must be met before the op can be inplaced.
+
+  auto* op_desc = op->Op();
+  auto& infer_inplace =
+      OpInfoMap::Instance().Get(op_desc->Type()).infer_inplace_;
+
+  // 1. infer_inplace_ is registered.
+  if (!static_cast<bool>(infer_inplace)) return;
+  PADDLE_ENFORCE(static_cast<bool>(infer_inplace),
+                 "%s's infer_inplace has not been registered", op_desc->Type());
+
+  auto* block = op_desc->Block();
+  auto in_to_outs = infer_inplace(*op_desc, block);
+
+  auto& all_ops = view_.AllOps();
+  auto cursor = std::find(all_ops.begin(), all_ops.end(), op);
+  size_t idx = std::distance(all_ops.begin(), cursor);
+
+  for (auto& pair : in_to_outs) {
+    auto& in_var_name = pair.first;
+    auto& out_var_name = pair.second;
+    auto* in_node = view_.GetNodeByName(in_var_name, op->inputs);
+    auto* out_node = view_.GetNodeByName(out_var_name, op->outputs);
+
+    // 2. there is no external pending op on the input node
+    if (view_.PendingOpsOnVar(in_node).size() > 1) {
+      VLOG(4) << string::Sprintf(
+          "Skipped pair %s => %s. The input of %s has an external dependency; "
+          "inplacing such a pair would overwrite that memory.",
+          out_var_name, in_var_name, op->Name());
+      continue;
+    }
+
+    // 3. if the output has been memory-optimized by Python
+    // (fluid.memory_optimize()), this candidate can not be inplaced.
+    // Will be deprecated in the future.
+    if (view_.InSkipSet(out_node->Name())) {
+      VLOG(4) << string::Sprintf(
+          "Skipped %s => %s: %s reused a previous memory block in the Python "
+          "memory optimize; inplacing it may generate a cycle.",
+          out_var_name, in_var_name, op->Name());
+      continue;
+    }
+
+    // Debug interface. Such vars would be skipped by the pass.
+    if (out_node->Name() == FLAGS_memory_optimize_debug) {
+      VLOG(3) << "Skipped var by force. FLAGS_memory_optimize_debug="
+              << out_node->Name();
+      continue;
+    }
+
+    // NOTE(dzhwinter):
+    // two-stage commit of the inplace process: if the inplace generates a
+    // cycle, withdraw the changes; otherwise, safely commit the node.
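+    // Roughly, the two stages below are (a restatement of the code that
+    // follows, using the helpers of this pass):
+    //   swap_nodes = TryInplaceModifyVar(out, in, idx, graph);  // stage 1: try
+    //   if (!ir::HasCircle(*graph)) CommitModify(swap_nodes, graph);
+    //   else                        WithdrawModify(swap_nodes, graph);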
+    auto swap_nodes =
+        TryInplaceModifyVar(out_var_name, in_var_name, idx, graph);
+
+    if (!ir::HasCircle(*graph)) {
+      VLOG(3) << string::Sprintf("!!! %s, %s => %s inplaced", op->Name(),
+                                 out_var_name, in_var_name);
+      InplaceModifyDesc(out_var_name, in_var_name, idx);
+      CommitModify(swap_nodes, graph);
+    } else {
+      VLOG(3) << string::Sprintf(
+          "Skipped pair %s => %s: inplace would generate a cycle. Withdraw %s",
+          out_var_name, in_var_name, op->Name());
+      WithdrawModify(swap_nodes, graph);
+    }
+  }
+}
+
+ir::Node* GraphView::GetNodeByName(const std::string& name,
+                                   const std::vector<ir::Node*>& nodes) const {
+  // nodes should be op->inputs/outputs;
+  // var nodes in the same op are expected to have distinct names.
+  std::unordered_set<std::string> nodes_in_op;
+  bool has_dup_node =
+      std::all_of(nodes.begin(), nodes.end(), [&nodes_in_op](ir::Node* node) {
+        if (!node->IsVar() || node->IsCtrlVar() || node->Var() == nullptr) {
+          if (nodes_in_op.count(node->Name())) return true;
+          nodes_in_op.emplace(node->Name());
+        }
+        return false;
+      });
+  PADDLE_ENFORCE(has_dup_node == false, "nodes have the same name!");
+  ir::Node* node = nullptr;
+  for (auto* it : nodes) {
+    if (!it->IsVar() || it->IsCtrlVar() || it->Var() == nullptr) continue;
+    if (it->Name() == name) {
+      node = it;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(node != nullptr,
+                 string::Sprintf("Not found var %s in nodes!", name));
+  return node;
+}
+
+std::vector<ir::Node*> GraphView::PendingOpsOnVar(ir::Node* node) {
+  // get the pending ops that depend on the same var node.
+  // Because the node may itself be an inplaced variable, backtrack through
+  // all the previously inplaced vars.
+  std::vector<ir::Node*> pending_ops;
+  ir::Node* p = node;
+  while (p != nullptr) {
+    pending_ops.insert(pending_ops.end(), p->outputs.begin(), p->outputs.end());
+    p = GetPrevCascadeInplacedVar(p);
+  }
+  return pending_ops;
+}
+
+void GraphView::Build(ir::Graph* g) {
+  // track the var nodes in the correct order.
+  // Because newly created nodes are inserted, data hazards may arise between
+  // nodes; resolving them depends on visiting the var nodes in the right
+  // order.
+  ops_ = SortOpLikeDescOrder(*g);
+
+  // 1. track the nodes that reused a previous node in the Python memory
+  // optimize. These nodes can not be inplaced, otherwise a cycle may be
+  // generated in the graph.
+  std::unordered_set<std::string> all_vars;
+  for (auto& node : g->Nodes()) {
+    if (node->IsVar()) continue;
+    for (auto& out : node->outputs) {
+      if (out->IsCtrlVar() || out->Var() == nullptr) continue;
+      if (all_vars.count(out->Name())) {
+        dup_nodes_.emplace(out->Name());
+      } else {
+        all_vars.emplace(out->Name());
+      }
+    }
+  }
+
+  // 2. track the nodes used by the parameter server. These nodes can not be
+  // inplaced, otherwise the trainer and the pserver can not find each other
+  // by name.
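+  // For example (a hypothetical distributed program): if a `send` op ships
+  // parameter `fc_0.w_0` to the pserver and a `recv` op fetches it back,
+  // renaming `fc_0.w_0` via inplace would break the name-based mapping, so
+  // the inputs/outputs of such ops join the skip set below.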
+  auto update_skip_set = [&](ir::Node* node) {
+    for (auto& in : node->inputs) {
+      if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name());
+    }
+    for (auto& out : node->outputs) {
+      if (out->IsVar() && out->Var() != nullptr)
+        dup_nodes_.emplace(out->Name());
+    }
+  };
+  for (auto& node : g->Nodes()) {
+    if (!node->IsOp()) continue;
+    if (node->Name() == "send") update_skip_set(node);
+    if (node->Name() == "recv") update_skip_set(node);
+    if (node->Name() == "prefetch") update_skip_set(node);
+  }
+}
+
+const std::vector<ir::Node*>& GraphView::AllOps() { return ops_; }
+
+bool GraphView::InSkipSet(const std::string& var) const {
+  return dup_nodes_.count(var);
+}
+
+} // namespace details
+} // namespace framework
+} // namespace paddle
+
+REGISTER_PASS(inplace_pass, paddle::framework::details::InplacePass);
diff --git a/paddle/fluid/framework/details/inplace_op_pass.h b/paddle/fluid/framework/details/inplace_op_pass.h
new file mode 100644
index 0000000000..7be7f31185
--- /dev/null
+++ b/paddle/fluid/framework/details/inplace_op_pass.h
@@ -0,0 +1,94 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <map>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/details/memory_optimize_helper.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class GraphView {
+ public:
+  GraphView() = default;
+
+  void Build(ir::Graph* g);
+
+  const std::vector<ir::Node*>& AllOps();
+
+  ir::Node* GetNodeByName(const std::string& name,
+                          const std::vector<ir::Node*>& nodes) const;
+
+  std::vector<ir::Node*> PendingOpsOnVar(ir::Node* var);
+
+  // Will be deprecated in the future.
+  // NOTE(dzhwinter):
+  // 1. The Python memory optimize reuses memory based on var names, so
+  // different op outputs may have the same variable name. Enabling inplace
+  // on such a node would generate a cycle in the SSA graph.
+  // 2. DistributeTranspiler uses unique names to map parameters and
+  // gradients, so those vars must be skipped.
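+  // (Illustrative examples of skipped vars: an output that shares a name
+  // with an earlier var because of fluid.memory_optimize(), or a parameter
+  // such as `fc_0.w_0` that a send/recv pair maps by name; the names here
+  // are hypothetical.)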
+ bool InSkipSet(const std::string& var) const; + + private: + std::vector<ir::Node*> ops_; + std::unordered_set<std::string> dup_nodes_; // mem opt affect nodes + std::map<ir::Node*, std::unordered_set<ir::Node*>> adj_list_; +}; + +// swap pairs in sequence +typedef std::vector<std::pair<ir::Node*, ir::Node*>> NodeSwapQueue; +class InplacePass : public ir::Pass { + public: + InplacePass(); + + protected: + std::unique_ptr<ir::Graph> ApplyImpl( + std::unique_ptr<ir::Graph> graph) const override; + + void InitSSAGraphNodes() const; + + private: + const NodeSwapQueue TryInplaceModifyVar(const std::string& var, + const std::string& cache_var, + const size_t& idx, + ir::Graph* graph) const; + + void CommitModify(const NodeSwapQueue&, ir::Graph* graph) const; + + void WithdrawModify(const NodeSwapQueue& nodes, ir::Graph* graph) const; + + void InplaceModifyDesc(const std::string& in_var, const std::string& out_var, + const size_t& idx) const; + + void TryInplaceOpInputOutput(ir::Node* op, ir::Graph* graph) const; + + mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_; + + mutable std::unordered_set<std::string> whitelist_; + mutable GraphView view_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.cc b/paddle/fluid/framework/details/memory_early_delete_pass.cc deleted file mode 100644 index 5906b7d57c..0000000000 --- a/paddle/fluid/framework/details/memory_early_delete_pass.cc +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/details/memory_early_delete_pass.h" -#include <queue> -#include <string> -#include <vector> -#include "paddle/fluid/framework/details/memory_reuse_types.h" -#include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/details/reference_count_pass_helper.h" -#include "paddle/fluid/framework/ir/graph_helper.h" - -namespace paddle { -namespace framework { -namespace details { - -static ComputationOpHandle* FindNextComputationOpHandle(VarHandle* var_in) { - std::queue<VarHandleBase*> queue; - queue.push(var_in); - do { - auto* var = queue.front(); - queue.pop(); - for (auto* op : var->PendingOps()) { - auto* compute_op = dynamic_cast<ComputationOpHandle*>(op); - if (compute_op != nullptr && compute_op->GetPlace() == var_in->place()) { - return compute_op; - } - for (auto* out_var : op->Outputs()) { - queue.push(out_var); - } - } - } while (!queue.empty()); - return nullptr; -} - -std::unique_ptr<ir::Graph> MemoryEarlyDeletePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - auto& graph_pool = Get<GraphNodePool>(kGraphNodePool); - auto& gcs = Get<GarbageCollectorMap>(kGarbageCollector); - - std::unordered_map<std::string, std::unordered_set<OpDesc*>> unlived_vars; - unlived_vars.reserve(graph_pool.size()); - for (auto& pair : graph_pool) { - unlived_vars.insert(std::make_pair(pair.first, pair.second)); - } - - auto compare_and_insert_early_delete_op = [&]( - OpHandleBase* op, const std::vector<VarHandleBase*>& vars) { - if (unlived_vars.empty()) return; - // unlived vars can be deleted after the last used op has finished. - auto* compute_op = dynamic_cast<ComputationOpHandle*>(op); - const auto& places = Get<std::vector<platform::Place>>(kAllPlaces); - for (auto& var : vars) { - auto* var_handle = dynamic_cast<VarHandle*>(var); - auto var_name = var->Node()->Name(); - auto& var_place = var_handle->place(); - if (unlived_vars.count(var_name) == 0) continue; - if (!unlived_vars[var_name].empty()) { - if (compute_op != nullptr && - unlived_vars[var_name].count(compute_op->Node()->Op()) != 0) { - unlived_vars[var_name].erase(compute_op->Node()->Op()); - } - continue; - } - - if (var_handle == nullptr || !var_handle->Node()->IsVar() || - var_handle->Node()->IsCtrlVar()) - continue; - - // shameless copyed from reference count pass. 
- if (compute_op == nullptr) { - // use next computation op scope - compute_op = FindNextComputationOpHandle(var_handle); - } - auto* early_delete_node = - graph->CreateEmptyNode("early_delete", ir::Node::Type::kOperation); - GarbageCollector* gc = gcs.at(places[compute_op->GetScopeIdx()]).get(); - auto* early_delete_handle = new EarlyDeleteOpHandle( - early_delete_node, compute_op->GetScope(), var_place, {var_name}, gc); - if (compute_op->Outputs().empty()) { - auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar()); - compute_op->AddOutput(dep_var); - graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var); - } - early_delete_handle->AddInput(compute_op->Outputs().front()); - VLOG(5) << "Add early delete op " << var_name << " to Operator" - << compute_op->Name(); - } - }; - - auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph); - for (auto& op : all_ops) { - compare_and_insert_early_delete_op(op, op->Inputs()); - compare_and_insert_early_delete_op(op, op->Outputs()); - } - return graph; -} - -} // namespace details -} // namespace framework -} // namespace paddle - -REGISTER_PASS(memory_early_delete_pass, - paddle::framework::details::MemoryEarlyDeletePass) - .RequireGraphAttr(paddle::framework::details::kGraphNodePool) - .RequireGraphAttr(paddle::framework::details::kGarbageCollector); diff --git a/paddle/fluid/framework/details/memory_optimize_helper.cc b/paddle/fluid/framework/details/memory_optimize_helper.cc new file mode 100644 index 0000000000..6345ba3359 --- /dev/null +++ b/paddle/fluid/framework/details/memory_optimize_helper.cc @@ -0,0 +1,474 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/memory_optimize_helper.h" +#include <deque> +#include <functional> +#include <iostream> +#include <numeric> +#include <sstream> +#include <string> +#include "paddle/fluid/framework/var_desc.h" + +namespace paddle { +namespace framework { +namespace details { +using paddle::framework::VarDesc; + +std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) { + PADDLE_ENFORCE(graph.Has(kAllOpDescs), + "Graph has no attribute of kAllOpDescs."); + // 1. get op desc order + auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs); + + // 2. 
topology sort order + auto nodes = graph.Nodes(); + std::deque<ir::Node*> ops; + FilterVariables(nodes, [&](ir::Node* op) { + if (op->IsOp() && op->Op() != nullptr) { + ops.emplace_back(op); + } + }); + std::unordered_map<ir::Node*, size_t> op_deps; + std::list<ir::Node*> ready_ops; + std::unordered_map<ir::Node*, std::unordered_set<ir::Node*>> pending_ops; + + for (auto* op : ops) { + std::unordered_set<ir::Node*> preceding_op; + for (auto* in : op->inputs) { + if (in->inputs.empty()) continue; + PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp()); + preceding_op.emplace(in->inputs[0]); + pending_ops[in->inputs[0]].emplace(op); + } + op_deps[op] = preceding_op.size(); + if (preceding_op.empty()) { + ready_ops.emplace_back(op); + } + } + + // 3. generated op list based desc order and the topology order + std::vector<ir::Node*> ret; + std::list<OpDesc*> op_descs_list(op_descs.begin(), op_descs.end()); + + auto update_by_found_node = [&](ir::Node* found_node) { + for (auto* pending_op : pending_ops[found_node]) { + if (--op_deps[pending_op] == 0) { + ready_ops.emplace_back(pending_op); + } + } + ready_ops.remove(found_node); + ret.emplace_back(found_node); + }; + + while (!ready_ops.empty()) { + bool all_of_ready_op_unmatched = true; + for (auto it = op_descs_list.begin(); it != op_descs_list.end();) { + auto op_desc = *it; + ir::Node* found_node = nullptr; + for (auto* op : ready_ops) { + if (IsSameDesc(op->Op(), op_desc)) { + found_node = op; + break; + } + } + + // 3.1 op desc deleted by other pass + if (found_node == nullptr) { + ++it; + continue; + } else { + all_of_ready_op_unmatched = false; + it = op_descs_list.erase(it); + } + update_by_found_node(found_node); + } + + // 3.2 op descs are added by other pass + // preceding op non empty means some new op descs are + // created, but not contained in return node list. + // these new op desc may depend on each other. + std::list<ir::Node*> prev_ready_ops(ready_ops); + if (all_of_ready_op_unmatched) { + for (auto op : prev_ready_ops) { + update_by_found_node(op); + } + } + } + + PADDLE_ENFORCE(std::all_of( + op_deps.begin(), op_deps.end(), + [&](const std::pair<ir::Node*, size_t>& p) { return p.second == 0; })); + + return ret; +} + +size_t NodeSize(const VarDesc& node) { + auto shape = node.GetShape(); + int size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()); + size_t type_size = SizeOfType(node.GetDataType()); + return type_size * std::abs(size); +} + +size_t NodeSize(ir::Node* n) { + auto* desc = FindVarDescInBlock(n); + return NodeSize(*desc); +} + +std::string DebugStringImpl(VarDesc* var) { + std::stringstream ss; + ss << var->Name(); + ss << "["; + try { + auto shape = var->GetShape(); + for (size_t i = 0; i < shape.size(); ++i) { + if (i != shape.size() - 1) { + ss << shape[i] << ","; + } else { + ss << shape[i]; + } + } + ss << "]"; + } catch (...) { + ss << "Var has no VarDesc !!! Name:" << var->Name(); + } + return ss.str(); +} + +std::string DebugString(ir::Node* var) { + return DebugStringImpl(FindVarDescInBlock(var)); +} + +// NOTE(dzh): based ir node, if a large node has been reused +// by a small size node, then next time it appear in pool, it will +// have the small size. Find the original node shap from blockdesc. 
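+// For example (illustrative shapes only): if a buffer described as
+// a[-1, 1024] is reused to hold b[-1, 10], the ir::Node that re-enters the
+// pool carries b's small shape, while the BlockDesc still records the
+// original [-1, 1024] description of the allocation.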
+VarDesc* FindVarDescInBlock(ir::Node* n) { + PADDLE_ENFORCE(n->IsVar() && !n->IsCtrlVar() && n->inputs.size() == 1); + BlockDesc* block = n->inputs[0]->Op()->Block(); + PADDLE_ENFORCE(block->HasVar(n->Name()), + string::Sprintf("Block do not has var %s", n->Name())); + return block->FindVar(n->Name()); +} + +struct NodeComparator { + bool operator()(ir::Node* lhs, ir::Node* rhs) const { + auto* lhs_desc = FindVarDescInBlock(lhs); + auto* rhs_desc = FindVarDescInBlock(rhs); + auto lhs_shape = lhs_desc->GetShape(); + auto rhs_shape = rhs_desc->GetShape(); + if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) || + (lhs_shape[0] != -1 && rhs_shape[0] != -1)) { + return NodeSize(lhs) <= NodeSize(rhs); + } else { + return false; + } + } +}; + +void OrderedSet::Insert(ir::Node* var) { + PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar()); + if (mark_table_.count(var->Name()) != 0) { + mark_table_[var->Name()]->emplace_back(var); + return; + } + + auto* var_desc = FindVarDescInBlock(var); + auto var_shape = var_desc->GetShape(); + int batch_size = static_cast<int>(var_shape[0]); + + NodeComparator functor; + Iter it = nodes_.begin(); + while (it != nodes_.end()) { + auto& prev = it->front(); + auto* cache_desc = FindVarDescInBlock(prev); + int cache_batch_size = cache_desc->GetShape()[0]; + if ((cache_batch_size == -1 && batch_size == -1) || + (cache_batch_size != -1 && batch_size != -1)) { + if (functor(prev, var)) { + ++it; + } else { + break; + } + } else if (cache_batch_size == -1 && batch_size != -1) { + ++it; + } else if (cache_batch_size != -1 && batch_size == -1) { + break; + } + } + + it = nodes_.insert(it, {var}); + mark_table_[var->Name()] = it; +} + +int OrderedSet::GetNodeIndexInPool(ir::Node* var) { + return std::distance(nodes_.begin(), mark_table_[var->Name()]); +} + +ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const { + ir::Node* found_node = nullptr; + NodeComparator functor; + + for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { + auto& candidate = it->front(); + if (functor(var, candidate)) { + found_node = candidate; + break; + } + } + return found_node; +} + +bool OrderedSet::Has(ir::Node* var) const { + if (mark_table_.count(var->Name())) { + auto& node_in_samename = mark_table_.at(var->Name()); + auto iter = + std::find_if(node_in_samename->begin(), node_in_samename->end(), + [&](ir::Node* n) { return n->Name() == var->Name(); }); + return iter != node_in_samename->end(); + } + return false; +} + +void OrderedSet::Erase(ir::Node* var) { + PADDLE_ENFORCE(mark_table_.count(var->Name())); + nodes_.erase(mark_table_[var->Name()]); + mark_table_.erase(var->Name()); +} + +std::string OrderedSet::ToString() const { + std::stringstream ss; + for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { + for (auto& node : *it) { + ss << DebugString(node) << " "; + } + } + return ss.str(); +} + +bool NodeCanReused(ir::Node* node) { + // valid the node is a var node + if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false; + + bool flag = true; + // op output force generated in cpu, can not be reused. + for (auto* op : node->inputs) { + if (op->Op()->HasAttr("force_cpu")) { + flag &= framework::AttrReader(op->Op()->GetAttrMap()) + .Get<bool>("force_cpu") == 0; + } + } + // var desc validation. 
+ flag &= NodeCanReused(*node->Var()); + return flag; +} + +bool NodeCanReused(const VarDesc& node) { + auto type = node.GetType(); + if (!(type == proto::VarType::LOD_TENSOR || + type == proto::VarType::SELECTED_ROWS || + type == proto::VarType::LOD_TENSOR_ARRAY)) { + return false; + } + if (node.Persistable() || node.GetShape().empty()) { + return false; + } + // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad + std::string name = node.Name(); + if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@') + return false; + return true; +} + +bool OpHasSubBlock(OpDesc* desc) { + const AttributeMap& attrs = desc->GetAttrMap(); + for (auto& attr : attrs) { + if (attr.second.type() == typeid(BlockDesc*) || // NOLINT + attr.second.type() == typeid(std::vector<BlockDesc*>)) // NOLINT + return true; + } + return false; +} + +ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) { + ops_ = SortOpLikeDescOrder(graph); + ConnectNodes(); +} + +void ControlFlowGraph::BuildCFGGraph() { + // FIXME(dzh): same effect with ConnectNodes, but use the control + // link to build dependency graph, it goes wrong in transformer. + for (ir::Node* op : ops_) { + for (auto& input_var : op->inputs) { + if (!input_var->inputs.empty()) { + PADDLE_ENFORCE( + input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(), + "Preceding Op Node of Var Node must be unique"); + auto* pred_op = input_var->inputs[0]; + if (pred_op->Op() != nullptr) { + predecessors_[op].insert(pred_op); + successors_[pred_op].insert(op); + } + } + if (input_var->IsVar() && !input_var->IsCtrlVar()) { + uses_[op].insert(input_var->Name()); + } + } + for (auto& output_var : op->outputs) { + // output var may be used by many op + for (auto* succ_op : output_var->outputs) { + if (succ_op->Op() != nullptr) { + successors_[op].insert(succ_op); + predecessors_[succ_op].insert(op); + } + } + if (output_var->IsVar() && !output_var->IsCtrlVar()) { + defs_[op].insert(output_var->Name()); + } + } + } +} + +void ControlFlowGraph::ConnectNodes() { + for (size_t i = 0; i < ops_.size(); ++i) { + auto& op = ops_[i]; + try { + auto& next_op = ops_.at(i + 1); + successors_[op].insert(next_op); + predecessors_[next_op].insert(op); + } catch (...) { + // do nothing + } + + FilterVariables(op->inputs, + [&](ir::Node* var) { uses_[op].emplace(var->Name()); }); + + FilterVariables(op->outputs, + [&](ir::Node* var) { defs_[op].emplace(var->Name()); }); + } +} + +void ControlFlowGraph::LiveVariableAnalysis() { + // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm) + // compute the liveness of for each variable though reversed_ops algorithm. + // It iterates the operators from end to begin, compute the live in/live out + // variable set for each op, then the diff between in/out will be used for + // the variable reuse. For detail refer to + // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf + std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend()); + while (!work_list.empty()) { + ir::Node* op = work_list.front(); + work_list.pop_front(); + // get the live_in calculated before. Empty if first. 
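+    // The loop body below restates the classic backward dataflow equations
+    // (in set notation):
+    //   live_out[op] = U { live_in[s] : s in successors[op] }
+    //   live_in[op]  = use[op] U (live_out[op] - def[op])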
+ auto prev_live_in = std::move(live_in_[op]); + for (auto& s : successors_[op]) { + for (auto& var : live_in_[s]) { + live_out_[op].insert(var); + } + } + for (auto& var : uses_[op]) { + live_in_[op].insert(var); + } + for (auto& var : live_out_[op]) { + live_in_[op].insert(var); + } + for (auto& var : defs_[op]) { + live_in_[op].erase(var); + } + + // If the live_in is not changed, then the liveness analysis of + // predecessors is completed. + // + // Otherwise, recalculate the predecessors liveness + if (live_in_[op] != prev_live_in) { + for (auto& pre : predecessors_[op]) { + work_list.push_back(pre); + } + } + } +} + +void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, + const std::string& new_node, + int begin_idx) { + // update graph from begin idx to the end + for (size_t i = begin_idx; i != ops_.size(); ++i) { + auto* op = ops_[i]; + if (uses_[op].find(old_node) != uses_[op].end()) { + uses_[op].erase(old_node); + uses_[op].insert(new_node); + } + if (defs_[op].find(old_node) != defs_[op].end()) { + defs_[op].erase(old_node); + defs_[op].insert(new_node); + } + if (live_in_[op].find(old_node) != live_in_[op].end()) { + live_in_[op].erase(old_node); + live_in_[op].insert(new_node); + } + if (live_out_[op].find(old_node) != live_out_[op].end()) { + live_out_[op].erase(old_node); + live_out_[op].insert(new_node); + } + } +} + +const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const { + auto it = live_in_.find(op); + PADDLE_ENFORCE( + it != live_in_.end(), + string::Sprintf("Expect %s in live_in, but Not Found.", op->Name())); + return it->second; +} + +const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const { + auto it = live_out_.find(op); + PADDLE_ENFORCE( + it != live_out_.end(), + string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); + return it->second; +} + +const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const { + auto it = uses_.find(op); + PADDLE_ENFORCE( + it != uses_.end(), + string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); + return it->second; +} + +const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; } + +std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; } + +ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name, + ir::Node* op) const { + // in ssa-graph, different version nodes have same name, + // this function get the latest version var before target op + // It may return nullptr, such as data node. + ir::Node* found_node = nullptr; + for (auto* node : ops_) { + if (node == op) break; + for (auto& output : node->outputs) { + if (output->Name() == name) { + found_node = output; + } + } + } + return found_node; +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_optimize_helper.h b/paddle/fluid/framework/details/memory_optimize_helper.h new file mode 100644 index 0000000000..0bfaf827fe --- /dev/null +++ b/paddle/fluid/framework/details/memory_optimize_helper.h @@ -0,0 +1,182 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include <algorithm> +#include <iostream> +#include <iterator> +#include <list> +#include <map> +#include <set> +#include <string> +#include <utility> +#include <vector> +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace details { + +constexpr char kAllOpDescs[] = "all_op_descs"; + +std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph); + +// NOTE(dzh): A ordered set for node reuse in memory optimize. +// the orderedset sort node in ascend order(by node bytes size). +// in fluid, -1 means the batch_size, which is determined in runtime. +// So the reuse happens between nodes who's batch_size both are -1 +// simultaneously or not. +// +// sort rule: +// rule 0 : smaller node ranking in front. +// rule 1 : batch_size equal -1 ranking in the front than the node not. +// +// For example, +// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], .. + +class OrderedSet { + public: + // nodes with same name exists in pool. + using NodeVector = std::vector<ir::Node*>; + using Iter = typename std::list<NodeVector>::iterator; + using ConstIter = typename std::list<NodeVector>::const_iterator; + + void Insert(ir::Node* var); + void Erase(ir::Node* var); + bool Has(ir::Node* var) const; + void Clear() { + mark_table_.clear(); + nodes_.clear(); + } + // find the bestfit shape node block with var. + ir::Node* FindBestFitNode(ir::Node* var) const; + // map store non-const iterator, can not promise const + int GetNodeIndexInPool(ir::Node* var); + // pool all node to string + std::string ToString() const; + + Iter begin() { return nodes_.begin(); } + Iter end() { return nodes_.end(); } + ConstIter begin() const { return nodes_.begin(); } + ConstIter end() const { return nodes_.end(); } + + size_t size() const { return nodes_.size(); } + + private: + // for searching. + std::unordered_map<std::string, Iter> mark_table_; + // node pool + std::list<NodeVector> nodes_; +}; + +class ControlFlowGraph { + public: + ControlFlowGraph() = default; + // IR Graph + explicit ControlFlowGraph(const ir::Graph& graph); + + void LiveVariableAnalysis(); + + void RenameVarInCFGGraph(const std::string& old_node, + const std::string& new_node, int begin_idx); + + const std::set<std::string> LiveIn(ir::Node* op) const; + const std::set<std::string> LiveOut(ir::Node* op) const; + const std::set<std::string> Use(ir::Node* op) const; + const std::vector<ir::Node*> Ops() const; + std::vector<ir::Node*>& Ops(); + + // for ssa-graph nodes + ir::Node* GetNodeByName(const std::string& name, ir::Node* op) const; + + private: + void BuildCFGGraph(); + void ConnectNodes(); + + using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>; + using VarSetMap = std::map<ir::Node*, std::set<std::string>>; + // successors ops use the output variables. + NodeListMap successors_; + // predecessors ops generated input variables. + NodeListMap predecessors_; + // variables lived before run current op. + VarSetMap live_in_; + // variables lived after run current op. 
+ VarSetMap live_out_; + VarSetMap uses_; // op inputs + VarSetMap defs_; // op outputs + + std::vector<ir::Node*> ops_; // op sequence by topology sort +}; + +// valid a tensor can be reuse or not +bool NodeCanReused(ir::Node* node); + +// valid a tensor can be reuse or not. +bool NodeCanReused(const VarDesc& node); + +// check op has subblock or not +bool OpHasSubBlock(OpDesc* desc); + +// node memory size in bytes +size_t NodeSize(ir::Node* n); + +// node memory size in bytes +size_t NodeSize(const VarDesc&); + +std::string DebugString(ir::Node* var); + +// NOTE(dzhwinter) +// after node reuse, the replaced node shape is +// different with its VarDesc. So need to find the +// correct VarDesc in Block. +VarDesc* FindVarDescInBlock(ir::Node* n); + +static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { + return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && + op1->Outputs() == op2->Outputs(); +} + +template <typename Container, typename Callback> +class FilterVariableImpl { + public: + void operator()(const Container& nodes, Callback callback) { + for (auto* node : nodes) { + callback(node); + } + } +}; + +// filter var node for op->inputs/outputs +template <typename Callback> +class FilterVariableImpl<std::vector<ir::Node*>, Callback> { + public: + void operator()(const std::vector<ir::Node*>& nodes, Callback callback) { + for (auto* var : nodes) { + if (var->IsVar() && !var->IsCtrlVar()) { + callback(var); + } + } + } +}; + +template <typename Container, typename Callback> +void FilterVariables(const Container& nodes, Callback callback) { + FilterVariableImpl<Container, Callback>()(nodes, callback); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/analysis_var_pass_test.cc b/paddle/fluid/framework/details/memory_optimize_helper_test.cc similarity index 82% rename from paddle/fluid/framework/details/analysis_var_pass_test.cc rename to paddle/fluid/framework/details/memory_optimize_helper_test.cc index 9bc4fd33f7..5c13dda9e5 100644 --- a/paddle/fluid/framework/details/analysis_var_pass_test.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper_test.cc @@ -12,12 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/details/analysis_var_pass.h" +#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include <algorithm> #include <iostream> #include <iterator> +#include <memory> +#include <sstream> +#include <string> +#include <utility> +#include <vector> #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/framework/details/graph_test_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_registry.h" @@ -26,46 +32,82 @@ namespace paddle { namespace framework { +namespace details { -class DummyOp : public OperatorBase { - public: - DummyOp(const std::string& type, const VariableNameMap& inputs, - const VariableNameMap& outputs, const AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const Scope& scope, - const platform::Place& place) const override {} -}; - -class SumOpMaker : public OpProtoAndCheckerMaker { - public: - void Make() { - AddInput("X", "").AsDuplicable(); - AddOutput("Out", ""); - AddComment(""); +TEST(OrderedSet, Normal) { + OrderedSet pool; + std::vector<std::unique_ptr<ir::Node>> nodes; + + // clang-format off + std::vector<std::vector<int64_t>> shapes = {{-1, 10}, + {-1, 20}, + {1, 2}, + {5, 2}, + {10, 20}, + {-1, 2, 5}, + {-1, 1, 5}, + {-1, 1}}; + // clang-format on + const int COUNT = shapes.size(); + ProgramDesc prog; + BlockDesc* block_desc = prog.MutableBlock(0); + auto* op_desc = block_desc->AppendOp(); + op_desc->SetType("dummy"); + std::unique_ptr<ir::Node> op = ir::CreateNodeForTest(op_desc); + + for (int i = 0; i < COUNT; ++i) { + auto desc = block_desc->Var(std::to_string(i)); + desc->SetShape(shapes[i]); + std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc); + node->inputs.emplace_back(op.get()); + nodes.emplace_back(std::move(node)); } -}; - -class AssignOpMaker : public OpProtoAndCheckerMaker { - public: - void Make() { - AddInput("X", "").AsDuplicable(); - AddOutput("Out", ""); - AddComment(""); + + // Insert + for (auto& node : nodes) { + pool.Insert(node.get()); } -}; - -class DummyVarTypeInference : public VarTypeInference { - public: - void operator()(const OpDesc& op_desc, BlockDesc* block) const override { - auto& inputs = op_desc.Input("X"); - auto type = block->Var(inputs.front())->GetType(); - auto out_var_name = op_desc.Output("Out").front(); - block->Var(out_var_name)->SetType(type); + + // Has/size + ASSERT_EQ(pool.size(), shapes.size()); + for (auto& node : nodes) { + ASSERT_TRUE(pool.Has(node.get())); } -}; + // assert its order and interface. 
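+  // ({-1, 1} is the smallest node in the batch-size(-1) group, so the
+  // assert below expects it at index 0 after the front node is erased;
+  // an illustrative reading of the sort rules, not an extra check.)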
+ std::cout << pool.ToString() << std::endl; + pool.Erase(nodes.front().get()); + std::cout << pool.ToString() << std::endl; + + ASSERT_EQ(pool.size(), static_cast<size_t>(COUNT - 1)); + ASSERT_EQ(pool.GetNodeIndexInPool(nodes.back().get()), 0); + + { + auto v1 = block_desc->Var("11"); + v1->SetShape({-1, 256, 56, 56}); + std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v1); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.FindBestFitNode(node1.get()); + ASSERT_EQ(cache, nullptr); + } + { + auto v2 = block_desc->Var("12"); + v2->SetShape({-1, 2, 5}); + std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v2); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.FindBestFitNode(node1.get()); + ASSERT_EQ(pool.GetNodeIndexInPool(cache), 2); // match 6:[-1,2,5] + } + { + auto v3 = block_desc->Var("13"); + v3->SetShape({2, 5}); + std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v3); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.FindBestFitNode(node1.get()); + ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5); // match 4:[5,2] + } +} +} // namespace details } // namespace framework } // namespace paddle @@ -102,11 +144,6 @@ namespace paddle { namespace framework { namespace details { -static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { - return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && - op1->Outputs() == op2->Outputs(); -} - inline static ProgramDesc FillProgramDesc() { ProgramDesc prog; prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); @@ -141,15 +178,6 @@ inline static ProgramDesc FillProgramDesc() { return prog; } -template <typename Container> -inline static std::string DebugString(const Container& c) { - std::stringstream ss; - for (auto& item : c) { - ss << item << " "; - } - return ss.str(); -} - TEST(CFGGraph, IRGraph) { // prepare ir graph auto prog = FillProgramDesc(); diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc new file mode 100644 index 0000000000..41e4a834df --- /dev/null +++ b/paddle/fluid/framework/details/memory_optimize_pass.cc @@ -0,0 +1,327 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/memory_optimize_pass.h" +#include <algorithm> +#include <atomic> +#include <deque> +#include <fstream> +#include <iostream> +#include <iterator> +#include <memory> +#include <queue> +#include <sstream> +#include <string> +#include <type_traits> +#include <vector> +#include "gflags/gflags.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +DEFINE_bool(enable_subgraph_optimize, false, + "SubGraph also reuse global graph variables, it will reduce the " + "memory occupation" + "but a higher risk of memory reuse error. 
Default disabled.");
+DEFINE_string(memory_optimize_debug, "",
+              "Debug the operator output variable of the memory reuse pass; "
+              "only for debugging, default disabled.");
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  auto nodes = graph->Nodes();
+  CollectSkipVarsSet(nodes);
+
+  cfg_.reset(new details::ControlFlowGraph(*graph));
+  cfg_->LiveVariableAnalysis();
+  InitSSAGraphNodes();
+
+  int reuse_id = 0;
+  for (size_t idx = 0; idx < cfg_->Ops().size(); ++idx) {
+    auto& op = cfg_->Ops()[idx];
+    auto* op_desc = op->Op();
+    // some ops in the graph have no op desc
+    if (op_desc == nullptr) continue;
+    if (OpHasSubBlock(op_desc)) {
+      if (FLAGS_enable_subgraph_optimize) {
+        SubGraphOptimize(op_desc);
+      } else {
+        VLOG(3) << op->Name()
+                << " has a sub-block, but subgraph optimize is disabled. "
+                << "Skipped.";
+        continue;
+      }
+    }
+
+    for (auto& var : op->outputs) {
+      if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 ||
+          skip_set_.count(var->Name()))
+        continue;
+      ir::Node* cache = pool_.FindBestFitNode(var);
+
+      if (var->Name() == FLAGS_memory_optimize_debug) {
+        VLOG(3) << "start match var " << DebugString(var) << " of op "
+                << op->Name();
+        VLOG(3) << pool_.ToString();
+        VLOG(3) << "matched in pool : "
+                << ((cache == nullptr) ? "False" : "True");
+      }
+
+      if (cache == nullptr) continue;
+      if (var->Name() == cache->Name()) {
+        VLOG(3) << "The same cache variable is cascade reused. " << var->Name()
+                << " is re-filled to the pool after "
+                << "the reused op is finished. Current op can not "
+                << "replace it again. Skip this candidate.";
+        continue;
+      }
+
+      int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
+      VLOG(3) << string::Sprintf(
+          "!!! %s,  %s => %s, cache idx %d, pool size %d",
+          std::to_string(reuse_id++), DebugString(var), DebugString(cache),
+          node_idx_in_pool, static_cast<int>(pool_.size()));
+
+      // update the CFG graph on the fly.
+      // the reused var may be re-filled into the pool
+      cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx);
+      // NOTE(dzhwinter): we need to update both the ProgramDesc
+      // and the IR Graph, because op_desc/var_desc are used by
+      // CreateOp/CreateVar when the program runs, while the IR Graph
+      // defines the dependence relationships between nodes.
+      RenameVarInGraphDesc(var->Name(), cache->Name(), idx);
+      RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get());
+
+      pool_.Erase(cache);
+
+      // fill the pool
+      std::unordered_set<std::string> unlived_vars;
+      for (auto var : cfg_->LiveIn(op)) {
+        if (cfg_->LiveOut(op).count(var) == 0) {
+          unlived_vars.emplace(var);
+        }
+      }
+      for (auto var : unlived_vars) {
+        ir::Node* var_node = cfg_->GetNodeByName(var, op);
+        if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
+          pool_.Insert(var_node);
+        }
+      }
+    }
+  }
+  graph->ResolveHazard(var_nodes_);
+
+  return graph;
+}
+
+void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
+  // conditional_block, while op and their grad ops
+  auto* sub_block_desc =
+      AttrReader(op_desc->GetAttrMap()).Get<BlockDesc*>("sub_block");
+
+  // create a mirror block to construct an IR Graph.
+  ProgramDesc prog;
+  auto* copy_block = prog.MutableBlock(0);
+  for (auto* op : sub_block_desc->AllOps()) {
+    auto* copy_op = copy_block->AppendOp();
+    copy_op->CopyFrom(*op);
+    copy_op->Flush();
+  }
+
+  for (auto* var : sub_block_desc->AllVars()) {
+    auto* copy_var = copy_block->Var(var->Name());
+    copy_var->SetDataType(var->GetDataType());
+    // only LoDTensor can be reused, so other variable types are ignored.
+    copy_var->SetType(var->GetType());
+    copy_var->SetShape(var->GetShape());
+    copy_var->SetPersistable(var->Persistable());
+  }
+
+  ir::Graph sub_graph(prog);
+  std::unordered_set<ir::Node*> sub_graph_all_ops;
+  FilterVariables(sub_graph.Nodes(), [&](ir::Node* var) {
+    if (var->IsVar() && !var->IsCtrlVar()) {
+      sub_graph_all_ops.emplace(var);
+    }
+  });
+  int sub_reuse_id = 0;
+  // subgraph nodes are unordered, while reuse needs to follow the desc
+  // order, so find the right op node through the descs.
+  for (auto* sub_op_desc : sub_block_desc->AllOps()) {
+    ir::Node* sub_op = nullptr;
+    for (auto* node : sub_graph_all_ops) {
+      if (node->Op() == sub_op_desc) {
+        sub_op = node;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(sub_op != nullptr);
+    for (auto* var : sub_op->outputs) {
+      if (NodeCanReused(var)) {
+        ir::Node* cache = pool_.FindBestFitNode(var);
+        if (cache != nullptr) {
+          if (var->Var()->GetDataType() != cache->Var()->GetDataType()) {
+            continue;
+          }
+          int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
+          VLOG(3) << string::Sprintf(
+              "!!! %s,  %s => %s, cache idx %d, pool size %d",
+              std::to_string(sub_reuse_id++), DebugString(var),
+              DebugString(cache), node_idx_in_pool,
+              static_cast<int>(pool_.size()));
+          // NOTE(dzh): the subblock is not in the IR graph. Modify the
+          // block_desc immediately so that the subblock variable reuse
+          // strategy takes effect, because the subblock appears as a
+          // single op in the graph; no need to update the ir nodes.
+          sub_op_desc->Rename(var->Name(), cache->Name());
+          if (sub_op_desc->Block()->HasVar(var->Name())) {
+            sub_op_desc->Block()->RemoveVar(var->Name());
+          }
+        }
+      }
+    }
+  }
+}
+
+void MemoryOptimizePass::CollectSkipVarsSet(
+    const std::unordered_set<ir::Node*>& nodes) const {
+  auto update_skip_set = [&](OpDesc* op_desc) {
+    auto inputs = op_desc->InputArgumentNames();
+    auto outputs = op_desc->OutputArgumentNames();
+    skip_set_.insert(inputs.begin(), inputs.end());
+    skip_set_.insert(outputs.begin(), outputs.end());
+  };
+  for (auto& op : nodes) {
+    if (!op->IsOp() || op->Op() == nullptr) continue;
+    auto* op_desc = op->Op();
+    // NOTE(dzhwinter):
+    // the current block can not reuse next-level block vars.
+    if (OpHasSubBlock(op_desc)) update_skip_set(op_desc);
+    // NOTE(dzhwinter):
+    // distributed ops' input/output names need to
+    // stay the same between trainer and pserver.
+    if (op_desc->Type() == "send") update_skip_set(op_desc);
+    if (op_desc->Type() == "recv") update_skip_set(op_desc);
+    if (op_desc->Type() == "prefetch") update_skip_set(op_desc);
+  }
+}
+
+void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,
+                                              const std::string& cache_var,
+                                              size_t idx) const {
+  for (size_t i = idx; i < cfg_->Ops().size(); ++i) {
+    auto* op = cfg_->Ops()[i];
+    PADDLE_ENFORCE(op->IsOp() && op->Op());
+    auto* op_desc = op->Op();
+    op_desc->RenameInput(var, cache_var);
+    op_desc->RenameOutput(var, cache_var);
+    if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var);
+    op_desc->Flush();
+  }
+}
+
+void MemoryOptimizePass::InitSSAGraphNodes() const {
+  std::unordered_map<std::string, std::unordered_set<ir::Node*>> all_vars;
+  if (var_nodes_.empty()) {
+    for (auto* op : cfg_->Ops()) {
+      for (auto* node : op->inputs) {
+        if (all_vars[node->Name()].count(node) == 0) {
+          all_vars[node->Name()].emplace(node);
+          var_nodes_[node->Name()].emplace_back(node);
+        }
+      }
+      for (auto* node : op->outputs) {
+        if (all_vars[node->Name()].count(node) == 0) {
+          all_vars[node->Name()].emplace(node);
+          var_nodes_[node->Name()].emplace_back(node);
+        }
+      }
+    }
+  }
+}
+
+void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
+                                              const std::string& cache_var,
+                                              size_t idx,
+                                              ir::Graph* graph) const {
+  // if a replacement happens, we need to create a newer version of
+  // cache_var with the same dims/data_type as var.
+  PADDLE_ENFORCE(var_nodes_[var].size() >= 1 &&
+                 var_nodes_[var].at(0)->Var() != nullptr);
+  std::unique_ptr<VarDesc> var_desc(new VarDesc(*var_nodes_[var].at(0)->Var()));
+  var_desc->SetName(cache_var);
+
+  for (size_t i = idx; i < cfg_->Ops().size(); ++i) {
+    auto* op = cfg_->Ops()[i];
+
+    // redirect the input to the latest version of cache_var
+    for (auto* node : op->inputs) {
+      if (node->Name() == var) {
+        ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
+        var_nodes_[cache_var].emplace_back(cache_node);
+
+        // swap node to cache_node
+        cache_node->outputs.insert(cache_node->outputs.end(),
+                                   node->outputs.begin(), node->outputs.end());
+        PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp());
+        auto* prev_op = node->inputs[0];
+        std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node,
+                     cache_node);
+        cache_node->inputs.emplace_back(prev_op);
+        for (auto* next_op : node->outputs) {
+          std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
+                       cache_node);
+        }
+      }
+    }
+
+    // if we need to rename the output,
+    // always create a newer version of cache_var
+    for (auto* node : op->outputs) {
+      if (node->Name() == var) {
+        ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
+        var_nodes_[cache_var].emplace_back(cache_node);
+
+        // swap node to cache_node
+        cache_node->outputs.insert(cache_node->outputs.end(),
+                                   node->outputs.begin(), node->outputs.end());
+        cache_node->inputs.emplace_back(op);
+        std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node);
+        for (auto* next_op : node->outputs) {
+          std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
+                       cache_node);
+        }
+      }
+    }
+  }
+
+  // release the nodes of the unused var in the graph
+  for (auto* node : var_nodes_[var]) {
+    graph->RemoveNode(node);
+  }
+  var_nodes_.at(var).clear();
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
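The reuse loop in `ApplyImpl` above is, at its core, a best-fit allocator driven by liveness: each op output looks for the smallest pooled (dead) variable that is at least as large, and the variables whose last use is the current op are refilled into the pool afterwards. Below is a minimal standalone sketch of that policy, with an `std::multimap` standing in for `OrderedSet` and plain names and byte sizes instead of `ir::Node*`; the variable names and sizes are illustrative, not Paddle API.

```cpp
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  // Byte sizes of three variables produced in topological order.
  std::map<std::string, size_t> var_size = {
      {"a", 400}, {"b", 400}, {"c", 100}};
  // Dead variables waiting for reuse, kept ordered by size.
  std::multimap<size_t, std::string> pool;

  struct Op {
    std::string output;                 // the variable this op creates
    std::vector<std::string> last_use;  // variables that die at this op
  };
  std::vector<Op> ops = {{"a", {}}, {"b", {"a"}}, {"c", {"b"}}};

  for (const auto& op : ops) {
    // FindBestFitNode analogue: smallest pooled var that can hold the output.
    auto it = pool.lower_bound(var_size[op.output]);
    if (it != pool.end()) {
      std::cout << op.output << " reuses the buffer of " << it->second << "\n";
      pool.erase(it);  // mirrors pool_.Erase(cache)
    }
    // LiveIn(op) \ LiveOut(op): refill the pool with variables that died here.
    for (const auto& v : op.last_use) pool.emplace(var_size[v], v);
  }
  return 0;
}
```

Running this prints `c reuses the buffer of a`: `a` enters the pool once it dies at `b`, and `c` (100 bytes) best-fits into `a`'s 400-byte buffer, mirroring the `FindBestFitNode` / rename / `pool_.Erase` / refill sequence above (minus the batch-size and skip-set checks).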
+REGISTER_PASS(memory_optimize_pass, + paddle::framework::details::MemoryOptimizePass) + .RequireGraphAttr(paddle::framework::details::kAllOpDescs); diff --git a/paddle/fluid/framework/details/memory_optimize_pass.h b/paddle/fluid/framework/details/memory_optimize_pass.h new file mode 100644 index 0000000000..593ffc10fc --- /dev/null +++ b/paddle/fluid/framework/details/memory_optimize_pass.h @@ -0,0 +1,71 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include <algorithm> +#include <list> +#include <map> +#include <memory> +#include <set> +#include <string> +#include <unordered_map> +#include <utility> +#include <vector> + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/details/memory_optimize_helper.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { + +class MemoryOptimizePass : public ir::Pass { + protected: + std::unique_ptr<ir::Graph> ApplyImpl( + std::unique_ptr<ir::Graph> graph) const override; + // fill the variable map(var_nodes) by version. + void InitSSAGraphNodes() const; + + private: + // update program descs + void RenameVarInGraphDesc(const std::string& var, + const std::string& cache_var, size_t idx) const; + // update ir nodes + void RenameVarInGraphNode(const std::string& var, + const std::string& cache_var, size_t idx, + ir::Graph* graph) const; + + void SubGraphOptimize(OpDesc* op_desc) const; + // 1. scan op with subblock and collect the output/input vars. + // while, while_grad, conditional_block + // 2. scan distributed ops and collect the output/input vars + void CollectSkipVarsSet(const std::unordered_set<ir::Node*>&) const; + + private: + // Reuse Node Pool, Owned. + mutable OrderedSet pool_; + // controlflow Graph + mutable std::unique_ptr<ControlFlowGraph> cfg_; + // skip set + mutable std::unordered_set<std::string> skip_set_; + // var nodes + mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types.cc b/paddle/fluid/framework/details/memory_reuse_types.cc deleted file mode 100644 index 2b9ff518b9..0000000000 --- a/paddle/fluid/framework/details/memory_reuse_types.cc +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/details/memory_reuse_types.h" -#include <iostream> -#include <sstream> -#include <string> - -namespace paddle { -namespace framework { -namespace details { - -size_t NodeSizeInBytes(ir::Node* n) { - auto* desc = FindVarDescInBlock(n); - auto shape = desc->GetShape(); - size_t type_size = SizeOfType(desc->GetDataType()); - int size = 1; - for (auto& s : shape) { - size *= s; - } - return type_size * std::abs(size); -} - -std::string DebugStringImpl(VarDesc* var) { - std::stringstream ss; - ss << var->Name(); - ss << "["; - try { - auto shape = var->GetShape(); - for (size_t i = 0; i < shape.size(); ++i) { - if (i != shape.size() - 1) { - ss << shape[i] << ","; - } else { - ss << shape[i]; - } - } - ss << "]"; - } catch (...) { - ss << "Var has no VarDesc !!! Name:" << var->Name(); - } - return ss.str(); -} - -std::string DebugString(ir::Node* var) { - return DebugStringImpl(FindVarDescInBlock(var)); -} -// return DebugString(var->Var()); } - -// NOTE(dzh): based ir node, if a large node has been reused -// by a small size node, then next time it appear in pool, it will -// have the small size. Find the original node shap from blockdesc. -VarDesc* FindVarDescInBlock(ir::Node* n) { - PADDLE_ENFORCE(n->IsVar() && !n->IsCtrlVar() && n->inputs.size() == 1); - BlockDesc* block = n->inputs[0]->Op()->Block(); - PADDLE_ENFORCE(block->HasVar(n->Name()), - string::Sprintf("Block do not has var %s", n->Name())); - return block->FindVar(n->Name()); -} - -struct NodeComparator { - bool operator()(ir::Node* lhs, ir::Node* rhs) const { - auto* lhs_desc = FindVarDescInBlock(lhs); - auto* rhs_desc = FindVarDescInBlock(rhs); - auto lhs_shape = lhs_desc->GetShape(); - auto rhs_shape = rhs_desc->GetShape(); - if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) || - (lhs_shape[0] != -1 && rhs_shape[0] != -1)) { - return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs); - } else { - return false; - } - } -}; - -void OrderedNodePairPool::Insert(ir::Node* var, ir::Node* op) { - PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar()); - PADDLE_ENFORCE(op->IsOp()); - if (mark_table_.count(var->Name()) != 0) { - mark_table_[var->Name()]->second.insert(op); - return; - } - - auto* var_desc = FindVarDescInBlock(var); - auto var_shape = var_desc->GetShape(); - int batch_size = static_cast<int>(var_shape[0]); - - NodeComparator compare_node; - Iter it = nodes_.begin(); - while (it != nodes_.end()) { - auto* cache_desc = FindVarDescInBlock(it->first); - int cache_batch_size = cache_desc->GetShape()[0]; - if ((cache_batch_size == -1 && batch_size == -1) || - (cache_batch_size != -1 && batch_size != -1)) { - if (compare_node(it->first, var)) { - ++it; - } else { - break; - } - } else if (cache_batch_size == -1 && batch_size != -1) { - ++it; - } else if (cache_batch_size != -1 && batch_size == -1) { - break; - } - } - - it = - nodes_.insert(it, std::make_pair(var, std::unordered_set<ir::Node*>{op})); - mark_table_[var->Name()] = it; -} - -int OrderedNodePairPool::GetIndex(ir::Node* var) { - return std::distance(nodes_.begin(), mark_table_[var->Name()]); -} - -ir::Node* OrderedNodePairPool::NodeMatch(ir::Node* var) const { - ir::Node* found_node = nullptr; - NodeComparator compare_node; - - for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { - if (compare_node(var, it->first)) { - found_node = it->first; - break; - } - } - return found_node; -} - -void OrderedNodePairPool::Erase(ir::Node* var) 
{ - PADDLE_ENFORCE(mark_table_.count(var->Name())); - nodes_.erase(mark_table_[var->Name()]); - mark_table_.erase(var->Name()); -} - -std::string OrderedNodePairPool::ToString() const { - std::stringstream ss; - for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { - ss << DebugString(it->first) << " "; - } - return ss.str(); -} - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types.h b/paddle/fluid/framework/details/memory_reuse_types.h deleted file mode 100644 index 9a9c1d948e..0000000000 --- a/paddle/fluid/framework/details/memory_reuse_types.h +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include <algorithm> -#include <iostream> -#include <iterator> -#include <list> -#include <string> -#include <utility> -#include <vector> -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/ir/graph.h" - -namespace paddle { -namespace framework { -namespace details { - -constexpr char kFetchedVars[] = "fetched_vars"; -constexpr char kGraphNodePool[] = "graph_node_pool"; - -// NOTE(dzh): Variable and the operators use the var. -// for early delete pass. -// Because analysis var pass build base on ir::Node, which maybe released -// or modified between passes, so we use OpDesc* to mark ops. -using GraphNodePool = std::vector< - std::pair<std::string /*var node*/, std::unordered_set<OpDesc*> /* ops */>>; - -// NOTE(dzh): by default, it sort node in ascend order(by node bytes size). -// in fluid, -1 means the batch_size is determined in runtime. -// the node batch_size equal -1 always ranking in the front than the node not. -// For example, -// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], .. -// O(1) insert, delete -class OrderedNodePairPool { - public: - using NodePair = std::pair<ir::Node*, std::unordered_set<ir::Node*>>; - using Iter = typename std::list<NodePair>::iterator; - using ConstIter = typename std::list<NodePair>::const_iterator; - - void Insert(ir::Node* var, ir::Node* op); - - void Erase(ir::Node* var); - - bool Has(ir::Node* var) { return mark_table_.count(var->Name()); } - - ir::Node* NodeMatch(ir::Node* var) const; - // map store non-const iterator, can not promise const - int GetIndex(ir::Node* var); - // pool all node to string - std::string ToString() const; - - Iter begin() { return nodes_.begin(); } - Iter end() { return nodes_.end(); } - ConstIter begin() const { return nodes_.begin(); } - ConstIter end() const { return nodes_.end(); } - size_t size() const { return nodes_.size(); } - - private: - // for searching. - std::unordered_map<std::string, Iter> mark_table_; - // node swap pairs. 
var -> ops dep var - std::list<NodePair> nodes_; -}; - -// node memory size in bytes -size_t NodeSizeInBytes(ir::Node* n); - -std::string DebugString(ir::Node* var); - -// std::string DebugString(VarDesc* var); -VarDesc* FindVarDescInBlock(ir::Node* n); - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types_test.cc b/paddle/fluid/framework/details/memory_reuse_types_test.cc deleted file mode 100644 index d2fabf5ce0..0000000000 --- a/paddle/fluid/framework/details/memory_reuse_types_test.cc +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/details/memory_reuse_types.h" -#include <algorithm> -#include <iostream> -#include <memory> -#include <sstream> -#include <string> -#include <utility> -#include <vector> -#include "glog/logging.h" -#include "gtest/gtest.h" - -namespace paddle { -namespace framework { -namespace details { - -TEST(OrderedNodePairPool, Normal) { - OrderedNodePairPool pool; - std::vector<std::unique_ptr<ir::Node>> nodes; - - // clang-format off - std::vector<std::vector<int64_t>> shapes = {{-1, 10}, - {-1, 20}, - {1, 2}, - {5, 2}, - {10, 20}, - {-1, 2, 5}, - {-1, 1, 5}, - {-1, 1}}; - // clang-format on - const int COUNT = shapes.size(); - ProgramDesc prog; - BlockDesc* block_desc = prog.MutableBlock(0); - auto* op_desc = block_desc->AppendOp(); - op_desc->SetType("dummy"); - std::unique_ptr<ir::Node> op = ir::CreateNodeForTest(op_desc); - - for (int i = 0; i < COUNT; ++i) { - auto desc = block_desc->Var(std::to_string(i)); - desc->SetShape(shapes[i]); - std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc); - node->inputs.emplace_back(op.get()); - nodes.emplace_back(std::move(node)); - } - - for (auto& node : nodes) { - pool.Insert(node.get(), op.get()); - } - - // assert its order and interface. 
- std::cout << pool.ToString() << std::endl; - pool.Erase(nodes.front().get()); - std::cout << pool.ToString() << std::endl; - - ASSERT_EQ(pool.size(), static_cast<size_t>(COUNT - 1)); - ASSERT_EQ(pool.GetIndex(nodes.back().get()), 0); - - { - auto v1 = block_desc->Var("11"); - v1->SetShape({-1, 256, 56, 56}); - std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v1); - node1->inputs.emplace_back(op.get()); - auto* cache = pool.NodeMatch(node1.get()); - ASSERT_EQ(cache, nullptr); - } - { - auto v2 = block_desc->Var("12"); - v2->SetShape({-1, 2, 5}); - std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v2); - node1->inputs.emplace_back(op.get()); - auto* cache = pool.NodeMatch(node1.get()); - ASSERT_EQ(pool.GetIndex(cache), 2); // match 6:[-1,2,5] - } - { - auto v3 = block_desc->Var("13"); - v3->SetShape({2, 5}); - std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v3); - node1->inputs.emplace_back(op.get()); - auto* cache = pool.NodeMatch(node1.get()); - ASSERT_EQ(pool.GetIndex(cache), 5); // match 4:[5,2] - } -} - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index eea7e712f8..0901e59f97 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -18,6 +18,7 @@ limitations under the License. */ #include <tuple> #include <vector> #include "paddle/fluid/framework/grad_op_desc_maker.h" +#include "paddle/fluid/framework/inplace_op_inference.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" @@ -32,7 +33,8 @@ enum OpInfoFillType { kOpProtoAndCheckerMaker = 1, kGradOpDescMaker = 2, kVarTypeInference = 3, - kShapeInference = 4 + kShapeInference = 4, + kInplaceOpInference = 5 }; template <typename T> @@ -48,8 +50,11 @@ struct OpInfoFillTypeID { ? kVarTypeInference : (std::is_base_of<InferShapeBase, T>::value ? kShapeInference - : static_cast<OpInfoFillType>( - -1))))); + : (std::is_base_of< + InplaceOpInference, T>::value + ? 
kInplaceOpInference + : static_cast<OpInfoFillType>( + -1)))))); } }; @@ -139,6 +144,16 @@ struct OpInfoFiller<T, kShapeInference> { } }; +template <typename T> +struct OpInfoFiller<T, kInplaceOpInference> { + void operator()(const char* op_type, OpInfo* info) const { + info->infer_inplace_ = [](const OpDesc& op_desc, BlockDesc* block) { + T infer; + return infer(op_desc, block); + }; + } +}; + } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 128aaa33a2..e8deb5bfc6 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -65,7 +65,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run( if (pool_) { run_futures.emplace_back(pool_->enqueue(std::move(call))); } else { - fetch_data.emplace_back(std::move(call())); + fetch_data.emplace_back(call()); } } @@ -74,7 +74,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run( if (exception_holder_.IsCaught()) { f.wait(); } else { - fetch_data.emplace_back(std::move(f.get())); + fetch_data.emplace_back(f.get()); } } } diff --git a/paddle/fluid/framework/details/sequential_execution_pass.cc b/paddle/fluid/framework/details/sequential_execution_pass.cc index cc2c8bfef9..879fb29d59 100644 --- a/paddle/fluid/framework/details/sequential_execution_pass.cc +++ b/paddle/fluid/framework/details/sequential_execution_pass.cc @@ -17,6 +17,7 @@ #include <unordered_map> #include <unordered_set> #include <vector> +#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { diff --git a/paddle/fluid/framework/details/sequential_execution_pass.h b/paddle/fluid/framework/details/sequential_execution_pass.h index a04c08bc2e..ea3034877f 100644 --- a/paddle/fluid/framework/details/sequential_execution_pass.h +++ b/paddle/fluid/framework/details/sequential_execution_pass.h @@ -21,8 +21,6 @@ namespace paddle { namespace framework { namespace details { -constexpr char kAllOpDescs[] = "all_op_descs"; - class SequentialExecutionPass : public ir::Pass { protected: std::unique_ptr<ir::Graph> ApplyImpl( diff --git a/paddle/fluid/framework/inplace_op_inference.h b/paddle/fluid/framework/inplace_op_inference.h new file mode 100644 index 0000000000..a3ccf677c9 --- /dev/null +++ b/paddle/fluid/framework/inplace_op_inference.h @@ -0,0 +1,115 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
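Before the header body, a note on how the inference classes declared below reach the registry: the `op_registry.h` hunk above extends the `std::is_base_of` dispatch ladder with `kInplaceOpInference`, and the new `OpInfoFiller` specialization stores a type-erased lambda in `OpInfo::infer_inplace_`. A self-contained sketch of that registration pattern follows; the `Registry` and `InplaceBase` names are invented for illustration and are not Paddle's.

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <type_traits>
#include <unordered_map>

// Plays the role of InplaceOpInference.
struct InplaceBase {
  virtual ~InplaceBase() = default;
  virtual std::string operator()(const std::string& op_type) const = 0;
};

// Plays the role of OpInfoMap plus OpInfoFiller.
struct Registry {
  std::unordered_map<std::string,
                     std::function<std::string(const std::string&)>>
      infer_inplace;

  template <typename T>
  void Register(const std::string& op_type) {
    // Compile-time dispatch, like OpInfoFillTypeID's is_base_of ladder.
    static_assert(std::is_base_of<InplaceBase, T>::value,
                  "T must derive from InplaceBase");
    // Type-erased thunk, like OpInfoFiller<T, kInplaceOpInference>.
    infer_inplace[op_type] = [](const std::string& op) {
      T infer;
      return infer(op);
    };
  }
};

struct ReluInplace : InplaceBase {
  std::string operator()(const std::string& op_type) const override {
    return op_type + ": X -> Out";  // a single-in, single-out reuse pair
  }
};

int main() {
  Registry reg;
  reg.Register<ReluInplace>("relu");
  std::cout << reg.infer_inplace["relu"]("relu") << "\n";  // relu: X -> Out
  return 0;
}
```

The same trick is what lets `REGISTER_OPERATOR(single_op, ..., f::SingleOpInplaceInToOut, ...)` in the test file further down pick the right filler purely from the base class of each template argument.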
+
+#pragma once
+#include <functional>
+#include <numeric>
+#include <string>
+#include <unordered_map>
+#include "glog/logging.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/details/memory_optimize_helper.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/type_defs.h"
+
+namespace paddle {
+namespace framework {
+
+/*
+  Inplace inference creates In->Out pairs for an in-place operator.
+  If we specify a pair of corresponding names, for example X->Out,
+  then Out will reuse X's memory in place. The base class does the
+  legality validation for both variables.
+*/
+class InplaceOpInference {
+ public:
+  virtual ~InplaceOpInference() {}
+  virtual std::unordered_map<std::string, std::string> operator()(
+      const OpDesc& op_desc, BlockDesc* block) const = 0;
+};
+
+class InplaceInToOut : public InplaceOpInference {
+ public:
+  std::unordered_map<std::string, std::string> operator()(
+      const OpDesc& op_desc, BlockDesc* block) const {
+    std::unordered_map<std::string, std::string> ret;
+    auto in_out_var_names_pair = this->Apply(op_desc, block);
+    for (auto& pair : in_out_var_names_pair) {
+      PADDLE_ENFORCE(!op_desc.Input(pair.first).empty(),
+                     string::Sprintf("op %s does not have input %s!",
+                                     op_desc.Type(), pair.first));
+      PADDLE_ENFORCE(!op_desc.Output(pair.second).empty(),
+                     string::Sprintf("op %s does not have output %s!",
+                                     op_desc.Type(), pair.second));
+      auto& in_name = op_desc.Input(pair.first).at(0);
+      auto& out_name = op_desc.Output(pair.second).at(0);
+
+      auto in = block->FindRecursiveOrCreateVar(in_name);
+      auto out = block->FindRecursiveOrCreateVar(out_name);
+      if (TryInplaceInputOutput(in, out)) ret.insert({in_name, out_name});
+    }
+    return ret;
+  }
+
+ protected:
+  virtual std::unordered_map<std::string, std::string> Apply(
+      const OpDesc& op_desc, BlockDesc* block) const = 0;
+
+  bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const {
+    return in.Name() != out.Name() && details::NodeCanReused(in) &&
+           details::NodeCanReused(out) &&
+           details::NodeSize(out) <= details::NodeSize(in);
+  }
+};
+
+/*
+  In->Out inplace mapping for an operator that has exactly one input
+  and one output, for example an activation op.
+ */
+class SingleOpInplaceInToOut : public InplaceInToOut {
+ protected:
+  std::unordered_map<std::string, std::string> Apply(
+      const OpDesc& op_desc, BlockDesc* block) const override {
+    PADDLE_ENFORCE(!op_desc.InputNames().empty(),
+                   "Op inputs must not be empty");
+    PADDLE_ENFORCE(!op_desc.OutputNames().empty(),
+                   "Op outputs must not be empty");
+    auto x_name = op_desc.InputNames().at(0);
+    auto out_name = op_desc.OutputNames().at(0);
+    return std::unordered_map<std::string, std::string>{{x_name, out_name}};
+  }
+};
+
+/*
+  Gradient op: the output reuses its input's memory in place.
+  For example, the Input@Grad->Input reuse strategy.
+ */ +class GradOpInplaceInToOut : public InplaceInToOut { + protected: + std::unordered_map<std::string, std::string> Apply( + const OpDesc& op_desc, BlockDesc* block) const override { + std::unordered_map<std::string, std::string> ret; + std::unordered_set<std::string> output_names(op_desc.OutputNames().begin(), + op_desc.OutputNames().end()); + for (auto& input_name : op_desc.InputNames()) { + if (output_names.count(GradVarName(input_name))) { + ret.insert({input_name, GradVarName(input_name)}); + } + } + return ret; + } +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/inplace_op_inference_test.cc b/paddle/fluid/framework/inplace_op_inference_test.cc new file mode 100644 index 0000000000..3e4d715c6f --- /dev/null +++ b/paddle/fluid/framework/inplace_op_inference_test.cc @@ -0,0 +1,288 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include <iterator> +#include <string> +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace framework { + +class NOP : public OperatorBase { + public: + NOP(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const Scope& scope, + const platform::Place& place) const override {} +}; + +class SingleOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class SingleGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("single_op_grad"); + op->SetInput("Out", OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return std::unique_ptr<OpDesc>(op); + } +}; + +class SingleOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + ctx->HasInput("X"); + ctx->HasOutput("Out"); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } +}; + +class SingleGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + ctx->HasInput(framework::GradVarName("Out")); + ctx->HasOutput(framework::GradVarName("X")); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); + } +}; + +class MultiOutOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddInput("Y", "").AsDuplicable(); + AddInput("Z", "").AsDuplicable(); + AddOutput("Out", ""); + 
AddOutput("YOut", ""); + AddOutput("ZOut", ""); + AddOutput("NotReuseOut", ""); + AddComment(""); + } +}; + +class MultiOutShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + ctx->ShareDim("X", "Out"); + ctx->ShareDim("Y", "YOut"); + ctx->ShareDim("Z", "ZOut"); + } +}; + +class MultiGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("multi_out_grad"); + op->SetInput("X", Input("X")); + op->SetOutput(framework::GradVarName("Y"), OutputGrad("YOut")); + op->SetOutput(framework::GradVarName("X"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("Z"), OutputGrad("ZOut")); + return std::unique_ptr<framework::OpDesc>(op); + } +}; + +class MultiOutGradShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("Y"), + ctx->GetInputDim(framework::GradVarName("YOut"))); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); + ctx->SetOutputDim(framework::GradVarName("Z"), + ctx->GetInputDim(framework::GradVarName("ZOut"))); + } +}; + +class MultiOutInplaceInToOut : public framework::InplaceInToOut { + public: + using framework::InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map<std::string, std::string> Apply( + const OpDesc& op_desc, BlockDesc* block) const override { + return std::unordered_map<std::string, std::string>{ + {"X", "Out"}, {"Y", "YOut"}, {"Z", "ZOut"}, + }; + } +}; + +class MultiOutGradInplaceInToOut : public framework::InplaceInToOut { + public: + using framework::InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map<std::string, std::string> Apply( + const OpDesc& op_desc, BlockDesc* block) const override { + return std::unordered_map<std::string, std::string>{ + {framework::GradVarName("YOut"), framework::GradVarName("Y")}, + {framework::GradVarName("Out"), framework::GradVarName("X")}, + {framework::GradVarName("ZOut"), framework::GradVarName("Z")}, + }; + } +}; + +} // namespace framework +} // namespace paddle + +namespace f = paddle::framework; +REGISTER_OPERATOR(single_op, f::NOP, f::SingleOpMaker, f::SingleGradOpMaker, + f::SingleOpInplaceInToOut, f::SingleOpShapeInference); +REGISTER_OPERATOR(single_op_grad, f::NOP, f::SingleOpInplaceInToOut, + f::SingleGradOpShapeInference); +REGISTER_OPERATOR(multi_out_op, f::NOP, f::MultiOutOpMaker, f::MultiGradOpMaker, + f::MultiOutInplaceInToOut, f::MultiOutShapeInference); +REGISTER_OPERATOR(multi_out_grad, f::NOP, f::MultiOutGradInplaceInToOut, + f::MultiOutGradShapeInference); + +namespace paddle { +namespace framework { + +TEST(InferInplace, SingleOpInplaceInToOut) { + ProgramDesc prog; + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("single_op"); + op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); + op->SetOutput("Out", {"test2_out"}); + + prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64}); + prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("test2_out"); + prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16}); + + 
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; + auto in_to_outs = infer_inplace(*op, op->Block()); + EXPECT_EQ(in_to_outs.size(), 1ul); + auto it = in_to_outs.begin(); + EXPECT_EQ(it->first, "test2_a"); + EXPECT_EQ(it->second, "test2_out"); +} + +TEST(InferInplace, SingleGradOpInplaceInToOut) { + ProgramDesc prog; + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("single_op_grad"); + op->SetInput(GradVarName("Out"), {"test2_out"}); + op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"}); + + prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("test2_out"); + prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16}); + + auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; + auto in_to_outs = infer_inplace(*op, op->Block()); + EXPECT_EQ(in_to_outs.size(), 1ul); + auto it = in_to_outs.begin(); + EXPECT_EQ(it->first, "test2_out"); + EXPECT_EQ(it->second, "test2_a"); +} + +TEST(InferInplace, MultiOutInplaceInToOut) { + ProgramDesc prog; + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("multi_out_op"); + op->SetInput("X", {"a0", "a1"}); + op->SetInput("Y", {"b0"}); + op->SetInput("Z", {"c0", "c1"}); + op->SetOutput("Out", {"o0"}); + op->SetOutput("YOut", {"y0"}); + op->SetOutput("ZOut", {"z0"}); + + prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("o0"); + prog.MutableBlock(0)->Var("y0"); + prog.MutableBlock(0)->Var("z0"); + prog.MutableBlock(0)->Var("a0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("b0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("c0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("o0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("y0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("z0")->SetShape({32, 16}); + + auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; + auto in_to_outs = infer_inplace(*op, op->Block()); + EXPECT_EQ(in_to_outs.size(), 3ul); + std::unordered_map<std::string, std::string> expects = { + {"a0", "o0"}, {"b0", "y0"}, {"c0", "z0"}, + }; + EXPECT_TRUE(expects == in_to_outs); +} + +TEST(InferInplace, MultiGradInplaceInToOut) { + ProgramDesc prog; + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("multi_out_grad"); + op->SetInput(GradVarName("Out"), {"o0"}); + op->SetInput(GradVarName("YOut"), {"y0"}); + op->SetInput(GradVarName("ZOut"), {"z0"}); + op->SetOutput(GradVarName("X"), {"a0", "a1"}); + op->SetOutput(GradVarName("Y"), {"b0"}); + op->SetOutput(GradVarName("Z"), {"c0", "c1"}); + + prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("o0"); + prog.MutableBlock(0)->Var("y0"); + prog.MutableBlock(0)->Var("z0"); + prog.MutableBlock(0)->Var("a0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("b0")->SetShape({32, 16}); + 
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("o0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("y0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("z0")->SetShape({32, 16}); + + auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; + auto in_to_outs = infer_inplace(*op, op->Block()); + + EXPECT_EQ(in_to_outs.size(), 3ul); + std::unordered_map<std::string, std::string> expects = { + {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"}, + }; + EXPECT_TRUE(expects == in_to_outs); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 914bcce775..07c2c970d4 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -65,6 +65,7 @@ pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add_fuse_pass inference) pass_library(conv_affine_channel_fuse_pass inference) pass_library(transpose_flatten_concat_fuse_pass inference) +pass_library(identity_scale_op_clean_pass base) # There may be many transpose-flatten structures in a model, and the output of # these structures will be used as inputs to the concat Op. This pattern will diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 3eb5bdba3b..4b5c846f32 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -76,7 +76,7 @@ std::map<std::string, std::vector<ir::Node *>> Graph::InitFromProgram( var->inputs.push_back(node); } } - return std::move(var_nodes); + return var_nodes; } void Graph::ResolveHazard( diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 8bb3c27bdd..feb3330176 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -141,7 +141,8 @@ class Graph { ir::Node *CreateControlDepVar() { // TODO(panyx0718): control var name should be really unique. 
const std::string name = string::Sprintf( - "%s@%llu", ir::Node::kControlDepVarName, node_set_.size()); + "%s@%llu", static_cast<const char *>(ir::Node::kControlDepVarName), + num_node_created_); auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable)); x->SetId(num_node_created_++); return x; diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 8de93cf285..22d4c0a91c 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -52,16 +52,29 @@ bool HasCircleHelper( ir::Node *node, const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list, std::unordered_set<ir::Node *> *visited, - std::unordered_set<ir::Node *> *in_trace) { + std::unordered_set<ir::Node *> *in_trace, + std::vector<std::vector<ir::Node *>> *circles) { if (visited->find(node) == visited->end()) { visited->insert(node); in_trace->insert(node); for (ir::Node *in : adj_list.at(node)) { if (visited->find(in) == visited->end() && - HasCircleHelper(in, adj_list, visited, in_trace)) { + HasCircleHelper(in, adj_list, visited, in_trace, circles)) { return true; } else if (in_trace->find(in) != in_trace->end()) { + if (circles != nullptr) { + std::vector<ir::Node *> circle; + circle.emplace_back(in); + ir::Node *p = in; + for (auto &adj : adj_list.at(p)) { + if (in_trace->count(adj)) { + circle.emplace_back(adj); + p = adj; + } + } + circles->emplace_back(circle); + } return true; } } @@ -71,11 +84,12 @@ bool HasCircleHelper( } bool HasCircleInternal( - const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list) { + const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list, + std::vector<std::vector<ir::Node *>> *circles) { std::unordered_set<ir::Node *> visited; std::unordered_set<ir::Node *> in_trace; for (auto &adj : adj_list) { - if (HasCircleHelper(adj.first, adj_list, &visited, &in_trace)) { + if (HasCircleHelper(adj.first, adj_list, &visited, &in_trace, circles)) { return true; } } @@ -84,13 +98,18 @@ bool HasCircleInternal( } // namespace bool HasCircle(const Graph &graph) { - return HasCircleInternal(BuildOperationAdjList(graph)); + return HasCircleInternal(BuildOperationAdjList(graph), nullptr); +} + +bool FindCircleSubGraph(const Graph &graph, + std::vector<std::vector<ir::Node *>> *circles) { + return HasCircleInternal(BuildOperationAdjList(graph), circles); } std::vector<ir::Node *> TopologySortOperations(const Graph &graph) { std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list = BuildOperationAdjList(graph); - PADDLE_ENFORCE(!HasCircleInternal(adj_list)); + PADDLE_ENFORCE(!HasCircleInternal(adj_list, nullptr)); std::unordered_set<ir::Node *> visited; std::vector<ir::Node *> ret; for (auto adj : adj_list) { diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index fba4936f2c..214de9ec7d 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -28,6 +28,11 @@ namespace ir { // Test if the graph contains circle. bool HasCircle(const Graph &graph); +// Find All Circles for debugging, +// store all subgraph in circles. +bool FindCircleSubGraph(const Graph &graph, + std::vector<std::vector<ir::Node *>> *circles); + size_t GraphNum(const Graph &graph); // Topology Sort the operations in the graph from inputs to outputs. 
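`FindCircleSubGraph` above reuses the `HasCircleHelper` DFS: `visited` marks nodes whose whole subtree has been explored, `in_trace` marks the current DFS stack, and an edge pointing back into `in_trace` is a cycle; when `circles` is non-null, the new code additionally walks the in-trace neighbours to record an approximate member list for each cycle found. A minimal sketch of that bookkeeping on a toy adjacency map, with plain `int` nodes instead of `ir::Node*` (illustrative only):

```cpp
#include <iostream>
#include <map>
#include <set>

// visited: fully explored nodes; in_trace: the current DFS stack.
static bool HasCircle(int u, const std::map<int, std::set<int>>& adj,
                      std::set<int>* visited, std::set<int>* in_trace) {
  visited->insert(u);
  in_trace->insert(u);
  for (int v : adj.at(u)) {
    if (!visited->count(v) && HasCircle(v, adj, visited, in_trace)) {
      return true;
    } else if (in_trace->count(v)) {
      return true;  // back edge into the DFS stack -> circle
    }
  }
  in_trace->erase(u);
  return false;
}

int main() {
  std::map<int, std::set<int>> adj = {{0, {1}}, {1, {2}}, {2, {0}}};
  std::set<int> visited, in_trace;
  std::cout << (HasCircle(0, adj, &visited, &in_trace) ? "circle" : "acyclic")
            << "\n";
  return 0;
}
```

For the 0 -> 1 -> 2 -> 0 graph this prints `circle`, which is the same shape that `TEST(GraphHelperTest, Circles)` below builds with `BuildCircleGraph` and expects `FindCircleSubGraph` to report exactly once.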
diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc index 260a73ae76..d8973d5aed 100644 --- a/paddle/fluid/framework/ir/graph_helper_test.cc +++ b/paddle/fluid/framework/ir/graph_helper_test.cc @@ -195,6 +195,17 @@ void BuildTwoGraphs(Graph* g) { // v4->outputs.push_back(o5); } +TEST(GraphHelperTest, Circles) { + ProgramDesc prog; + + Graph g(prog); + BuildCircleGraph(&g); + + std::vector<std::vector<ir::Node*>> circles; + ASSERT_TRUE(FindCircleSubGraph(g, &circles)); + ASSERT_EQ(circles.size(), 1UL); +} + TEST(GraphHelperTest, GraphNum) { ProgramDesc prog; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 6282ced1e4..9ea0729e1f 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -117,11 +117,6 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) { // return false; } } - for (auto &item : pdnodes2nodes_) { - for (auto &n : item.second) { - GetMarkedNodes(const_cast<Graph *>(&graph)).insert(n); - } - } VLOG(3) << pdnodes2nodes_.size() << " nodes marked"; return !pdnodes2nodes_.empty(); diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc new file mode 100644 index 0000000000..3b738aa159 --- /dev/null +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/identity_scale_op_clean_pass.h" +#include <string> +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl( + std::unique_ptr<ir::Graph> graph) const { + FusePassBase::Init("identity_scale_op_clean", graph.get()); + + // pre_op -> scale_in -> scale_op -> scale_out + // -> + // pre_op -> scale_out + GraphPatternDetector detector; + auto pre_op = detector.mutable_pattern()->NewNode("pre_op")->assert_is_op(); + auto scale_in = detector.mutable_pattern() + ->NewNode("scale_in") + ->assert_is_op_input("scale") + ->AsIntermediate(); + auto scale_op = detector.mutable_pattern() + ->NewNode("scale_fuse") + ->assert_is_op("scale") + ->assert_op_attr<float>("scale", 1.) 
+ ->assert_op_attr<float>("bias", 0.); + auto scale_out = detector.mutable_pattern() + ->NewNode("scale_out") + ->assert_is_op_output("scale"); + + pre_op->LinksTo({scale_in}); + scale_op->LinksFrom({scale_in}).LinksTo({scale_out}); + + GraphPatternDetector::handle_t handler = [&]( + const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + Node* scale_op_var = subgraph.at(scale_op); + Node* scale_in_var = subgraph.at(scale_in); + Node* scale_out_var = subgraph.at(scale_out); + Node* pre_op_var = subgraph.at(pre_op); + // Link pre_op directly to scale_out + const std::string scale_in_name = scale_in_var->Name(); + const std::string scale_out_name = scale_out_var->Name(); + // Remove links in graph + GraphSafeRemoveNodes(graph, {scale_in_var, scale_op_var}); + // Modify proto message + auto* pre_op_desc = pre_op_var->Op(); + for (auto& parameter : *pre_op_desc->Proto()->mutable_outputs()) { + auto* arguments = parameter.mutable_arguments(); + auto it = std::find(arguments->begin(), arguments->end(), scale_in_name); + PADDLE_ENFORCE(it != arguments->end()); + *it = scale_out_name; + } + + IR_NODE_LINK_TO(pre_op_var, scale_out_var); + }; + + detector(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(identity_scale_op_clean_pass, + paddle::framework::ir::IdentityScaleOpCleanPass); diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.h b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h similarity index 62% rename from paddle/fluid/framework/details/memory_early_delete_pass.h rename to paddle/fluid/framework/ir/identity_scale_op_clean_pass.h index 8215aa1b2b..50a654d82f 100644 --- a/paddle/fluid/framework/details/memory_early_delete_pass.h +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,20 +13,21 @@ // limitations under the License. 
#pragma once -#include "paddle/fluid/framework/details/early_delete_op_handle.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" namespace paddle { namespace framework { -namespace details { +namespace ir { -class MemoryEarlyDeletePass : public ir::Pass { +class IdentityScaleOpCleanPass : public FusePassBase { protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; + + private: + virtual ~IdentityScaleOpCleanPass() = default; }; -} // namespace details +} // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc index 7713ed1eab..6607c026a7 100644 --- a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc +++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc @@ -37,6 +37,7 @@ class InferCleanGraphPass : public FusePassBase { std::unordered_set<const Node*> invalid_nodes; int valid_op = 0; for (auto* node : graph->Nodes()) { + PADDLE_ENFORCE_NOT_NULL(node); if (is_valid_node(node)) { invalid_nodes.insert(node); } else if (node->IsOp()) { diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc index 456a03192c..35d1d5129b 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc @@ -164,7 +164,7 @@ ProgramDesc BuildProgramDesc(int num_inputs_of_concat) { }; std::vector<std::string> concat_inputs; for (int i = 0; i < num_inputs_of_concat; ++i) { - std::string prefix = "seqpool_op_" + i; + std::string prefix = "seqpool_op_" + std::to_string(i); new_var(prefix + "in"); new_var(prefix + "out"); new_var(prefix + "out_unused"); diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index 19e5c2c73e..4b55bd0703 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -38,6 +38,7 @@ struct OpInfo { OpAttrChecker* checker_{nullptr}; InferVarTypeFN infer_var_type_; InferShapeFN infer_shape_; + InferInplaceOpFN infer_inplace_; bool HasOpProtoAndChecker() const { return proto_ != nullptr && checker_ != nullptr; diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 9d6c10ab9e..b22523e0f4 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -188,14 +188,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { VLOG(3) << place << " " << DebugStringEx(&scope); } catch (platform::EnforceNotMet exception) { if (Attrs().count("sub_block") != 0) { - throw exception; + throw; } auto& callstack = Attr<std::vector<std::string>>( OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); if (callstack.empty()) { - throw exception; + throw; } std::ostringstream sout; sout << "Invoke operator " << Type() << " error.\n"; @@ -206,7 +206,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { sout << "C++ Callstacks: \n"; sout << exception.err_str_; exception.err_str_ = sout.str(); - throw exception; + throw; } catch (...) 
{ std::rethrow_exception(std::current_exception()); } @@ -589,7 +589,7 @@ class RuntimeInferShapeContext : public InferShapeContext { public: RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope, const RuntimeContext& ctx) - : op_(op), scope_(scope), ctx_(ctx) {} + : op_(op), ctx_(ctx) {} bool HasInput(const std::string& name) const override { // has only one input @@ -881,7 +881,6 @@ class RuntimeInferShapeContext : public InferShapeContext { } const OperatorBase& op_; - const Scope& scope_; const RuntimeContext& ctx_; }; diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 40d935a5ff..e33214b44b 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -222,12 +222,7 @@ class ExecutionContext { if (it == ctx_.inputs.end()) { return {}; } - std::vector<const Variable*> res; - res.reserve(it->second.size()); - std::transform(it->second.begin(), it->second.end(), - std::back_inserter(res), - [this](Variable* var) { return var; }); - return res; + return {it->second.begin(), it->second.end()}; } std::vector<Variable*> MultiOutputVar(const std::string& name) const { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f61c9e3a91..ff7ef0cce2 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -171,14 +171,6 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts( eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); graph = eager_deletion_pass->Apply(std::move(graph)); VLOG(10) << "EagerDeletionPass Applied"; - - if (build_strategy_.memory_early_delete_) { - auto early_delete_pass = - ir::PassRegistry::Instance().Get("memory_early_delete_pass"); - early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_); - graph = early_delete_pass->Apply(std::move(graph)); - } - VLOG(10) << "MemoryEarlyDeletePass Applied."; } return graph; @@ -288,6 +280,8 @@ ParallelExecutor::ParallelExecutor( graphs.push_back(std::move(graph)); #endif auto max_memory_size = GetEagerDeletionThreshold(); + VLOG(10) << "Eager Deletion Threshold " + << static_cast<float>(max_memory_size) / (1 << 30); if (max_memory_size >= 0) { for (size_t i = 0; i < graphs.size(); ++i) { graphs[i] = member_->PrepareGCAndRefCnts( @@ -506,6 +500,5 @@ ParallelExecutor::~ParallelExecutor() { } // namespace framework } // namespace paddle -USE_PASS(memory_early_delete_pass); USE_PASS(reference_count_pass); USE_PASS(eager_deletion_pass); diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 9536185609..87f0f307d3 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -22,11 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/string/printf.h" -DEFINE_bool(benchmark, false, - "Doing memory benchmark. It will make deleting scope synchronized, " - "and add some memory usage logs." 
- "Default cuda is asynchronous device, set to True will" - "force op run in synchronous mode."); +DECLARE_bool(benchmark); DEFINE_bool( eager_delete_scope, true, diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 938e2024c3..d02c699b97 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -57,5 +57,8 @@ using InferVarTypeFN = using InferShapeFN = std::function<void(InferShapeContext*)>; +using InplacePair = std::unordered_map<std::string, std::string>; +using InferInplaceOpFN = std::function<InplacePair(const OpDesc&, BlockDesc*)>; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 5db4221199..ec8dedd605 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,5 +1,5 @@ if(WITH_PYTHON) -cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas) -cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context) +cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind) +cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind) cc_library(engine SRCS engine.cc) endif() diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 11484a6473..157862016e 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -58,12 +58,13 @@ if(WIN32) sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) - target_link_libraries(paddle_fluid_shared shlwapi) else(WIN32) cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) endif() +get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) +target_link_libraries(paddle_fluid_shared ${os_dependency_modules}) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) if(NOT APPLE AND NOT WIN32) diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index fe3c841186..8d5ee36ae6 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -83,7 +83,6 @@ void IRPassManager::CreatePasses(Argument *argument, new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir))); } - // graph_ = pass->Apply(std::move(graph_)); pre_pass = pass_name; passes_.emplace_back(std::move(pass)); @@ -97,11 +96,12 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) { PADDLE_ENFORCE(graph.get()); // Apply all the passes for (const auto &pass : passes_) { - if (pass->Type() == "graph_viz_pass") continue; - PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type()); + if (pass->Type() != "graph_viz_pass") { + PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type()); + } graph = pass->Apply(std::move(graph)); } - return std::move(graph); + return graph; } framework::proto::ProgramDesc IRPassManager::AcquireProgram( diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index eb6e1768a2..410a90132a 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -1,4 +1,7 @@ 
cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) +if(WITH_TESTING) + add_dependencies(subgraph_detector gtest) +endif() if (WITH_GPU AND TENSORRT_FOUND) cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller) diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 3d1be9196f..4b0a9d9b1c 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -18,6 +18,7 @@ #include <limits> #include <map> #include <string> +#include <type_traits> #include <utility> #include <vector> #include "paddle/fluid/framework/ir/graph_helper.h" @@ -168,7 +169,11 @@ bool FindSuitableTensorToReuse( if (!cluster->count(candidate)) continue; size_t space = space_table.at(candidate); - size_t space_diff = std::abs<size_t>(space - space_required); + PADDLE_ENFORCE( + space <= std::numeric_limits<std::make_signed<size_t>::type>::max(), + "space overload"); + size_t space_diff = + std::abs((std::make_signed<size_t>::type)space - space_required); if (space_diff < best_fit.second) { best_fit.first = candidate; best_fit.second = space_diff; diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index ad0af4005a..85755fc471 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -52,8 +52,8 @@ cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI # compile the libinference_anakin_api.a and anakin.so. - cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml zero_copy_tensor_dummy) - cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber zero_copy_tensor_dummy) + cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml zero_copy_tensor_dummy device_context) + cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber zero_copy_tensor_dummy device_context) function(anakin_target target_name) target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) endfunction() diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index eecab238a8..e92273b4dd 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -318,4 +318,9 @@ NativeConfig AnalysisConfig::ToNativeConfig() const { return config; } +void AnalysisConfig::SwitchIrDebug(int x) { + ir_debug_ = x; + Update(); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 14d6ba8c56..712e010db4 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -58,7 +58,8 @@ namespace { bool IsPersistable(const framework::VarDesc *var) { if (var->Persistable() && var->GetType() != framework::proto::VarType::FEED_MINIBATCH && - var->GetType() != framework::proto::VarType::FETCH_LIST) { + var->GetType() != framework::proto::VarType::FETCH_LIST && + var->GetType() != framework::proto::VarType::RAW) { return true; } return false; @@ -420,7 +421,7 @@ std::unique_ptr<PaddlePredictor> 
CreatePaddlePredictor< if (!dynamic_cast<AnalysisPredictor *>(predictor.get())->Init(nullptr)) { return nullptr; } - return std::move(predictor); + return predictor; } void AnalysisPredictor::PrepareFeedFetch() { diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 6d11b46108..002ba90e40 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -196,7 +196,7 @@ TEST(AnalysisPredictor, memory_optim) { AnalysisConfig config(FLAGS_dirname); config.DisableGpu(); config.EnableMemoryOptim(true); - config.pass_builder()->TurnOnDebug(); + config.SwitchIrDebug(); auto native_predictor = CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig()); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 9d9ed6a39d..47361b3279 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -140,9 +140,12 @@ struct AnalysisConfig { */ bool tensorrt_engine_enabled() const { return use_tensorrt_; } - /** Control whther to debug IR graph analysis phase. + /** \brief Control whether to debug IR graph analysis phase. + * + * This will generate DOT files for visualizing the computation graph after + * each analysis pass applied. */ - void SwitchIrDebug(int x = true) { ir_debug_ = x; } + void SwitchIrDebug(int x = true); /** Turn on MKLDNN. */ diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 391932a1ee..aa353f12ca 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -117,6 +117,7 @@ class CpuPassStrategy : public PassStrategy { "conv_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", // "is_test_pass", // + "identity_scale_op_clean_pass", // }); use_gpu_ = false; } @@ -155,6 +156,7 @@ class GpuPassStrategy : public PassStrategy { GpuPassStrategy() : PassStrategy({}) { passes_.assign({ "infer_clean_graph_pass", // + "identity_scale_op_clean_pass", // "conv_affine_channel_fuse_pass", // "conv_eltwiseadd_affine_channel_fuse_pass", // "conv_bn_fuse_pass", // diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index 8be2a6d79b..dd953e0dcc 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -142,7 +142,7 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) { cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); cfg->DisableGpu(); cfg->SwitchSpecifyInputNames(); - cfg->pass_builder()->TurnOnDebug(); + cfg->SwitchIrDebug(); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); if (use_mkldnn) { cfg->EnableMKLDNN(); diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 2db297e200..2003be8201 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -69,7 +69,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { TEST(Analyzer_Text_Classification, profile) { AnalysisConfig cfg; SetConfig(&cfg); - cfg.pass_builder()->TurnOnDebug(); + cfg.SwitchIrDebug(); 
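
Reviewer note on the `SwitchIrDebug` change above: moving the setter out of line lets it call `Update()`, so toggling IR debug now rebuilds the pass list instead of silently flipping a flag. A minimal usage sketch, assuming the C++ inference API at this revision (`"./my_model"` is a placeholder model directory, not a path from this patch):

```cpp
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config("./my_model");  // hypothetical model path
  config.DisableGpu();
  // With this patch, SwitchIrDebug() also triggers Update(), so every IR pass
  // in the active pass strategy dumps a DOT file of the graph it produces.
  config.SwitchIrDebug();
  auto predictor =
      paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(config);
  return predictor != nullptr ? 0 : 1;
}
```

The testers migrated here (`analyzer_seq_pool1`, `Analyzer_Text_Classification`) switch from `pass_builder()->TurnOnDebug()` to this entry point for the same reason: the config-level switch keeps `ir_debug_` and the pass list in sync.
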
std::vector<PaddleTensor> outputs; std::vector<std::vector<PaddleTensor>> input_slots_all; diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc b/paddle/fluid/inference/utils/benchmark_tester.cc index 80763160df..0c48c2db9b 100644 --- a/paddle/fluid/inference/utils/benchmark_tester.cc +++ b/paddle/fluid/inference/utils/benchmark_tester.cc @@ -34,6 +34,6 @@ TEST(Benchmark, PersistToFile) { benchmark.SetLatency(220); benchmark.PersistToFile("1.log"); - benchmark.PersistToFile("1.log"); - benchmark.PersistToFile("1.log"); + benchmark.PersistToFile("2.log"); + benchmark.PersistToFile("3.log"); } diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 794d729bdc..ea0b729dc6 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -83,7 +83,7 @@ class ChunkedAllocator : public Allocator { VLOG(1) << "Create AutoIncrementAllocator with chunk_size " << max_chunk_size_ << " and capacity " << capacity; default_allocator_ = std::make_shared<AutoIncrementAllocator>( - [this] { return std::move(CreateAllocatorWithChunk()); }, capacity); + [this] { return CreateAllocatorWithChunk(); }, capacity); } } diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 8759ec8096..e983ae327d 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -36,6 +36,7 @@ DEFINE_bool(init_allocated_mem, false, "that initializing the allocated memory with a small value " "during unit testing."); DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_bool(benchmark); namespace paddle { namespace memory { @@ -59,11 +60,6 @@ size_t memory_usage(const platform::Place &p); using BuddyAllocator = detail::BuddyAllocator; -std::unordered_map</*device id*/ int, - std::pair</*current memory usage*/ uint64_t, - /*peak memory usage*/ uint64_t>> - gpu_mem_info; - BuddyAllocator *GetCPUBuddyAllocator() { // We tried thread_local for inference::RNN1 model, but that not works much // for multi-thread test. 
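
Reviewer note: `--benchmark` is now read from several translation units (`scope.cc` earlier in this patch, `legacy_allocator.cc` here), so each consumer uses `DECLARE_bool` while exactly one translation unit keeps the `DEFINE_bool`. A minimal sketch of that gflags convention; the file names are illustrative, not paths from this patch:

```cpp
// flags.cc -- the single translation unit that owns the flag definition.
#include <gflags/gflags.h>
DEFINE_bool(benchmark, false, "Enable memory benchmark mode.");

// consumer.cc -- every other user only declares the flag.
#include <gflags/gflags.h>
DECLARE_bool(benchmark);

void MaybeTrack(int device, size_t size) {
  if (FLAGS_benchmark) {
    // e.g. forward to a monitor such as allocation::GPUMemMonitor.Add(...)
  }
}
```

Defining the same gflag in two translation units fails at link or startup time, which is why the old `DEFINE_bool(benchmark, ...)` in `scope.cc` had to become a `DECLARE_bool` once the allocator started consuming it too.
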
@@ -144,6 +140,8 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
     devices = platform::GetSelectedDevices();
     int gpu_num = devices.size();

+    allocation::GPUMemMonitor.Initialize(devices.size());
+
     a_arr = new BuddyAllocator *[gpu_num];
     for (size_t i = 0; i < devices.size(); ++i) {
       int dev_id = devices[i];
@@ -190,25 +188,19 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
     platform::SetDeviceId(place.device);
     size_t avail, total;
     platform::GpuMemoryUsage(&avail, &total);
-    LOG(WARNING) << "Cannot allocate " << string::HumanReadableSize(size)
-                 << " in GPU " << place.device << ", available "
-                 << string::HumanReadableSize(avail);
-    LOG(WARNING) << "total " << total;
-    LOG(WARNING) << "GpuMinChunkSize "
-                 << string::HumanReadableSize(
-                        buddy_allocator->GetMinChunkSize());
-    LOG(WARNING) << "GpuMaxChunkSize "
-                 << string::HumanReadableSize(
-                        buddy_allocator->GetMaxChunkSize());
-    LOG(WARNING) << "GPU memory used: "
-                 << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
+    LOG(FATAL) << "Cannot allocate " << string::HumanReadableSize(size)
+               << " in GPU " << place.device << ", available "
+               << string::HumanReadableSize(avail) << ", total " << total
+               << ", GpuMinChunkSize "
+               << string::HumanReadableSize(buddy_allocator->GetMinChunkSize())
+               << ", GpuMaxChunkSize "
+               << string::HumanReadableSize(buddy_allocator->GetMaxChunkSize())
+               << ", GPU memory used: "
+               << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
     platform::SetDeviceId(cur_dev);
   } else {
-    gpu_mem_info[place.device].first += size;
-    if (gpu_mem_info[place.device].first > gpu_mem_info[place.device].second) {
-      gpu_mem_info[place.device].second = gpu_mem_info[place.device].first;
-      VLOG(3) << "device: " << place.device << " peak memory usage : "
-              << (gpu_mem_info[place.device].second >> 20) << " MiB";
+    if (FLAGS_benchmark) {
+      allocation::GPUMemMonitor.Add(place.device, size);
     }
     if (FLAGS_init_allocated_mem) {
       cudaMemset(ptr, 0xEF, size);
@@ -225,7 +217,9 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
                                size_t size) {
 #ifdef PADDLE_WITH_CUDA
   GetGPUBuddyAllocator(place.device)->Free(p);
-  gpu_mem_info[place.device].first -= size;
+  if (FLAGS_benchmark) {
+    allocation::GPUMemMonitor.Minus(place.device, size);
+  }
 #else
   PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif
@@ -264,7 +258,7 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
   void *ptr = buddy_allocator->Alloc(size);

   if (ptr == nullptr) {
-    LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
+    LOG(WARNING) << "cudaHostAlloc cannot allocate " << size
                  << " bytes in CUDAPinnedPlace";
   }
   if (FLAGS_init_allocated_mem) {
@@ -335,6 +329,8 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {

 namespace allocation {

+LegacyMemMonitor GPUMemMonitor;
+
 Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_);
   return new Allocation(ptr, size, place_);
@@ -346,6 +342,63 @@ void LegacyAllocator::Free(Allocation *allocation) {
                     allocation->place());
   delete allocation;
 }
+
+bool MemInfo::Add(const size_t &size) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  usage_ += size;
+  bool peak_point = usage_ > peak_usage_;
+  if (peak_point) peak_usage_ = usage_;
+  return peak_point;
+}
+
+void MemInfo::Minus(const size_t &size) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  usage_ -= size;
+}
+
+uint64_t MemInfo::GetPeakUsage() { return peak_usage_; }
+
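
Reviewer note: `MemInfo::Add` returns true only when the new usage establishes a high-water mark, and that return value is what gates the `VLOG(3)` in `LegacyMemMonitor::Add` just below. A small self-contained check of that contract (hypothetical test code, not part of this patch):

```cpp
#include <cassert>
#include "paddle/fluid/memory/allocation/legacy_allocator.h"

void CheckPeakContract() {
  paddle::memory::allocation::MemInfo info;
  assert(info.Add(100));   // usage 100 -> first peak, returns true
  info.Minus(50);          // usage 50, peak stays at 100
  assert(!info.Add(30));   // usage 80, still below the recorded peak
  assert(info.Add(40));    // usage 120 -> new peak, returns true
}
```

The mutex inside `MemInfo` is what the deleted global `gpu_mem_info` map lacked: `Alloc`/`Free` can run from multiple threads, so the unguarded `first`/`second` updates this replaces were racy.
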
+LegacyMemMonitor::~LegacyMemMonitor() {
+  for (auto &item : gpu_mem_info_) delete item.second;
+}
+
+void LegacyMemMonitor::Initialize(const int &device_num) {
+  for (auto i = 0; i < device_num; ++i) {
+    gpu_mem_info_[i] = new MemInfo();
+  }
+}
+
+void LegacyMemMonitor::Add(const int &device, const size_t &size) {
+  if (gpu_mem_info_[device]->Add(size)) {
+    VLOG(3) << "#LegacyMemMonitor# device: " << device
+            << " peak memory usage : "
+            << (gpu_mem_info_[device]->GetPeakUsage() >> 20) << " MiB";
+  }
+}
+
+void LegacyMemMonitor::Minus(const int &device, const size_t &size) {
+  gpu_mem_info_[device]->Minus(size);
+}
+
+uint64_t LegacyMemMonitor::GetMemUsage(const int &device) {
+  return gpu_mem_info_.find(device) == gpu_mem_info_.end()
+             ? 0
+             : gpu_mem_info_[device]->GetPeakUsage();
+}
+
+void LegacyMemMonitor::PrintMemUsage() {
+  std::vector<int> devices;
+  for (const auto &item : gpu_mem_info_) {
+    devices.emplace_back(item.first);
+  }
+  std::sort(devices.begin(), devices.end());
+  for (const auto &device : devices) {
+    std::cout << "Device : " << device << " Peak Memory Usage : "
+              << (gpu_mem_info_[device]->GetPeakUsage() >> 20) << " MiB"
+              << std::endl;
+  }
+}
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.h b/paddle/fluid/memory/allocation/legacy_allocator.h
index 503a7a685c..ccbc8c70d8 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.h
+++ b/paddle/fluid/memory/allocation/legacy_allocator.h
@@ -13,12 +13,59 @@
 // limitations under the License.

 #pragma once
+#include <algorithm>
+#include <mutex>  // NOLINT
+#include <unordered_map>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/place.h"

 namespace paddle {
 namespace memory {
 namespace allocation {
+class MemInfo {
+ public:
+  MemInfo() : usage_(0), peak_usage_(0) {}
+  MemInfo(const MemInfo &) = delete;
+  MemInfo &operator=(const MemInfo &) = delete;
+
+  // Returns true if this operation creates a new peak memory usage point.
+  bool Add(const size_t &);
+  void Minus(const size_t &);
+
+  uint64_t GetPeakUsage();
+
+ private:
+  /* current memory usage */
+  uint64_t usage_;
+  uint64_t peak_usage_;
+  std::mutex mutex_;
+};
+
+class LegacyMemMonitor {
+ public:
+  // used to store the GPU memory usage of each device
+  using MemUsage = std::unordered_map</*device id*/ int,
+                                      /*mem usage info node*/ MemInfo *>;
+
+  MemUsage GetMemUsageInfo() { return gpu_mem_info_; }
+  ~LegacyMemMonitor();
+
+  void Initialize(const int &);
+  void Add(const int &, const size_t &);
+  void Minus(const int &, const size_t &);
+
+  uint64_t GetMemUsage(const int &);
+
+  void PrintMemUsage();
+
+ protected:
+  MemUsage gpu_mem_info_;
+};
+
+extern LegacyMemMonitor GPUMemMonitor;
+
 class LegacyAllocatorPrivate;
 class LegacyAllocator : public Allocator {
  public:
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc
index 6ac3aefdd1..de81d12cca 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ b/paddle/fluid/memory/allocation/pinned_allocator.cc
@@ -32,7 +32,7 @@ Allocation *CPUPinnedAllocator::AllocateImpl(size_t size,
   //                  "CPUPinnedAllocator should be used for Cross-Device Communication");

   void *ptr;
-  PADDLE_ENFORCE(cudaMallocHost(&ptr, size));
+  PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
   return new CPUPinnedAllocation(ptr, size);
 }
 }  // namespace allocation
diff --git 
a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h index 26d12dd91c..42d0938f2a 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.h +++ b/paddle/fluid/memory/allocation/pinned_allocator.h @@ -19,7 +19,7 @@ namespace paddle { namespace memory { namespace allocation { -// Allocator uses `cudaMallocHost` +// Allocator uses `cudaHostAlloc` class CPUPinnedAllocation : public Allocation { public: CPUPinnedAllocation(void *ptr, size_t size) diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 3e8fb83e9d..197d1c2f21 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -173,14 +173,14 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { void* p; // PINNED memory is visible to all CUDA contexts. - cudaError_t result = cudaMallocHost(&p, size); + cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable); if (result == cudaSuccess) { *index = 1; // PINNED memory cuda_pinnd_alloc_size_ += size; return p; } else { - LOG(WARNING) << "cudaMallocHost failed."; + LOG(WARNING) << "cudaHostAlloc failed."; return nullptr; } diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 7ec9d2fed5..65efe2966c 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -37,7 +37,7 @@ using paddle::framework::Tensor; "(bool, default false) Set to true for inference only, false " \ "for training. Some layers may run faster when this is true.") \ .SetDefault(false); \ - AddComment(#OP_COMMENT); \ + AddComment(OP_COMMENT); \ } \ } @@ -124,7 +124,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel { UNUSED constexpr char SigmoidDoc[] = R"DOC( Sigmoid Activation Operator -$$out = \frac{1}{1 + e^{-x}}$$ +$$out = \\frac{1}{1 + e^{-x}}$$ )DOC"; @@ -187,14 +187,14 @@ $out = |x|$ UNUSED constexpr char CeilDoc[] = R"DOC( Ceil Activation Operator. -$out = ceil(x)$ +$out = \left \lceil x \right \rceil$ )DOC"; UNUSED constexpr char FloorDoc[] = R"DOC( Floor Activation Operator. -$out = floor(x)$ +$out = \left \lfloor x \right \rfloor$ )DOC"; @@ -252,7 +252,7 @@ $out = \ln(1 + e^{x})$ UNUSED constexpr char SoftsignDoc[] = R"DOC( Softsign Activation Operator. 
-$$out = \frac{x}{1 + |x|}$$ +$$out = \\frac{x}{1 + \|x\|}$$ )DOC"; @@ -547,12 +547,14 @@ namespace ops = paddle::operators; __macro(Swish, swish); \ __macro(ThresholdedRelu, thresholded_relu); -#define REGISTER_INPLACE_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \ - REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \ - ::paddle::operators::OP_NAME##OpMaker, \ - ::paddle::operators::ActivationOpInferVarType, \ - ::paddle::operators::OP_NAME##GradMaker); \ - REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad) +#define REGISTER_INPLACE_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \ + REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \ + ::paddle::operators::OP_NAME##OpMaker, \ + ::paddle::operators::ActivationOpInferVarType, \ + ::paddle::operators::OP_NAME##GradMaker, \ + ::paddle::framework::SingleOpInplaceInToOut); \ + REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad, \ + ::paddle::framework::SingleOpInplaceInToOut) #define REGISTER_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \ REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \ diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 8b672e09b2..feac412538 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -589,8 +589,10 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker { op->SetInput("SavedVariance", Output("SavedVariance")); // used when setting use_global_stats True during training - op->SetInput("Mean", Output("MeanOut")); - op->SetInput("Variance", Output("VarianceOut")); + if (boost::get<bool>(GetAttr("use_global_stats"))) { + op->SetInput("Mean", Output("MeanOut")); + op->SetInput("Variance", Output("VarianceOut")); + } op->SetAttrMap(Attrs()); @@ -602,13 +604,48 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker { } }; +class BatchNormInplaceInToOut : public framework::InplaceInToOut { + public: + using InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map<std::string, std::string> Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map<std::string, std::string> inplace_in_to_out = { + {"Mean", "MeanOut"}, {"Variance", "VarianceOut"}, {"X", "Y"}, + }; + return inplace_in_to_out; + } +}; + +class BatchNormGradInplaceInToOut : public framework::InplaceInToOut { + public: + using InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map<std::string, std::string> Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map<std::string, std::string> inplace_in_to_out = { + // Scale, Bias, SavedMean, SavedVariance shape is [batch_size, C] + {framework::GradVarName("Y"), framework::GradVarName("X")}, + {"SavedMean", framework::GradVarName("Scale")}, + {"SavedVariance", framework::GradVarName("Bias")}, + }; + return inplace_in_to_out; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, - ops::BatchNormOpInferVarType, ops::BatchNormGradMaker); -REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp); + ops::BatchNormOpInferVarType, ops::BatchNormGradMaker, + ops::BatchNormInplaceInToOut); +REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp, + ops::BatchNormGradInplaceInToOut); REGISTER_OP_CPU_KERNEL( batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>, diff --git 
a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index cace42bc1b..f6fbe97565 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -31,6 +31,7 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc polygon_box_transform_op.cu) detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) +detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc) if(WITH_GPU) diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index b99edb5bf0..a7bc3e0272 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -99,5 +99,29 @@ void BboxOverlaps(const framework::Tensor& r_boxes, } } +template <class T> +void ClipTiledBoxes(const platform::DeviceContext& ctx, + const framework::Tensor& im_info, + const framework::Tensor& input_boxes, + framework::Tensor* out) { + T* out_data = out->mutable_data<T>(ctx.GetPlace()); + const T* im_info_data = im_info.data<T>(); + const T* input_boxes_data = input_boxes.data<T>(); + T zero(0); + T im_w = round(im_info_data[1] / im_info_data[2]); + T im_h = round(im_info_data[0] / im_info_data[2]); + for (int64_t i = 0; i < input_boxes.numel(); ++i) { + if (i % 4 == 0) { + out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero); + } else if (i % 4 == 1) { + out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero); + } else if (i % 4 == 2) { + out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero); + } else { + out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero); + } + } +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detection/box_clip_op.cc b/paddle/fluid/operators/detection/box_clip_op.cc new file mode 100644 index 0000000000..3aa766559a --- /dev/null +++ b/paddle/fluid/operators/detection/box_clip_op.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/fluid/operators/detection/box_clip_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class BoxClipOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of BoxClipOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ImInfo"),
+                   "Input(ImInfo) of BoxClipOp should not be null.");
+
+    auto input_box_dims = ctx->GetInputDim("Input");
+    auto im_info_dims = ctx->GetInputDim("ImInfo");
+
+    if (ctx->IsRuntime()) {
+      auto input_box_size = input_box_dims.size();
+      PADDLE_ENFORCE_EQ(input_box_dims[input_box_size - 1], 4,
+                        "The last dimension of Input must be 4");
+      PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
+                        "The rank of Input(ImInfo) in BoxClipOp must be 2");
+      PADDLE_ENFORCE_EQ(im_info_dims[1], 3,
+                        "The last dimension of ImInfo must be 3");
+    }
+    ctx->ShareDim("Input", /*->*/ "Output");
+    ctx->ShareLoD("Input", /*->*/ "Output");
+  }
+};
+
+class BoxClipOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Input",
+             "(LoDTensor) "
+             "Input is a LoDTensor with shape [..., 4] that holds 4 points "
+             "in the last dimension in format [xmin, ymin, xmax, ymax]");
+    AddInput("ImInfo",
+             "(Tensor) Information for image reshape is in shape (N, 3), "
+             "in format (height, width, im_scale)");
+    AddOutput("Output",
+              "(LoDTensor) "
+              "Output is a LoDTensor with the same shape as Input "
+              "and it is the result after clipping");
+    AddComment(R"DOC(
+This operator clips input boxes to original input images.
+
+For each input box, the formula is given as follows:
+
+       $$xmin = \max(\min(xmin, im_w - 1), 0)$$
+       $$ymin = \max(\min(ymin, im_h - 1), 0)$$
+       $$xmax = \max(\min(xmax, im_w - 1), 0)$$
+       $$ymax = \max(\min(ymax, im_h - 1), 0)$$
+
+where im_w and im_h are computed from ImInfo, the formula is given as follows:
+
+       $$im_w = round(width / im_scale)$$
+       $$im_h = round(height / im_scale)$$
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(box_clip, ops::BoxClipOp, ops::BoxClipOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    box_clip, ops::BoxClipKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BoxClipKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu
new file mode 100644
index 0000000000..b727da5f7b
--- /dev/null
+++ b/paddle/fluid/operators/detection/box_clip_op.cu
@@ -0,0 +1,74 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detection/box_clip_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+static constexpr int ImInfoSize = 3;
+
+template <typename T, int BlockSize>
+static __global__ void GPUBoxClip(const T *input, const size_t *lod,
+                                  const size_t width, const T *im_info,
+                                  T *output) {
+  T im_w = round(im_info[blockIdx.x * ImInfoSize + 1] /
+                 im_info[blockIdx.x * ImInfoSize + 2]);
+  T im_h = round(im_info[blockIdx.x * ImInfoSize] /
+                 im_info[blockIdx.x * ImInfoSize + 2]);
+  for (int i = threadIdx.x; i < (lod[blockIdx.x + 1] - lod[blockIdx.x]) * width;
+       i += BlockSize) {
+    int idx = lod[blockIdx.x] * width + i;
+    T im_size = (idx % 2 == 0) ? im_w : im_h;
+    output[idx] = max(min(input[idx], im_size - 1), T(0.));
+  }
+}
+
+template <typename DeviceContext, typename T>
+class GPUBoxClipKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto *input = context.Input<LoDTensor>("Input");
+    auto *im_info = context.Input<Tensor>("ImInfo");
+    auto *output = context.Output<LoDTensor>("Output");
+    const int64_t num = input->dims()[0];
+    const int64_t bbox_width = input->numel() / num;
+    auto lod = input->lod();
+    framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    auto stream = dev_ctx.stream();
+    const size_t batch_size = lod.back().size() - 1;
+    T *output_data = output->mutable_data<T>(dev_ctx.GetPlace());
+    GPUBoxClip<T, 512><<<batch_size, 512, 0, stream>>>(
+        input->data<T>(), abs_offset_lod[0].CUDAMutableData(dev_ctx.GetPlace()),
+        bbox_width, im_info->data<T>(), output_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    box_clip, ops::GPUBoxClipKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GPUBoxClipKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/detection/box_clip_op.h b/paddle/fluid/operators/detection/box_clip_op.h
new file mode 100644
index 0000000000..74e1f88f8d
--- /dev/null
+++ b/paddle/fluid/operators/detection/box_clip_op.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#pragma once +#include <string> +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/bbox_util.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template <typename DeviceContext, typename T> +class BoxClipKernel : public framework::OpKernel<T> { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input_box = context.Input<LoDTensor>("Input"); + auto* im_info = context.Input<LoDTensor>("ImInfo"); + auto* output_box = context.Output<LoDTensor>("Output"); + auto& dev_ctx = + context.template device_context<platform::CPUDeviceContext>(); + output_box->mutable_data<T>(context.GetPlace()); + if (input_box->lod().size()) { + PADDLE_ENFORCE_EQ(input_box->lod().size(), 1UL, + "Only support 1 level of LoD."); + } + auto box_lod = input_box->lod().back(); + int64_t n = static_cast<int64_t>(box_lod.size() - 1); + for (int i = 0; i < n; ++i) { + Tensor im_info_slice = im_info->Slice(i, i + 1); + Tensor box_slice = input_box->Slice(box_lod[i], box_lod[i + 1]); + Tensor output_slice = output_box->Slice(box_lod[i], box_lod[i + 1]); + ClipTiledBoxes<T>(dev_ctx, im_info_slice, box_slice, &output_slice); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc index fdcff62e1f..0a51d50e06 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -38,20 +38,12 @@ class BoxCoderOp : public framework::OperatorWithKernel { "The shape of PriorBox is [N, 4]"); if (ctx->HasInput("PriorBoxVar")) { auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); - PADDLE_ENFORCE( - prior_box_var_dims.size() == 1 || prior_box_var_dims.size() == 2, - "Input(PriorBoxVar) of BoxCoderOp should be 1 or 2."); - if (prior_box_var_dims.size() == 1) { - PADDLE_ENFORCE_EQ( - prior_box_var_dims[0], 4, - "The 1st dimension of Input(PriorBoxVar) should be 4" - "when the rank is 1."); - } else { - PADDLE_ENFORCE_EQ( - prior_box_dims, prior_box_var_dims, - "The dimension of Input(PriorBoxVar) should be equal to" - "the dimension of Input(PriorBox when the rank is 2.)"); - } + PADDLE_ENFORCE(prior_box_var_dims.size() == 2, + "Input(PriorBoxVar) of BoxCoderOp should be 2."); + PADDLE_ENFORCE_EQ( + prior_box_dims, prior_box_var_dims, + "The dimension of Input(PriorBoxVar) should be equal to" + "the dimension of Input(PriorBox) when the rank is 2."); } } diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu index e078af3eb4..19a5bb90fa 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cu +++ b/paddle/fluid/operators/detection/box_coder_op.cu @@ -56,10 +56,7 @@ __global__ void EncodeCenterSizeKernel( output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)); output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)); if (prior_box_var_data) { - int prior_var_offset = 0; - if (prior_box_var_size == 2) { - prior_var_offset = col_idx * len; - } + int prior_var_offset = col_idx * len; output[idx * len] /= prior_box_var_data[prior_var_offset]; output[idx * len + 1] /= prior_box_var_data[prior_var_offset + 1]; output[idx * len + 2] /= prior_box_var_data[prior_var_offset + 2]; @@ -99,10 +96,7 @@ __global__ void DecodeCenterSizeKernel( T box_var_x = T(1), 
box_var_y = T(1); T box_var_w = T(1), box_var_h = T(1); if (prior_box_var_data) { - int prior_var_offset = 0; - if (prior_box_var_size == 2) { - prior_var_offset = axis == 0 ? col_idx * len : row_idx * len; - } + int prior_var_offset = axis == 0 ? col_idx * len : row_idx * len; box_var_x = prior_box_var_data[prior_var_offset]; box_var_y = prior_box_var_data[prior_var_offset + 1]; box_var_w = prior_box_var_data[prior_var_offset + 2]; diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h index a0b1faf7bd..6d406f8196 100644 --- a/paddle/fluid/operators/detection/box_coder_op.h +++ b/paddle/fluid/operators/detection/box_coder_op.h @@ -79,10 +79,7 @@ class BoxCoderKernel : public framework::OpKernel<T> { output[offset + 3] = std::log(std::fabs(target_box_height / prior_box_height)); if (prior_box_var) { - int prior_var_offset = 0; - if (prior_box_var->dims().size() == 2) { - prior_var_offset = j * len; - } + int prior_var_offset = j * len; output[offset] /= prior_box_var_data[prior_var_offset]; output[offset + 1] /= prior_box_var_data[prior_var_offset + 1]; output[offset + 2] /= prior_box_var_data[prior_var_offset + 2]; @@ -95,11 +92,12 @@ class BoxCoderKernel : public framework::OpKernel<T> { } } } + template <int axis, int var_size> void DecodeCenterSize(const framework::Tensor* target_box, const framework::Tensor* prior_box, const framework::Tensor* prior_box_var, - const bool normalized, const int axis, - const std::vector<float> variance, T* output) const { + const bool normalized, std::vector<float> variance, + T* output) const { int64_t row = target_box->dims()[0]; int64_t col = target_box->dims()[1]; int64_t len = target_box->dims()[2]; @@ -107,19 +105,17 @@ class BoxCoderKernel : public framework::OpKernel<T> { auto* target_box_data = target_box->data<T>(); auto* prior_box_data = prior_box->data<T>(); const T* prior_box_var_data = nullptr; - if (prior_box_var) prior_box_var_data = prior_box_var->data<T>(); + if (var_size == 2) prior_box_var_data = prior_box_var->data<T>(); int prior_box_offset = 0; + T var_data[4] = {1., 1., 1., 1.}; + T* var_ptr = var_data; #ifdef PADDLE_WITH_MKLML #pragma omp parallel for collapse(2) #endif for (int64_t i = 0; i < row; ++i) { for (int64_t j = 0; j < col; ++j) { size_t offset = i * col * len + j * len; - if (axis == 0) { - prior_box_offset = j * len; - } else if (axis == 1) { - prior_box_offset = i * len; - } + prior_box_offset = axis == 0 ? j * len : i * len; T prior_box_width = prior_box_data[prior_box_offset + 2] - prior_box_data[prior_box_offset] + (normalized == false); @@ -133,26 +129,18 @@ class BoxCoderKernel : public framework::OpKernel<T> { T target_box_center_x = 0, target_box_center_y = 0; T target_box_width = 0, target_box_height = 0; - T box_var_x = T(1), box_var_y = T(1); - T box_var_w = T(1), box_var_h = T(1); - if (prior_box_var) { - int prior_var_offset = 0; - if (prior_box_var->dims().size() == 2) { - if (axis == 0) - prior_var_offset = j * len; - else if (axis == 1) - prior_var_offset = i * len; - } - box_var_x = prior_box_var_data[prior_var_offset]; - box_var_y = prior_box_var_data[prior_var_offset + 1]; - box_var_w = prior_box_var_data[prior_var_offset + 2]; - box_var_h = prior_box_var_data[prior_var_offset + 3]; - } else if (!(variance.empty())) { - box_var_x = static_cast<T>(variance[0]); - box_var_y = static_cast<T>(variance[1]); - box_var_w = static_cast<T>(variance[2]); - box_var_h = static_cast<T>(variance[3]); + int prior_var_offset = axis == 0 ? 
j * len : i * len;
+        if (var_size == 2) {
+          std::memcpy(var_ptr, prior_box_var_data + prior_var_offset,
+                      4 * sizeof(T));
+        } else if (var_size == 1) {
+          for (int k = 0; k < 4; ++k) var_data[k] = static_cast<T>(variance[k]);
         }
+        T box_var_x = *var_ptr;
+        T box_var_y = *(var_ptr + 1);
+        T box_var_w = *(var_ptr + 2);
+        T box_var_h = *(var_ptr + 3);
+
         target_box_center_x =
             box_var_x * target_box_data[offset] * prior_box_width +
             prior_box_center_x;
@@ -211,8 +199,31 @@ class BoxCoderKernel : public framework::OpKernel<T> {
       EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
                        variance, output);
     } else if (code_type == BoxCodeType::kDecodeCenterSize) {
-      DecodeCenterSize(target_box, prior_box, prior_box_var, normalized, axis,
-                       variance, output);
+      if (prior_box_var) {
+        if (axis == 0) {
+          DecodeCenterSize<0, 2>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        } else {
+          DecodeCenterSize<1, 2>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        }
+      } else if (!(variance.empty())) {
+        if (axis == 0) {
+          DecodeCenterSize<0, 1>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        } else {
+          DecodeCenterSize<1, 1>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        }
+      } else {
+        if (axis == 0) {
+          DecodeCenterSize<0, 0>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        } else {
+          DecodeCenterSize<1, 0>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        }
+      }
     }
   }
 };
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h
index ed2f5df80c..3591681fc3 100644
--- a/paddle/fluid/operators/detection/density_prior_box_op.h
+++ b/paddle/fluid/operators/detection/density_prior_box_op.h
@@ -52,6 +52,10 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
       step_height = step_h;
     }
     int num_priors = 0;
+
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for reduction(+ : num_priors)
+#endif
     for (size_t i = 0; i < densities.size(); ++i) {
       num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
     }
@@ -64,6 +68,17 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
     auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes).setConstant(0.0);
     int step_average = static_cast<int>((step_width + step_height) * 0.5);

+    std::vector<float> sqrt_fixed_ratios(fixed_ratios.size());
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+    for (int i = 0; i < static_cast<int>(fixed_ratios.size()); i++) {
+      sqrt_fixed_ratios[i] = sqrt(fixed_ratios[i]);
+    }
+
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(2)
+#endif
     for (int h = 0; h < feature_height; ++h) {
       for (int w = 0; w < feature_width; ++w) {
         T center_x = (w + offset) * step_width;
@@ -73,34 +88,25 @@
         for (size_t s = 0; s < fixed_sizes.size(); ++s) {
           auto fixed_size = fixed_sizes[s];
           int density = densities[s];
+          int shift = step_average / density;
           // Generate density prior boxes with fixed ratios.
           for (size_t r = 0; r < fixed_ratios.size(); ++r) {
-            float ar = fixed_ratios[r];
-            int shift = step_average / density;
-            float box_width_ratio = fixed_size * sqrt(ar);
-            float box_height_ratio = fixed_size / sqrt(ar);
+            float box_width_ratio = fixed_size * sqrt_fixed_ratios[r];
+            float box_height_ratio = fixed_size / sqrt_fixed_ratios[r];
+            float density_center_x = center_x - step_average / 2. + shift / 2.;
+            float density_center_y = center_y - step_average / 2.
+ shift / 2.; for (int di = 0; di < density; ++di) { for (int dj = 0; dj < density; ++dj) { - float center_x_temp = - center_x - step_average / 2. + shift / 2. + dj * shift; - float center_y_temp = - center_y - step_average / 2. + shift / 2. + di * shift; - e_boxes(h, w, idx, 0) = - (center_x_temp - box_width_ratio / 2.) / img_width >= 0 - ? (center_x_temp - box_width_ratio / 2.) / img_width - : 0; - e_boxes(h, w, idx, 1) = - (center_y_temp - box_height_ratio / 2.) / img_height >= 0 - ? (center_y_temp - box_height_ratio / 2.) / img_height - : 0; - e_boxes(h, w, idx, 2) = - (center_x_temp + box_width_ratio / 2.) / img_width <= 1 - ? (center_x_temp + box_width_ratio / 2.) / img_width - : 1; - e_boxes(h, w, idx, 3) = - (center_y_temp + box_height_ratio / 2.) / img_height <= 1 - ? (center_y_temp + box_height_ratio / 2.) / img_height - : 1; + float center_x_temp = density_center_x + dj * shift; + float center_y_temp = density_center_y + di * shift; + e_boxes(h, w, idx, 0) = std::max( + (center_x_temp - box_width_ratio / 2.) / img_width, 0.); + e_boxes(h, w, idx, 1) = std::max( + (center_y_temp - box_height_ratio / 2.) / img_height, 0.); + e_boxes(h, w, idx, 2) = std::min( + (center_x_temp + box_width_ratio / 2.) / img_width, 1.); + e_boxes(h, w, idx, 3) = std::min( + (center_y_temp + box_height_ratio / 2.) / img_height, 1.); idx++; } } @@ -131,8 +137,14 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> { vars->Resize({box_num, static_cast<int>(variances.size())}); auto e_vars = framework::EigenMatrix<T, Eigen::RowMajor>::From(*vars); - - e_vars = var_et.broadcast(Eigen::DSizes<int, 2>(box_num, 1)); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(2) +#endif + for (int i = 0; i < box_num; ++i) { + for (int j = 0; j < variances.size(); ++j) { + e_vars(i, j) = variances[j]; + } + } vars->Resize(var_dim); boxes->Resize(box_dim); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index 7e789cd8d9..c6c658236c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -18,6 +18,7 @@ namespace ops = paddle::operators; REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add); REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out", "X"); + REGISTER_OP_CPU_KERNEL( elementwise_add, ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>, diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index fd2a98cb45..91e4415265 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -250,6 +250,37 @@ class ElemwiseGradKernel : public framework::OpKernel<T> { } }; +class ElementwiseOpInplace : public framework::InplaceInToOut { + public: + using framework::InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map<std::string, std::string> Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + return std::unordered_map<std::string, std::string>{ + {"X", "Out"}, + }; + } +}; + +class ElementwiseGradOpInplace : public framework::InplaceInToOut { + public: + using framework::InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map<std::string, std::string> Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map<std::string, std::string> ret; + if 
(block->HasVar(framework::GradVarName("X")) && + block->HasVar(framework::GradVarName("Out"))) { + ret[framework::GradVarName("Out")] = framework::GradVarName("X"); + } + return ret; + } +}; + } // namespace operators } // namespace paddle @@ -299,6 +330,8 @@ class ElemwiseGradKernel : public framework::OpKernel<T> { REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp, \ __ElemwiseOp##op_type##Maker__, \ ::paddle::operators::ElementwiseOpInferVarType, \ - op_type##GradMaker); \ + op_type##GradMaker, \ + ::paddle::operators::ElementwiseOpInplace); \ REGISTER_OPERATOR(op_type##_grad, \ - ::paddle::operators::ElementwiseOpExplicitGrad) + ::paddle::operators::ElementwiseOpExplicitGrad, \ + ::paddle::operators::ElementwiseGradOpInplace) diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index 8e80dc0e64..bb904166c4 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -267,6 +267,35 @@ class Flatten2GradOp : public framework::OperatorBase { } }; +class FlattenOpInplaceInToOut : public framework::InplaceInToOut { + public: + using InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map<std::string, std::string> Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map<std::string, std::string> inplace_in_to_out = { + {"X", "Out"}, + }; + return inplace_in_to_out; + } +}; + +class FlattenGradInplaceinToOut : public framework::InplaceInToOut { + using InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map<std::string, std::string> Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map<std::string, std::string> inplace_in_to_out = { + {framework::GradVarName("Out"), framework::GradVarName("X")}, + }; + return inplace_in_to_out; + } +}; + } // namespace operators } // namespace paddle @@ -275,10 +304,13 @@ USE_OP(reshape); namespace ops = paddle::operators; REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker, ops::FlattenOpInferShape, - paddle::framework::DefaultGradOpDescMaker<true>); -REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape); + paddle::framework::DefaultGradOpDescMaker<true>, + ops::FlattenOpInplaceInToOut); +REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape, + ops::FlattenGradInplaceinToOut); REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker, - ops::Flatten2OpInferShape, ops::Flatten2GradOpMaker); + ops::Flatten2OpInferShape, ops::Flatten2GradOpMaker, + ops::FlattenOpInplaceInToOut); REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp, - ops::Flatten2GradInferShape); + ops::Flatten2GradInferShape, ops::FlattenGradInplaceinToOut); diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 1b9360afce..97ddf223ae 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -93,6 +93,7 @@ std::vector<int> TestSizes() { template <typename KernelTuples, typename... Args> struct BenchFunc { // return this function avg time + // TODO(TJ): clear cache every time double operator()(const typename KernelTuples::func_type tgt, Args... 
args) { for (int i = 0; i < FLAGS_burning; ++i) { tgt(args...); @@ -172,6 +173,9 @@ void BenchXYZNKernel() { RandomVec<T>(d, y_data); BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data<T>(), y.data<T>(), z_data, d); + // test inplace + BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data<T>(), z_data, + z_data, d); } } diff --git a/paddle/fluid/operators/jit/gen/act.h b/paddle/fluid/operators/jit/gen/act.h index 68e66f9298..1664dfa906 100644 --- a/paddle/fluid/operators/jit/gen/act.h +++ b/paddle/fluid/operators/jit/gen/act.h @@ -63,7 +63,6 @@ class VActFunc : public JitCode { public: explicit VActFunc(size_t code_size, void* code_ptr) : JitCode(code_size, code_ptr) {} - virtual const char* name() const = 0; virtual void genCode() = 0; protected: diff --git a/paddle/fluid/operators/jit/gen/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc index dee6c7b9d3..5da24c359e 100644 --- a/paddle/fluid/operators/jit/gen/blas.cc +++ b/paddle/fluid/operators/jit/gen/blas.cc @@ -155,7 +155,7 @@ class NCHW16CMulNCCreator : public JitCodeCreator<int> { class name##Creator : public JitCodeCreator<int> { \ public: \ bool UseMe(const int& attr) const override { \ - return platform::MayIUse(platform::avx); \ + return platform::MayIUse(platform::avx) && attr <= 1024; \ } \ size_t CodeSize(const int& d) const override { \ return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \ diff --git a/paddle/fluid/operators/jit/gen/blas.h b/paddle/fluid/operators/jit/gen/blas.h index de6b33f467..e991139266 100644 --- a/paddle/fluid/operators/jit/gen/blas.h +++ b/paddle/fluid/operators/jit/gen/blas.h @@ -41,7 +41,7 @@ class VXXJitCode : public JitCode { this->genCode(); } - virtual const char* name() const { + virtual const char* name() const override { std::string base = "VXXJitCode"; if (scalar_index_ == 1) { base += "_Scalar"; @@ -61,6 +61,7 @@ class VXXJitCode : public JitCode { base += "_Vec"; } base += (with_relu_ ? "_Relu" : ""); + base += "_D" + std::to_string(num_); return base.c_str(); } void genCode() override; diff --git a/paddle/fluid/operators/jit/gen/hopv.h b/paddle/fluid/operators/jit/gen/hopv.h index d3bc94b63d..c336fe73fe 100644 --- a/paddle/fluid/operators/jit/gen/hopv.h +++ b/paddle/fluid/operators/jit/gen/hopv.h @@ -35,7 +35,7 @@ class HOPVJitCode : public JitCode { this->genCode(); } - virtual const char* name() const { + virtual const char* name() const override { std::string base = "VXXJitCode"; if (type_ == operand_type::MAX) { base += "_MAX"; diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h index c388109604..91058f6cf6 100644 --- a/paddle/fluid/operators/jit/gen/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h @@ -68,7 +68,6 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator { (code_size % 4096 != 0 ? 
(code_size / 4096 + 1) * 4096 : code_size), code_ptr) {} - virtual const char* name() const = 0; virtual void genCode() = 0; size_t getSize() const override { return CodeGenerator::getSize(); } diff --git a/paddle/fluid/operators/jit/gen/matmul.h b/paddle/fluid/operators/jit/gen/matmul.h index 626baa8f73..7976e3112d 100644 --- a/paddle/fluid/operators/jit/gen/matmul.h +++ b/paddle/fluid/operators/jit/gen/matmul.h @@ -36,7 +36,7 @@ class MatMulJitCode : public JitCode { this->genCode(); } - virtual const char* name() const { + virtual const char* name() const override { std::string base = "MatMulJitCode"; base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" + std::to_string(k_); diff --git a/paddle/fluid/operators/jit/gen/seqpool.h b/paddle/fluid/operators/jit/gen/seqpool.h index fcbbb3c84c..c464c2eac8 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.h +++ b/paddle/fluid/operators/jit/gen/seqpool.h @@ -38,7 +38,7 @@ class SeqPoolJitCode : public JitCode { this->genCode(); } - virtual const char* name() const { + virtual const char* name() const override { std::string base = "SeqPoolJitCode"; if (type_ == SeqPoolType::kSum) { base += "_Sum"; diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc index 3cd5f6554b..f3603875ad 100644 --- a/paddle/fluid/operators/jit/gen_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -17,7 +17,13 @@ #include <iostream> #include <sstream> #include <vector> +#include "paddle/fluid/memory/allocation/cpu_allocator.h" // for posix_memalign #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/enforce.h" + +#ifndef _WIN32 +#define posix_memalign_free free +#endif DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); @@ -40,6 +46,17 @@ void GenBase::dumpCode(const unsigned char* code) const { } } +void* GenBase::operator new(size_t size) { + void* ptr; + constexpr size_t alignment = 32ul; + PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), 0, + "GenBase Alloc %ld error!", size); + PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); + return ptr; +} + +void GenBase::operator delete(void* ptr) { posix_memalign_free(ptr); } + std::vector<int> packed_groups(int n, int k, int* block_out, int* rest_out) { int block; int max_num_regs; diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index d808a33247..0f85245ba9 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -42,6 +42,11 @@ class GenBase : public Kernel { return reinterpret_cast<Func>(const_cast<unsigned char*>(code)); } + void* operator new(size_t size); + void operator delete(void* ptr); + void* operator new[](size_t size) { return operator new(size); } + void operator delete[](void* ptr) { operator delete(ptr); } + protected: void dumpCode(const unsigned char* code) const; }; diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index bba3a13619..d5773d6594 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -118,26 +118,33 @@ typename KernelTuples::func_type Get( return GetRefer<KT, KernelTuples>(); } -template <KernelType KT, typename KernelTuples> -class KernelFuncsCache { +template <KernelType KT, typename KernelTuples, typename PlaceType> +class KernelFuncs { public: - KernelFuncsCache() = default; - static KernelFuncsCache& Instance() { - static thread_local KernelFuncsCache<KT, KernelTuples> g_func_cache; + 
KernelFuncs() = default; + static KernelFuncs& Cache() { + static thread_local KernelFuncs<KT, KernelTuples, PlaceType> g_func_cache; return g_func_cache; } bool Has(int key) const { return funcs_.find(key) != funcs_.end(); } - typename KernelTuples::func_type At(int key) { return funcs_.at(key); } - void Insert(int key, typename KernelTuples::func_type func) { funcs_.emplace(key, func); } + typename KernelTuples::func_type At(int key) { + if (Has(key)) { + return funcs_.at(key); + } + auto func = Get<KT, KernelTuples, PlaceType>(key); + Insert(key, func); + return func; + } + private: std::unordered_map<int, typename KernelTuples::func_type> funcs_; - DISABLE_COPY_AND_ASSIGN(KernelFuncsCache); + DISABLE_COPY_AND_ASSIGN(KernelFuncs); }; const char* to_string(KernelType kt); diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index 0f42ac158c..0036d1c238 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -49,49 +49,16 @@ void VTanh(const T* x, T* y, int n) { } void Softmax(const T* x, T* y, int n, int bs) { - typename XRNTuples<T>::func_type compute_hmax{nullptr}; - typename XRNTuples<T>::func_type compute_hsum{nullptr}; - typename AXYNTuples<T>::func_type compute_vscal{nullptr}; - typename AXYNTuples<T>::func_type compute_vaddbias{nullptr}; - typename XYNTuples<T>::func_type compute_vexp{nullptr}; - - if (!KernelFuncsCache<kHMax, XRNTuples<T>>::Instance().Has(n)) { - compute_hmax = Get<kHMax, XRNTuples<T>, platform::CPUPlace>(n); - KernelFuncsCache<kHMax, XRNTuples<T>>::Instance().Insert(n, compute_hmax); - } else { - compute_hmax = KernelFuncsCache<kHMax, XRNTuples<T>>::Instance().At(n); - } - - if (!KernelFuncsCache<kHSum, XRNTuples<T>>::Instance().Has(n)) { - compute_hsum = Get<kHSum, XRNTuples<T>, platform::CPUPlace>(n); - KernelFuncsCache<kHSum, XRNTuples<T>>::Instance().Insert(n, compute_hsum); - } else { - compute_hsum = KernelFuncsCache<kHSum, XRNTuples<T>>::Instance().At(n); - } - - if (!KernelFuncsCache<kVScal, AXYNTuples<T>>::Instance().Has(n)) { - compute_vscal = Get<kVScal, AXYNTuples<T>, platform::CPUPlace>(n); - KernelFuncsCache<kVScal, AXYNTuples<T>>::Instance().Insert(n, - compute_vscal); - } else { - compute_vscal = KernelFuncsCache<kVScal, AXYNTuples<T>>::Instance().At(n); - } - - if (!KernelFuncsCache<kVAddBias, AXYNTuples<T>>::Instance().Has(n)) { - compute_vaddbias = Get<kVAddBias, AXYNTuples<T>, platform::CPUPlace>(n); - KernelFuncsCache<kVAddBias, AXYNTuples<T>>::Instance().Insert( - n, compute_vaddbias); - } else { - compute_vaddbias = - KernelFuncsCache<kVAddBias, AXYNTuples<T>>::Instance().At(n); - } - - if (!KernelFuncsCache<kVExp, XYNTuples<T>>::Instance().Has(n)) { - compute_vexp = Get<KernelType::kVExp, XYNTuples<T>, platform::CPUPlace>(n); - KernelFuncsCache<kVExp, XYNTuples<T>>::Instance().Insert(n, compute_vexp); - } else { - compute_vexp = KernelFuncsCache<kVExp, XYNTuples<T>>::Instance().At(n); - } + auto compute_hmax = + KernelFuncs<kHMax, XRNTuples<T>, platform::CPUPlace>::Cache().At(n); + auto compute_hsum = + KernelFuncs<kHSum, XRNTuples<T>, platform::CPUPlace>::Cache().At(n); + auto compute_vscal = + KernelFuncs<kVScal, AXYNTuples<T>, platform::CPUPlace>::Cache().At(n); + auto compute_vaddbias = + KernelFuncs<kVAddBias, AXYNTuples<T>, platform::CPUPlace>::Cache().At(n); + auto compute_vexp = + KernelFuncs<kVExp, XYNTuples<T>, platform::CPUPlace>::Cache().At(n); for (int i = 0; i < bs; ++i) { T scalar; diff --git 
a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index c7d0215eda..4c999131ab 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -136,7 +136,7 @@ bool VMulKernel<float>::UseMe(const int& d) const { template <> bool VAddKernel<float>::UseMe(const int& d) const { - return platform::MayIUse(platform::avx512f) && d > 512; + return platform::MayIUse(platform::avx) && d > 512; } template <> diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index a7d0fd4856..56c6e37ae3 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -129,6 +129,7 @@ class LookupTableGradKernel : public framework::OpKernel<T> { "must be either LoDTensor or SelectedRows"); } + int64_t padding_idx = context.Attr<int64_t>("padding_idx"); bool is_sparse = context.Attr<bool>("is_sparse"); // Since paddings are not trainable and fixed in forward, the gradient of // paddings makes no sense and we don't deal with it in backward. @@ -187,10 +188,15 @@ class LookupTableGradKernel : public framework::OpKernel<T> { memset(d_table_data, 0, d_table->numel() * sizeof(T)); for (int64_t i = 0; i < ids->numel(); ++i) { - PADDLE_ENFORCE_LT(ids_data[i], N); - PADDLE_ENFORCE_GE(ids_data[i], 0); - for (int j = 0; j < D; ++j) { - d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j]; + if (padding_idx != kNoPadding && ids_data[i] == padding_idx) { + // the gradient of padding_idx should be 0, already done by memset, so + // do nothing. + } else { + PADDLE_ENFORCE_LT(ids_data[i], N); + PADDLE_ENFORCE_GE(ids_data[i], 0); + for (int j = 0; j < D; ++j) { + d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j]; + } } } } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index e20524012a..4b6eef18d8 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -37,7 +37,7 @@ math_library(concat_and_split) math_library(context_project DEPS im2col math_function) math_library(cross_entropy) math_library(cos_sim_functor) -math_library(depthwise_conv) +math_library(depthwise_conv DEPS cub) math_library(im2col) math_library(sampler) diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h index cddd0a18db..0ad57c51be 100644 --- a/paddle/fluid/operators/math/fc_compute.h +++ b/paddle/fluid/operators/math/fc_compute.h @@ -30,15 +30,17 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M, return; } if (relu) { - auto compute = - jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(N); + auto compute = jit::KernelFuncs<jit::kVAddRelu, jit::XYZNTuples<T>, + platform::CPUPlace>::Cache() + .At(N); for (int i = 0; i < M; i++) { T* dst = Y + i * N; compute(B, dst, dst, N); } } else { - auto compute = - jit::Get<jit::kVAdd, jit::XYZNTuples<T>, platform::CPUPlace>(N); + auto compute = jit::KernelFuncs<jit::kVAdd, jit::XYZNTuples<T>, + platform::CPUPlace>::Cache() + .At(N); #ifdef PADDLE_WITH_MKLML #pragma omp parallel for #endif diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 1ff9ff684f..a1cb3f9728 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -82,8 +82,9 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> { const int kClassDim = 1; // 2D data. 
Batch x C auto compute_softmax = - jit::Get<jit::kSoftmax, jit::SoftmaxTuples<float>, platform::CPUPlace>( - in_dims[kClassDim]); + jit::KernelFuncs<jit::kSoftmax, jit::SoftmaxTuples<float>, + platform::CPUPlace>::Cache() + .At(in_dims[kClassDim]); compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]); } }; diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc index 13b168ce45..e8b92fc02a 100644 --- a/paddle/fluid/operators/ngraph/ngraph_bridge.cc +++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc @@ -31,6 +31,11 @@ std::map<std::string, std::shared_ptr<std::unordered_map< std::string, std::shared_ptr<ngraph::Node>>>)>> NgraphBridge::NG_NODE_MAP = { + {"accuracy", NG_OPS::BuildAccuracyNode}, + {"conv2d", NG_OPS::BuildConv2dNode}, + {"conv2d_grad", NG_OPS::BuildConv2dGradNode}, + {"batch_norm", NG_OPS::BuildBatchNormNode}, + {"batch_norm_grad", NG_OPS::BuildBatchNormGradNode}, {"elementwise_add", NG_OPS::BuildElementwiseAddNode}, {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode}, {"fill_constant", NG_OPS::BuildFillConstantNode}, diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.h b/paddle/fluid/operators/ngraph/ngraph_engine_op.h index d2974298b0..2f194a9b87 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine_op.h +++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.h @@ -35,7 +35,7 @@ class NgraphEngineOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { framework::OpKernelType kt = framework::OpKernelType( - framework::proto::VarType::FP32, ctx.GetPlace()); + framework::proto::VarType::FP32, platform::CPUPlace()); return kt; } }; diff --git a/paddle/fluid/operators/ngraph/ngraph_ops.h b/paddle/fluid/operators/ngraph/ngraph_ops.h index 4b7aa3393b..438a9c1be9 100644 --- a/paddle/fluid/operators/ngraph/ngraph_ops.h +++ b/paddle/fluid/operators/ngraph/ngraph_ops.h @@ -21,7 +21,10 @@ limitations under the License. */ #pragma once -#include "ops/binary_unnary_op.h" +#include "ops/accuracy_op.h" +#include "ops/batch_norm_op.h" +#include "ops/binary_unary_op.h" +#include "ops/conv2d_op.h" #include "ops/elementwise_add_op.h" #include "ops/fill_constant_op.h" #include "ops/mean_op.h" diff --git a/paddle/fluid/operators/ngraph/ops/accuracy_op.h b/paddle/fluid/operators/ngraph/ops/accuracy_op.h new file mode 100644 index 0000000000..bf37ce48d8 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/accuracy_op.h @@ -0,0 +1,65 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include <string> +#include <vector> +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildAccuracyNode( + const std::shared_ptr<framework::OperatorBase>& op, + std::shared_ptr< + std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> + ngb_node_map) { + auto indices = platform::GetInputNode(op, "Indices", ngb_node_map); + auto label = platform::GetInputNode(op, "Label", ngb_node_map); + auto inference = platform::GetInputNode(op, "Out", ngb_node_map); + auto inference_shape = inference->get_shape(); + size_t num_samples = inference_shape.at(0); + size_t k = inference_shape.at(1); + + std::shared_ptr<ngraph::Node> label_k = label; + if (k > 1) { + auto label_1d = std::make_shared<ngraph::op::Reshape>( + label, ngraph::AxisVector{0, 1}, ngraph::Shape{num_samples}); + label_k = std::make_shared<ngraph::op::Broadcast>(label_1d, inference_shape, + ngraph::AxisSet{1}); + } + + auto node_equal = std::make_shared<ngraph::op::Equal>(indices, label_k); + auto node_eq_int = + std::make_shared<ngraph::op::Convert>(node_equal, ngraph::element::i64); + auto num_correct_0d = + std::make_shared<ngraph::op::Sum>(node_eq_int, ngraph::AxisSet{0, 1}); + std::shared_ptr<ngraph::Node> num_correct = + platform::NgReshaper(num_correct_0d, ngraph::Shape{1}); + std::shared_ptr<ngraph::Node> n_samples = ngraph::op::Constant::create( + ngraph::element::i64, ngraph::Shape{1}, {num_samples}); + std::shared_ptr<ngraph::Node> accuracy = std::make_shared<ngraph::op::Divide>( + std::make_shared<ngraph::op::Convert>(num_correct, ngraph::element::f32), + std::make_shared<ngraph::op::Convert>(n_samples, ngraph::element::f32)); + + platform::SetOutputNode(op, "Accuracy", accuracy, ngb_node_map); + platform::SetOutputNode(op, "Correct", num_correct, ngb_node_map); + platform::SetOutputNode(op, "Total", n_samples, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ops/batch_norm_op.h b/paddle/fluid/operators/ngraph/ops/batch_norm_op.h new file mode 100644 index 0000000000..2cdd029976 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/batch_norm_op.h @@ -0,0 +1,150 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include <string> +#include <vector> + +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/elementwise_node.h" +#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildBatchNormNode( + const std::shared_ptr<paddle::framework::OperatorBase>& op, + std::shared_ptr< + std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + auto& data_layout = op_attrs.Get<std::string>("data_layout"); + + auto bias = paddle::platform::GetInputNode(op, "Bias", ngb_node_map); + auto mean = paddle::platform::GetInputNode(op, "Mean", ngb_node_map); + auto variance = paddle::platform::GetInputNode(op, "Variance", ngb_node_map); + auto scale = paddle::platform::GetInputNode(op, "Scale", ngb_node_map); + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + + const bool is_test = op_attrs.Get<bool>("is_test"); + const float epsilon = op_attrs.Get<float>("epsilon"); + const float momentum = op_attrs.Get<float>("momentum"); + + if (data_layout == "NHWC") { + x = paddle::platform::Nhwc2Nchw(x); + } + + std::shared_ptr<ngraph::Node> mean_out, saved_mean, saved_variance, + variance_out, y; + + if (!is_test) { + auto BN = std::make_shared<ngraph::op::BatchNormTraining>(epsilon, scale, + bias, x); + y = std::make_shared<ngraph::op::GetOutputElement>(BN, 0); + saved_mean = std::make_shared<ngraph::op::GetOutputElement>(BN, 1); + saved_variance = std::make_shared<ngraph::op::GetOutputElement>(BN, 2); + + mean_out = std::make_shared<ngraph::op::Add>( + paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Multiply>( + momentum, mean), + paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Multiply>( + 1. - momentum, saved_mean)); + variance_out = std::make_shared<ngraph::op::Add>( + paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Multiply>( + momentum, variance), + paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Multiply>( + 1. 
- momentum, saved_variance));
+
+    if (data_layout == "NHWC") {
+      y = paddle::platform::Nchw2Nhwc(y);
+    }
+
+    paddle::platform::SetOutputNode(op, "MeanOut", mean_out, ngb_node_map);
+    paddle::platform::SetOutputNode(op, "VarianceOut", variance_out,
+                                    ngb_node_map);
+    paddle::platform::SetOutputNode(op, "SavedMean", saved_mean, ngb_node_map);
+    paddle::platform::SetOutputNode(op, "SavedVariance", saved_variance,
+                                    ngb_node_map);
+    paddle::platform::SetOutputNode(op, "Y", y, ngb_node_map);
+  } else {
+    y = std::make_shared<ngraph::op::BatchNormInference>(epsilon, scale, bias,
+                                                         x, mean, variance);
+    paddle::platform::SetOutputNode(op, "Y", y, ngb_node_map);
+  }
+}
+
+void BuildBatchNormGradNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  auto& data_layout = op_attrs.Get<std::string>("data_layout");
+
+  auto bias = paddle::platform::GetInputNode(op, "Bias", ngb_node_map);
+  auto saved_mean =
+      paddle::platform::GetInputNode(op, "SavedMean", ngb_node_map);
+  auto saved_variance =
+      paddle::platform::GetInputNode(op, "SavedVariance", ngb_node_map);
+  auto scale = paddle::platform::GetInputNode(op, "Scale", ngb_node_map);
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map);
+  auto x_shape = x->get_shape();
+  auto dy_shape = dy->get_shape();
+
+  PADDLE_ENFORCE(x_shape.size() == 2 || x_shape.size() == 4,
+                 "BN grad input size needs to be 2 or 4");
+  PADDLE_ENFORCE_EQ(x_shape.size(), dy_shape.size(),
+                    "BN grad input and delta sizes need to be equal");
+
+  if (x_shape.size() == 2) {
+    x = std::make_shared<ngraph::op::Reshape>(
+        x, ngraph::AxisVector{0, 1},
+        ngraph::Shape{x_shape.at(0), x_shape.at(1), 1, 1});
+    dy = std::make_shared<ngraph::op::Reshape>(
+        dy, ngraph::AxisVector{0, 1},
+        ngraph::Shape{dy_shape.at(0), dy_shape.at(1), 1, 1});
+  }
+
+  if (data_layout == "NHWC") {
+    x = paddle::platform::Nhwc2Nchw(x);
+    dy = paddle::platform::Nhwc2Nchw(dy);
+  }
+  const float epsilon = op_attrs.Get<float>("epsilon");
+
+  auto bn_bprop = std::make_shared<ngraph::op::BatchNormTrainingBackprop>(
+      epsilon, scale, bias, x, saved_mean, saved_variance, dy);
+
+  std::shared_ptr<ngraph::Node> dx =
+      std::make_shared<ngraph::op::GetOutputElement>(bn_bprop, 0);
+  auto dscale = std::make_shared<ngraph::op::GetOutputElement>(bn_bprop, 1);
+  auto dbias = std::make_shared<ngraph::op::GetOutputElement>(bn_bprop, 2);
+  paddle::platform::SetOutputNode(op, "Bias@GRAD", dbias, ngb_node_map);
+  paddle::platform::SetOutputNode(op, "Scale@GRAD", dscale, ngb_node_map);
+  if (x_shape.size() == 2) {
+    paddle::platform::SetOutputNode(
+        op, "X@GRAD", paddle::platform::NgReshaper(dx, x_shape), ngb_node_map);
+  } else {
+    if (data_layout == "NHWC") {
+      dx = paddle::platform::Nchw2Nhwc(dx);
+    }
+    paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
+  }
+}
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h b/paddle/fluid/operators/ngraph/ops/binary_unary_op.h
similarity index 100%
rename from paddle/fluid/operators/ngraph/ops/binary_unnary_op.h
rename to paddle/fluid/operators/ngraph/ops/binary_unary_op.h
diff --git a/paddle/fluid/operators/ngraph/ops/conv2d_op.h b/paddle/fluid/operators/ngraph/ops/conv2d_op.h
new file mode 100644
index 0000000000..46fb2703f5
--- 
/dev/null +++ b/paddle/fluid/operators/ngraph/ops/conv2d_op.h @@ -0,0 +1,235 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include <string> +#include <vector> +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +std::shared_ptr<ngraph::Node> GroupedConvolution( + const std::shared_ptr<ngraph::Node>& data_batch, + const std::shared_ptr<ngraph::Node>& filters, const ngraph::Strides strides, + const ngraph::Strides dilations, const ngraph::CoordinateDiff& paddings, + size_t groups) { + auto& data_shape = data_batch->get_shape(); + auto& filter_shape = filters->get_shape(); + ngraph::NodeVector ng_slices; + + for (size_t i = 0; i < groups; ++i) { + size_t channel_step = filter_shape.at(1); + const std::vector<size_t> lower_bound{0, i * channel_step, 0, 0}; + const std::vector<size_t> upper_bound{data_shape.at(0), + (i + 1) * channel_step, + data_shape.at(2), data_shape.at(3)}; + auto data_slice = std::make_shared<ngraph::op::Slice>( + data_batch, lower_bound, upper_bound); + + size_t filter_step = filter_shape.at(0) / groups; + const std::vector<size_t> filter_lower_bound{i * filter_step, 0, 0, 0}; + const std::vector<size_t> filter_upper_bound{ + (i + 1) * filter_step, filter_shape.at(1), filter_shape.at(2), + filter_shape.at(3)}; + auto filter_slice = std::make_shared<ngraph::op::Slice>( + filters, filter_lower_bound, filter_upper_bound); + auto ng_conv = std::make_shared<ngraph::op::Convolution>( + data_slice, filter_slice, strides, dilations, paddings, paddings); + ng_slices.push_back(ng_conv); + } + + size_t concat_axis = 1; + return std::make_shared<ngraph::op::Concat>(ng_slices, concat_axis); +} + +std::shared_ptr<ngraph::Node> GroupedGradConvolutionFilter( + const std::shared_ptr<ngraph::Node>& data_batch, + const std::shared_ptr<ngraph::Node>& filters, + const std::shared_ptr<ngraph::Node>& doutput, const ngraph::Strides strides, + const ngraph::Strides dilations, const ngraph::CoordinateDiff& paddings, + size_t groups) { + auto& data_shape = data_batch->get_shape(); + auto& filter_shape = filters->get_shape(); + auto& out_shape = doutput->get_shape(); + ngraph::NodeVector ng_slices; + + for (size_t i = 0; i < groups; ++i) { + size_t channel_step = filter_shape.at(1); + const std::vector<size_t> lower_bound{0, i * channel_step, 0, 0}; + const std::vector<size_t> upper_bound{data_shape.at(0), + (i + 1) * channel_step, + data_shape.at(2), data_shape.at(3)}; + auto data_slice = std::make_shared<ngraph::op::Slice>( + data_batch, lower_bound, upper_bound); + + size_t filter_step = data_shape.at(0); + + const std::vector<size_t> filter_lower_bound{i * filter_step, 0, 0, 0}; + const std::vector<size_t> filter_upper_bound{ + (i + 1) * filter_step, filter_shape.at(1), filter_shape.at(2), + filter_shape.at(3)}; + auto filter_slice = std::make_shared<ngraph::op::Slice>( + filters, filter_lower_bound, filter_upper_bound); + + const 
std::vector<size_t> olower_bound{0, i * filter_step, 0, 0}; + const std::vector<size_t> oupper_bound{out_shape.at(0), + (i + 1) * filter_step, + out_shape.at(2), out_shape.at(3)}; + auto out_slice = std::make_shared<ngraph::op::Slice>(doutput, olower_bound, + oupper_bound); + + auto ng_conv = std::make_shared<ngraph::op::ConvolutionBackpropFilters>( + data_slice, filter_slice->get_shape(), out_slice, strides, dilations, + paddings, paddings, ngraph::Strides{1, 1}); + + ng_slices.push_back(ng_conv); + } + + size_t concat_axis = 0; + return std::make_shared<ngraph::op::Concat>(ng_slices, concat_axis); +} + +std::shared_ptr<ngraph::Node> GroupedGradConvolutionData( + const std::shared_ptr<ngraph::Node>& data_batch, + const std::shared_ptr<ngraph::Node>& filters, + const std::shared_ptr<ngraph::Node>& doutput, const ngraph::Strides strides, + const ngraph::Strides dilations, const ngraph::CoordinateDiff& paddings, + size_t groups) { + auto& data_shape = data_batch->get_shape(); + auto& filter_shape = filters->get_shape(); + auto& out_shape = doutput->get_shape(); + ngraph::NodeVector ng_slices; + + for (size_t i = 0; i < groups; ++i) { + size_t channel_step = filter_shape.at(1); + const std::vector<size_t> lower_bound{0, i * channel_step, 0, 0}; + const std::vector<size_t> upper_bound{data_shape.at(0), + (i + 1) * channel_step, + data_shape.at(2), data_shape.at(3)}; + auto data_slice = std::make_shared<ngraph::op::Slice>( + data_batch, lower_bound, upper_bound); + + size_t filter_step = data_shape.at(0); + + const std::vector<size_t> filter_lower_bound{i * filter_step, 0, 0, 0}; + const std::vector<size_t> filter_upper_bound{ + (i + 1) * filter_step, filter_shape.at(1), filter_shape.at(2), + filter_shape.at(3)}; + auto filter_slice = std::make_shared<ngraph::op::Slice>( + filters, filter_lower_bound, filter_upper_bound); + + const std::vector<size_t> olower_bound{0, i * filter_step, 0, 0}; + const std::vector<size_t> oupper_bound{out_shape.at(0), + (i + 1) * filter_step, + out_shape.at(2), out_shape.at(3)}; + auto out_slice = std::make_shared<ngraph::op::Slice>(doutput, olower_bound, + oupper_bound); + + auto ng_conv = std::make_shared<ngraph::op::ConvolutionBackpropData>( + data_slice->get_shape(), filter_slice, out_slice, strides, dilations, + paddings, paddings, ngraph::Strides{1, 1}); + ng_slices.push_back(ng_conv); + } + + size_t concat_axis = 1; + return std::make_shared<ngraph::op::Concat>(ng_slices, concat_axis); +} + +void BuildConv2dNode( + const std::shared_ptr<paddle::framework::OperatorBase>& op, + std::shared_ptr< + std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + auto filters = paddle::platform::GetInputNode(op, "Filter", ngb_node_map); + auto input = paddle::platform::GetInputNode(op, "Input", ngb_node_map); + + std::vector<int> strides = op_attrs.Get<std::vector<int>>("strides"); + std::vector<int> paddings = op_attrs.Get<std::vector<int>>("paddings"); + std::vector<int> dilations = op_attrs.Get<std::vector<int>>("dilations"); + + const ngraph::Strides ng_strides{static_cast<size_t>(strides.at(0)), + static_cast<size_t>(strides.at(1))}; + const ngraph::Strides ng_dilations{static_cast<size_t>(dilations.at(0)), + static_cast<size_t>(dilations.at(1))}; + const ngraph::CoordinateDiff ng_paddings{ + static_cast<std::ptrdiff_t>(paddings.at(0)), + static_cast<std::ptrdiff_t>(paddings.at(1))}; + + int groups = static_cast<size_t>(op_attrs.Get<int>("groups")); + PADDLE_ENFORCE_GE(groups, 
1, "conv groups needs be no less than 1"); + + std::shared_ptr<ngraph::Node> result; + if (groups == 1) { + result = std::make_shared<ngraph::op::Convolution>( + input, filters, ng_strides, ng_dilations, ng_paddings, ng_paddings); + } else { + result = GroupedConvolution(input, filters, ng_strides, ng_dilations, + ng_paddings, groups); + } + paddle::platform::SetOutputNode(op, "Output", result, ngb_node_map); +} + +void BuildConv2dGradNode( + const std::shared_ptr<paddle::framework::OperatorBase>& op, + std::shared_ptr< + std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + auto filter = paddle::platform::GetInputNode(op, "Filter", ngb_node_map); + auto input = paddle::platform::GetInputNode(op, "Input", ngb_node_map); + auto doutput = + paddle::platform::GetInputNode(op, "Output@GRAD", ngb_node_map); + + int groups = op_attrs.Get<int>("groups"); + std::vector<int> strides = op_attrs.Get<std::vector<int>>("strides"); + std::vector<int> paddings = op_attrs.Get<std::vector<int>>("paddings"); + std::vector<int> dilations = op_attrs.Get<std::vector<int>>("dilations"); + + const ngraph::Strides ng_strides{static_cast<size_t>(strides.at(0)), + static_cast<size_t>(strides.at(1))}; + const ngraph::Strides ng_dilations{static_cast<size_t>(dilations.at(0)), + static_cast<size_t>(dilations.at(1))}; + const ngraph::CoordinateDiff ng_paddings{ + static_cast<std::ptrdiff_t>(paddings.at(0)), + static_cast<std::ptrdiff_t>(paddings.at(1))}; + + std::shared_ptr<ngraph::Node> dfilter; + std::shared_ptr<ngraph::Node> dinput; + if (groups == 1) { + dfilter = std::make_shared<ngraph::op::ConvolutionBackpropFilters>( + input, filter->get_shape(), doutput, ng_strides, ng_dilations, + ng_paddings, ng_paddings, ngraph::Strides{1, 1}); + + dinput = std::make_shared<ngraph::op::ConvolutionBackpropData>( + input->get_shape(), filter, doutput, ng_strides, ng_dilations, + ng_paddings, ng_paddings, ngraph::Strides{1, 1}); + + } else { + dfilter = GroupedGradConvolutionFilter(input, filter, doutput, ng_strides, + ng_dilations, ng_paddings, groups); + dinput = GroupedGradConvolutionData(input, filter, doutput, ng_strides, + ng_dilations, ng_paddings, groups); + } + + paddle::platform::SetOutputNode(op, "Filter@GRAD", dfilter, ngb_node_map); + paddle::platform::SetOutputNode(op, "Input@GRAD", dinput, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ops/top_k_op.h b/paddle/fluid/operators/ngraph/ops/top_k_op.h index ea66953a12..852ecd7139 100644 --- a/paddle/fluid/operators/ngraph/ops/top_k_op.h +++ b/paddle/fluid/operators/ngraph/ops/top_k_op.h @@ -36,11 +36,6 @@ void BuildTopKNode( std::make_shared<ngraph::op::GetOutputElement>(top_k, 0); std::shared_ptr<ngraph::Node> out = std::make_shared<ngraph::op::GetOutputElement>(top_k, 1); - auto dummy_out = paddle::platform::GetOutputNode(op, "Out", ngb_node_map); - if (dummy_out && dummy_out->get_element_type() != out->get_element_type()) { - out = std::make_shared<ngraph::op::Convert>(out, - dummy_out->get_element_type()); - } paddle::platform::SetOutputNode(op, "Indices", indices, ngb_node_map); paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map); } diff --git a/paddle/fluid/operators/norm_op.h b/paddle/fluid/operators/norm_op.h index 6c95d3f3bf..f81cbc2c73 100644 --- a/paddle/fluid/operators/norm_op.h +++ b/paddle/fluid/operators/norm_op.h @@ -99,10 +99,10 @@ class NormGradKernel : public 
framework::OpKernel<T> { auto dx_e = framework::EigenVector<T>::Flatten(*out_dx); Eigen::DSizes<int, 3> shape(pre, n, post); - Eigen::DSizes<int, 2> norm_shape(pre, post); + Eigen::DSizes<int, 3> rshape(pre, 1, post); auto x = x_e.reshape(shape); auto dy = dy_e.reshape(shape); - auto norm = norm_e.reshape(norm_shape); + auto norm = norm_e.reshape(rshape); auto dx = dx_e.reshape(shape); framework::Tensor rsum; @@ -111,7 +111,6 @@ class NormGradKernel : public framework::OpKernel<T> { Eigen::DSizes<int, 1> rdim(1); Eigen::DSizes<int, 3> bcast(1, n, 1); - Eigen::DSizes<int, 3> rshape(pre, 1, post); // dx = ( dy/sqrt(sum(x*x)) ) * [1 - x*sum(x) / (sum(x*x) + e)] // = [dy - dy * x * sum(x) / (sum(x*x) + e)] / sqrt(sum(x*x)) diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 5399ae556e..fc3636e0b2 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -259,7 +259,7 @@ Example: W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1 $$ - For exclusive = true: + For exclusive = false: $$ hstart = i * strides[0] - paddings[0] hend = hstart + ksize[0] @@ -267,7 +267,7 @@ Example: wend = wstart + ksize[1] Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]} $$ - For exclusive = false: + For exclusive = true: $$ hstart = max(0, i * strides[0] - paddings[0]) hend = min(H, hstart + ksize[0]) @@ -403,7 +403,7 @@ Example: H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1 $$ - For exclusive = true: + For exclusive = false: $$ dstart = i * strides[0] - paddings[0] dend = dstart + ksize[0] @@ -413,7 +413,7 @@ Example: wend = wstart + ksize[2] Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]} $$ - For exclusive = false: + For exclusive = true: $$ dstart = max(0, i * strides[0] - paddings[0]) dend = min(D, dstart + ksize[0]) diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index d68ba9d661..ee034b2705 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -121,7 +121,7 @@ struct RandomCropFunctor { HOSTDEVICE void operator()(size_t ins_idx) { typename Random<DeviceContext>::Engine engine(seed_); engine.discard(ins_idx * (rank_ - num_batchsize_dims_)); - size_t offsets[9]; + size_t offsets[9] = {}; for (int i = num_batchsize_dims_; i < rank_; ++i) { typename Random<DeviceContext>::template UniformIntDist<size_t> dist( 0, x_dims_[i] - out_dims_[i]); diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 26ff221dfa..defc29b91f 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/reader/buffered_reader.h" #include <vector> +#include "paddle/fluid/framework/data_type.h" namespace paddle { namespace operators { @@ -24,6 +25,13 @@ BufferedReader::~BufferedReader() { position_.front().wait(); position_.pop(); } +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place_)) { + platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device); + PADDLE_ENFORCE(cudaStreamDestroy(stream)); + for (auto &event : events) PADDLE_ENFORCE(cudaEventDestroy(event)); + } +#endif } BufferedReader::BufferedReader( @@ -33,6 +41,19 @@ 
BufferedReader::BufferedReader( thread_pool_(1), place_(place), buffer_size_(buffer_size) { +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place_)) { + platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device); + compute_stream = + ((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance() + .Get(place_))) + ->stream(); + events.resize(buffer_size); + for (auto &event : events) + PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + } +#endif cpu_buffer_.resize(buffer_size); gpu_buffer_.resize(buffer_size); ReadTillBufferFullAsync(); @@ -46,6 +67,12 @@ void BufferedReader::ReadTillBufferFullAsync() { } void BufferedReader::ReadAsync(size_t i) { +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place_)) { + platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device); + PADDLE_ENFORCE(cudaEventRecord(events[i], compute_stream)); + } +#endif position_.emplace(thread_pool_.enqueue([this, i]() -> size_t { TensorVec &cpu = cpu_buffer_[i]; reader_->ReadNext(&cpu); @@ -54,14 +81,41 @@ void BufferedReader::ReadAsync(size_t i) { return -1UL; } +#ifdef PADDLE_WITH_CUDA + // NOTE(liangdun): using async copy instead of TensorCopySync + // TensorCopySync would block other stream if (platform::is_gpu_place(place_)) { + platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device); + PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0)); TensorVec &gpu = gpu_buffer_[i]; gpu.resize(cpu.size()); for (size_t i = 0; i < cpu.size(); ++i) { - framework::TensorCopySync(cpu[i], place_, &gpu[i]); + gpu[i].Resize(cpu[i].dims()); + gpu[i].set_layout(cpu[i].layout()); + auto cpu_place = cpu[i].place(); + auto cpu_ptr = cpu[i].data<void>(); + auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type()); + auto size = + cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); + if (platform::is_cuda_pinned_place(cpu_place)) + memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr, + boost::get<platform::CUDAPinnedPlace>(cpu_place), + cpu_ptr, size, stream); + else if ((platform::is_gpu_place(cpu_place))) + memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr, + boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr, + size, stream); + else + // if cpu place is not pinned, async copy is slower than sync copy, + // so we use sync copy instead. 
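+          // (Editorial note) Pageable host memory cannot truly overlap with
+          // compute anyway: the driver first stages it through an internal
+          // pinned buffer, so an "async" copy would serialize regardless.
+          // Passing 0 as the stream argument below keeps the copy off the
+          // dedicated reader stream; whether that maps to cudaMemcpyAsync on
+          // the legacy default stream or a fully blocking cudaMemcpy depends
+          // on memory::Copy's CPUPlace specialization.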
+ memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr, + boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size, + 0); gpu[i].set_lod(cpu[i].lod()); } + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); } +#endif return i; })); } diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index cbe2bc1b5f..87680da01a 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -19,6 +19,9 @@ #include <vector> #include "ThreadPool.h" #include "paddle/fluid/framework/reader.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/gpu_info.h" +#endif namespace paddle { namespace operators { @@ -59,6 +62,11 @@ class BufferedReader : public framework::DecoratedReader { std::vector<TensorVec> cpu_buffer_; std::vector<TensorVec> gpu_buffer_; size_t prev_pos_{-1UL}; +#ifdef PADDLE_WITH_CUDA + cudaStream_t stream; + cudaStream_t compute_stream; + std::vector<cudaEvent_t> events; +#endif }; } // namespace reader diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index f08798794a..43a49de522 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -213,7 +213,7 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader, framework::LoD lod{lod_data}; lod_tensor.set_lod(lod); int64_t* tensor_data = lod_tensor.mutable_data<int64_t>( - framework::make_ddim({1, static_cast<int64_t>(batch_feasign.size())}), + framework::make_ddim({static_cast<int64_t>(batch_feasign.size()), 1}), platform::CPUPlace()); memcpy(tensor_data, batch_feasign.data(), batch_feasign.size() * sizeof(int64_t)); @@ -223,7 +223,7 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader, // insert label tensor framework::LoDTensor label_tensor; auto* label_tensor_data = label_tensor.mutable_data<int64_t>( - framework::make_ddim({1, static_cast<int64_t>(batch_label.size())}), + framework::make_ddim({static_cast<int64_t>(batch_label.size()), 1}), platform::CPUPlace()); memcpy(label_tensor_data, batch_label.data(), batch_label.size() * sizeof(int64_t)); diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index 9f3a254c84..6410439816 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -123,7 +123,7 @@ TEST(CTR_READER, read_data) { std::vector<std::tuple<LoD, std::vector<int64_t>>> data_slot_6003{b1, b2, b3, b4}; - std::vector<DDim> label_dims = {{1, 3}, {1, 3}, {1, 3}, {1, 1}}; + std::vector<DDim> label_dims = {{3, 1}, {3, 1}, {3, 1}, {1, 1}}; LoDTensorBlockingQueueHolder queue_holder; int capacity = 64; diff --git a/paddle/fluid/operators/reduce_ops/CMakeLists.txt b/paddle/fluid/operators/reduce_ops/CMakeLists.txt index 5fe4d15ae2..ebd07d90eb 100644 --- a/paddle/fluid/operators/reduce_ops/CMakeLists.txt +++ b/paddle/fluid/operators/reduce_ops/CMakeLists.txt @@ -1,5 +1,9 @@ include(operators) -register_operators() +if(WITH_GPU) + register_operators(DEPS cub) +else() + register_operators() +endif() if(WITH_GPU) file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.part.cu") diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 32365d6a96..eda54f76b8 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -327,14 +327,45 @@ class Reshape2GradOp : public 
framework::OperatorWithKernel { } }; +class ReshapeOpInplaceInToOut : public framework::InplaceInToOut { + public: + using InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map<std::string, std::string> Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map<std::string, std::string> inplace_in_to_out = { + {"X", "Out"}, + }; + return inplace_in_to_out; + } +}; + +class ReshapeGradInplaceInToOut : public framework::InplaceInToOut { + using InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map<std::string, std::string> Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map<std::string, std::string> inplace_in_to_out = { + {framework::GradVarName("Out"), framework::GradVarName("X")}, + }; + return inplace_in_to_out; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, - paddle::framework::DefaultGradOpDescMaker<true>); -REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp); + paddle::framework::DefaultGradOpDescMaker<true>, + ops::ReshapeOpInplaceInToOut); +REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp, + ops::ReshapeGradInplaceInToOut); REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int, ops::ReshapeKernel, int64_t, ops::ReshapeKernel); @@ -344,8 +375,9 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, ops::ReshapeGradKernel); REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker, - ops::Reshape2GradMaker); -REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp); + ops::Reshape2GradMaker, ops::ReshapeOpInplaceInToOut); +REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp, + ops::ReshapeGradInplaceInToOut); REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int, ops::ReshapeKernel, int64_t, ops::ReshapeKernel); diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 981969d2aa..4ea77ed30d 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -100,13 +100,14 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker { } }; +using ScaleOpInplace = framework::SingleOpInplaceInToOut; } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker, - ops::ScaleOpVarTypeInference); + ops::ScaleOpVarTypeInference, ops::ScaleOpInplace); REGISTER_OP_CPU_KERNEL( scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>, ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>, diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index bc889a5a04..8fbf299a7c 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -198,6 +198,21 @@ class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker { return std::unique_ptr<framework::OpDesc>(op); } }; + +class SoftmaxInplaceInToOut : public framework::InplaceInToOut { + public: + using framework::InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map<std::string, std::string> Apply( + const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + return std::unordered_map<std::string, std::string>{ + {"X", "Out"}, + }; + } +}; + } // namespace operators 
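 // (Editorial note) An InplaceInToOut subclass such as SoftmaxInplaceInToOut
 // above only declares a legal buffer-sharing pair: Apply() returns
 // {"X", "Out"}, which tells the enable_inplace build-strategy pass that the
 // op's output may reuse its input's buffer when that is safe. To take
 // effect, the class would be passed to the op's registration the way
 // ScaleOpInplace is above, roughly (hypothetical sketch, not in this patch):
 //   REGISTER_OPERATOR(softmax, ..., ops::SoftmaxInplaceInToOut);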
} // namespace paddle diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 1f51b5bab3..fbb2ac3fe8 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,4 +1,4 @@ -proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto) +proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool) py_proto_compile(profiler_py_proto SRCS profiler.proto) add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) @@ -36,7 +36,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce) -cc_library(place SRCS place.cc DEPS enforce boost) +cc_library(place SRCS place.cc DEPS enforce boost lib_any) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) add_subdirectory(dynload) diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index 2ce8f141d3..31b6c38d61 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -53,10 +53,12 @@ inline static int RoundToPowerOfTwo(int dim) { __VA_ARGS__; \ } break -#define CUDA_LAUNCH_KERNEL_HELPER(...) \ - CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ +#define CUDA_LAUNCH_KERNEL_HELPER(...) \ + CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); template <typename T> diff --git a/paddle/fluid/platform/ngraph_helper.h b/paddle/fluid/platform/ngraph_helper.h index c5b65d6636..5ee985ea71 100644 --- a/paddle/fluid/platform/ngraph_helper.h +++ b/paddle/fluid/platform/ngraph_helper.h @@ -23,6 +23,26 @@ limitations under the License. 
*/ namespace paddle { namespace platform { +std::shared_ptr<ngraph::Node> Nhwc2Nchw(std::shared_ptr<ngraph::Node> in) { + auto in_shape = in->get_shape(); + in_shape[0] = in->get_shape()[0]; + in_shape[1] = in->get_shape()[3]; + in_shape[2] = in->get_shape()[1]; + in_shape[3] = in->get_shape()[2]; + ngraph::AxisVector axis_vec = {0, 3, 1, 2}; + return std::make_shared<ngraph::op::Reshape>(in, axis_vec, in_shape); +} + +std::shared_ptr<ngraph::Node> Nchw2Nhwc(std::shared_ptr<ngraph::Node> in) { + auto in_shape = in->get_shape(); + in_shape[0] = in->get_shape()[0]; + in_shape[1] = in->get_shape()[2]; + in_shape[2] = in->get_shape()[3]; + in_shape[3] = in->get_shape()[1]; + ngraph::AxisVector axis_vec = {0, 2, 3, 1}; + return std::make_shared<ngraph::op::Reshape>(in, axis_vec, in_shape); +} + ngraph::Shape FlattenTo2d(ngraph::Shape sh, int num) { auto x1 = std::accumulate(std::begin(sh), std::begin(sh) + num, 1, std::multiplies<size_t>()); @@ -43,13 +63,14 @@ std::shared_ptr<ngraph::Node> NgReshaper(std::shared_ptr<ngraph::Node> input, std::shared_ptr<ngraph::Node> GetNode( const std::shared_ptr<paddle::framework::OperatorBase>& op, - const std::string prm, const paddle::framework::VariableNameMap& var_map, + const std::string name, const paddle::framework::VariableNameMap& var_map, std::shared_ptr< std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> ngb_node_map) { - auto& var_names = var_map.at(prm); + auto& var_names = var_map.at(name); PADDLE_ENFORCE_EQ(var_names.size(), 1, - "op %s prm %s expects one associated var", op->Type(), prm); + "op %s name %s expects one associated var", op->Type(), + name); if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) { return (*ngb_node_map)[var_names[0]]; } else { @@ -59,43 +80,53 @@ std::shared_ptr<ngraph::Node> GetNode( std::shared_ptr<ngraph::Node> GetInputNode( const std::shared_ptr<paddle::framework::OperatorBase>& op, - const std::string prm, + const std::string name, std::shared_ptr< std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> ngb_node_map) { - return GetNode(op, prm, op->Inputs(), ngb_node_map); + return GetNode(op, name, op->Inputs(), ngb_node_map); } std::shared_ptr<ngraph::Node> GetOutputNode( const std::shared_ptr<paddle::framework::OperatorBase>& op, - const std::string prm, + const std::string name, std::shared_ptr< std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> ngb_node_map) { - return GetNode(op, prm, op->Outputs(), ngb_node_map); + return GetNode(op, name, op->Outputs(), ngb_node_map); } void SetOutputNode( const std::shared_ptr<paddle::framework::OperatorBase>& op, - const std::string prm, std::shared_ptr<ngraph::Node> node, + const std::string name, std::shared_ptr<ngraph::Node> node, std::shared_ptr< std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> ngb_node_map) { - auto& var_names = op->Outputs().at(prm); + auto& var_names = op->Outputs().at(name); if (var_names.size() == 1) { + /* */ + auto dummy_out = GetOutputNode(op, name, ngb_node_map); + if (dummy_out && dummy_out->get_shape() != node->get_shape()) { + node = NgReshaper(node, dummy_out->get_shape()); + } + if (dummy_out && + dummy_out->get_element_type() != node->get_element_type()) { + node = std::make_shared<ngraph::op::Convert>( + node, dummy_out->get_element_type()); + } (*ngb_node_map)[var_names[0]] = node; } else if (var_names.size() == 0) { (*ngb_node_map)[""] = node; } else { - PADDLE_THROW("prm %s has more than 1 var_names.", prm); + PADDLE_THROW("name %s has more than 1 var_names.", name); } } 
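 // (Editorial note) With the guard above, SetOutputNode reconciles the node
 // being registered against any previously recorded output ("dummy") node:
 // shape mismatches are patched with NgReshaper and element-type mismatches
 // with ngraph::op::Convert. That centralizes the fix-up which top_k_op.h
 // used to do by hand (removed earlier in this patch), so individual op
 // builders no longer need their own Convert calls.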
 bool HasOutput(const std::shared_ptr<paddle::framework::OperatorBase>& op,
-               const std::string prm) {
+               const std::string name) {
   auto& outputs = op->Outputs();
-  if (outputs.find(prm) == outputs.end()) return false;
-  return outputs.at(prm).size() > 0;
+  if (outputs.find(name) == outputs.end()) return false;
+  return outputs.at(name).size() > 0;
 }
 
 inline void GetMidDims(const ngraph::Shape& x_shape,
diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc
index 655ce8485d..60b2d83f15 100644
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -14,6 +14,12 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/place.h"
 
+DEFINE_bool(benchmark, false,
+            "Doing memory benchmark. It will make deleting scope "
+            "synchronized, and add some memory usage logs. "
+            "By default CUDA is an asynchronous device; setting this to True "
+            "will force ops to run in synchronous mode.");
+
 namespace paddle {
 namespace platform {
 
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 803ea6b260..4ac5b83c56 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -26,5 +26,5 @@ if(WITH_PYTHON)
   get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
   target_link_libraries(paddle_pybind ${os_dependency_modules})
 
-  cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python)
+  cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python pybind)
 endif(WITH_PYTHON)
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 39e47be606..7db2bb451b 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -74,12 +74,12 @@ void BindPaddleBuf(py::module *m) {
     .def(py::init([](std::vector<float> &data) {
       auto buf = PaddleBuf(data.size() * sizeof(float));
       std::memcpy(buf.data(), static_cast<void *>(data.data()), buf.length());
-      return std::move(buf);
+      return buf;
     }))
     .def(py::init([](std::vector<int64_t> &data) {
       auto buf = PaddleBuf(data.size() * sizeof(int64_t));
       std::memcpy(buf.data(), static_cast<void *>(data.data()), buf.length());
-      return std::move(buf);
+      return buf;
     }))
     .def("resize", &PaddleBuf::Resize)
     .def("reset",
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index adbe1dcf4d..a4a01ad647 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -37,6 +37,7 @@ limitations under the License. 
*/
 #include "paddle/fluid/framework/version.h"
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
+#include "paddle/fluid/memory/allocation/legacy_allocator.h"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/py_func_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
@@ -127,6 +128,13 @@ PYBIND11_MODULE(core, m) {
   m.add_object("_cleanup",
                py::capsule([]() { ScopePool::Instance().Clear(); }));
 
+  m.def("get_mem_usage", [](int device) {
+    return memory::allocation::GPUMemMonitor.GetMemUsage(device);
+  });
+
+  m.def("print_mem_usage",
+        []() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); });
+
   py::class_<imperative::VarBase>(m, "VarBase", R"DOC()DOC")
       // .def(py::init<>())
       .def(py::init<bool>(), py::arg("stop_gradient") = false)
@@ -287,6 +295,7 @@ PYBIND11_MODULE(core, m) {
       .def("_get_float_element", TensorGetElement<float>)
       .def("_set_double_element", TensorSetElement<double>)
       .def("_get_double_element", TensorGetElement<double>)
+      .def("_place", [](Tensor &self) { return self.place(); })
       .def("_dtype", [](Tensor &self) { return self.type(); });
 
   py::class_<LoDTensor, Tensor>(m, "LoDTensor", R"DOC(
@@ -665,6 +674,12 @@ All parameter, weight, gradient are variables in Paddle.
   py::class_<platform::Place>(m, "Place")
       .def(py::init<>())
+      .def("is_gpu_place",
+           [](platform::Place &self) { return platform::is_gpu_place(self); })
+      .def("gpu_device_id",
+           [](platform::Place &self) {
+             return boost::get<platform::CUDAPlace>(self).device;
+           })
       .def("set_place",
            [](platform::Place &self, const platform::CPUPlace &cpu_place) {
              self = cpu_place;
@@ -1082,9 +1097,9 @@ All parameter, weight, gradient are variables in Paddle.
           [](const BuildStrategy &self) { return self.is_distribution_; },
           [](BuildStrategy &self, bool b) { self.is_distribution_ = b; })
       .def_property(
-          "memory_early_delete",
-          [](const BuildStrategy &self) { return self.memory_early_delete_; },
-          [](BuildStrategy &self, bool b) { self.memory_early_delete_ = b; })
+          "enable_inplace",
+          [](const BuildStrategy &self) { return self.enable_inplace_; },
+          [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; })
       .def("_finalize_strategy_and_create_passes",
           [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
             return self.CreatePassesFromStrategy(true);
diff --git a/paddle/scripts/fast_install.sh b/paddle/scripts/fast_install.sh
new file mode 100644
index 0000000000..b960d0f00a
--- /dev/null
+++ b/paddle/scripts/fast_install.sh
@@ -0,0 +1,923 @@
+#!/bin/bash
+
+path='http://paddlepaddle.org/download?url='
+#release_version=`curl -s https://pypi.org/project/paddlepaddle/|grep -E "/project/paddlepaddle/"|grep "release"|awk -F '/' '{print $(NF-1)}'|head -1`
+release_version=1.2.0
+python_list=(
+"27"
+"35"
+"36"
+"37"
+)
+
+
+function use_cpu(){
+  while true
+  do
+    read -p "Install the CPU version of PaddlePaddle? (y/n)" cpu_option
+    cpu_option=`echo $cpu_option | tr 'A-Z' 'a-z'`
+    if [[ "$cpu_option" == "" || "$cpu_option" == "n" ]];then
+      echo "Exiting the installation..."
+      exit
+    else
+      GPU='cpu'
+      echo "The CPU version of PaddlePaddle will be installed"
+      break
+    fi
+  done
+}
+
+function checkLinuxCUDNN(){
+  echo
+  read -n1 -p "Press Enter to continue..."
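+  # Note (editorial): CUDNN_MAJOR is grepped straight out of cudnn.h below.
+  # That works for the cuDNN 5/7 releases this installer targets; since
+  # cuDNN 8 the version macros live in cudnn_version.h, so the same grep
+  # would come up empty there.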
+  echo
+  while true
+  do
+    version_file='/usr/local/cuda/include/cudnn.h'
+    if [ -f "$version_file" ];then
+      CUDNN=`cat $version_file | grep CUDNN_MAJOR |awk 'NR==1{print $NF}'`
+    fi
+    if [ "$CUDNN" == "" ];then
+      version_file=`sudo find /usr -name "cudnn.h"|head -1`
+      if [ "$version_file" != "" ];then
+        CUDNN=`cat ${version_file} | grep CUDNN_MAJOR -A 2|awk 'NR==1{print $NF}'`
+      else
+        echo "Detection result: cudnn.h was not found under the usual cuda/include path"
+        while true
+        do
+          read -p "Please verify where cudnn.h is located and enter its path here (note: the path must go all the way down to the cudnn.h file itself):" cudnn_version
+          echo
+          if [ "$cudnn_version" == "" ] || [ ! -f "$cudnn_version" ];then
+            read -p "cuDNN still not found. Enter y to install the CPU version of PaddlePaddle, or n to re-enter the cuDNN path (y/n)" cpu_option
+            echo
+            cpu_option=`echo $cpu_option | tr 'A-Z' 'a-z'`
+            if [ "$cpu_option" == "y" -o "$cpu_option" == "" ];then
+              GPU='cpu'
+              break
+            else
+              echo "Please enter it again"
+              echo
+            fi
+          else
+            CUDNN=`cat $cudnn_version | grep CUDNN_MAJOR |awk 'NR==1{print $NF}'`
+            echo "Detection result: cudnn.h found"
+            break
+          fi
+        done
+        if [ "$GPU" == "cpu" ];then
+          break
+        fi
+      fi
+    fi
+    if [ "$CUDA" == "9" -a "$CUDNN" != "7" ];then
+      echo
+      echo "Only cuDNN7 is supported with CUDA9 at the moment; cuDNN${CUDNN} on your machine is not supported yet. You can download a suitable cuDNN from the NVIDIA website, and press Ctrl+C to quit this installer. Pressing Enter will install the CPU version of PaddlePaddle instead"
+      echo
+      use_cpu
+      if [ "$GPU" == "cpu" ];then
+        break
+      fi
+    fi
+
+    if [ "$CUDNN" == 5 ] || [ "$CUDNN" == 7 ];then
+      echo
+      echo "Your cuDNN version is: cuDNN$CUDNN"
+      break
+    else
+      echo
+      read -n1 -p "Supported cuDNN versions are 5 and 7; cuDNN${CUDNN} on your machine is not supported yet. The CPU version of PaddlePaddle will be installed; press Enter to start"
+      echo
+      use_cpu
+      if [ "$GPU" == "cpu" ];then
+        break
+      fi
+    fi
+  done
+}
+
+function checkLinuxCUDA(){
+  while true
+  do
+    CUDA=`echo ${CUDA_VERSION}|awk -F "[ .]" '{print $1}'`
+    if [ "$CUDA" == "" ];then
+      if [ -f "/usr/local/cuda/version.txt" ];then
+        CUDA=`cat /usr/local/cuda/version.txt | grep 'CUDA Version'|awk -F '[ .]' '{print $3}'`
+        tmp_cuda=$CUDA
+      fi
+      if [ -f "/usr/local/cuda8/version.txt" ];then
+        CUDA=`cat /usr/local/cuda8/version.txt | grep 'CUDA Version'|awk -F '[ .]' '{print $3}'`
+        tmp_cuda8=$CUDA
+      fi
+      if [ -f "/usr/local/cuda9/version.txt" ];then
+        CUDA=`cat /usr/local/cuda9/version.txt | grep 'CUDA Version'|awk -F '[ .]' '{print $3}'`
+        tmp_cuda9=$CUDA
+      fi
+    fi
+
+    if [ "$tmp_cuda" != "" ];then
+      echo "Detection result: found CUDA $tmp_cuda"
+    fi
+    if [ "$tmp_cuda8" != "" ];then
+      echo "Detection result: found CUDA $tmp_cuda8"
+    fi
+    if [ "$tmp_cuda9" != "" ];then
+      echo "Detection result: found CUDA $tmp_cuda9"
+    fi
+
+    if [ "$CUDA" == "" ];then
+      echo "Detection result: cuda/version.txt was not found under the usual paths"
+      while true
+      do
+        read -p "Please enter the path to cuda/version.txt:" cuda_version
+        if [ "$cuda_version" == "" ] || [ ! -f "$cuda_version" ];then
+          read -p "CUDA still not found. Enter y to install the CPU version of PaddlePaddle, or n to re-enter the CUDA path (y/n)" cpu_option
+          cpu_option=`echo $cpu_option | tr 'A-Z' 'a-z'`
+          if [ "$cpu_option" == "y" ] || [ "$cpu_option" == "" ];then
+            GPU='cpu'
+            break
+          else
+            echo "Enter it again..."
+          fi
+        else
+          CUDA=`cat $cuda_version | grep 'CUDA Version'|awk -F '[ .]' '{print $3}'`
+          if [ "$CUDA" == "" ];then
+            echo "No CUDA version information was found in version.txt"
+          else
+            break
+          fi
+        fi
+      done
+      if [ "$GPU" == "cpu" ];then
+        break
+      fi
+    fi
+
+    if [ "$CUDA" == "8" ] || [ "$CUDA" == "9" ];then
+      echo "Your CUDA version is ${CUDA}"
+      break
+    else
+      echo "CUDA8/9 are supported at the moment; your CUDA${CUDA} is not supported yet. The CPU version of PaddlePaddle will be installed"
+      echo
+      use_cpu
+    fi
+
+    if [ "$GPU" == "cpu" ];then
+      break
+    fi
+  done
+}
+
+function checkLinuxMathLibrary(){
+  while true
+  do
+    if [ "$AVX" == "" ];then
+      echo "Checking whether the AVX instruction set is available in your environment..."
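+      # Note (editorial): $AVX is populated by checkLinuxGPU further below
+      # (via `cat /proc/cpuinfo | grep avx`), so this function assumes
+      # checkLinuxGPU has already run; $AVX is empty on CPUs without AVX.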
+      echo
+      echo "Check result: this machine has no AVX instruction set. For non-AVX environments we only provide PaddlePaddle built against the MKL math library, so that version will be installed"
+      math='mkl'
+      break
+    elif [ "$GPU" == "gpu" ];then
+      math='mkl'
+      echo "A GPU was detected on this machine; the MKL math library is recommended"
+      break
+    else
+      read -p "Please choose the math library to use:
+        1: openblas          a high-performance multi-core BLAS library
+        2: mkl (recommended) the Intel Math Kernel Library
+        => Enter 1 or 2. Any other input, or just pressing Enter, selects the default [ 2. mkl ]. Type your choice here and press Enter: " math
+      if [ "$math" == "" ];then
+          math="mkl"
+          echo "You selected [2]"
+          break
+      fi
+      if [ "$math" == "1" ];then
+          math=openblas
+          echo "You selected [1]"
+          break
+      elif [ "$math" == "2" ];then
+          math=mkl
+          echo "You selected [2]"
+          break
+      fi
+      echo "Invalid input, please try again"
+    fi
+  done
+}
+
+function checkLinuxPaddleVersion(){
+  read -n1 -p "Press Enter to continue..."
+  while true
+  do
+    read -p "
+      1. Develop build: tracks the develop branch on GitHub; choose this if you develop PaddlePaddle itself or want its newest features
+      2. Stable release (recommended): choose this if you have no special development needs; the latest release is ${release_version}
+      => Enter 1 or 2. Any other input, or just pressing Enter, selects the default [ 2. stable ]. Type your choice here and press Enter: " paddle_version
+    if [ "$paddle_version" == "" ];then
+        paddle_version="release-${release_version}"
+        echo "You selected [2]; release-${release_version} will be installed"
+        break
+    fi
+    if [ "$paddle_version" == "1" ];then
+        echo "You selected [1]; the develop build will be installed"
+        break
+    elif [ "$paddle_version" == "2" ];then
+        echo "You selected [2]; release-${release_version} will be installed"
+        break
+    fi
+    echo "Invalid input, please try again"
+  done
+}
+
+function checkLinuxPip(){
+  while true
+  do
+    echo "Please enter the pip executable you want to use (you can run \`which pip\` in another terminal to find it):"
+    read -p "" pip_path
+    if [ "$pip_path" == "" -o ! -f "$pip_path" ];then
+        echo "Check result: pip not found, please try again"
+        continue
+    fi
+    python_version=`$pip_path --version|awk -F "[ |)]" '{print $6}'|sed 's#\.##g'`
+    if [ "$python_version" == "27" ];then
+        uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27mu"`
+        if [[ "$uncode" == "" ]];then
+            uncode=
+        else
+            uncode=u
+        fi
+    fi
+    if [ "$python_version" == "" ];then
+        echo "Check result: pip not found, please try again"
+    else
+        version_list=`echo "${python_list[@]}" | grep "$python_version" `
+        if [ "$version_list" != "" ];then
+            echo "Check result: found Python ${python_version}"
+            break
+        else
+            echo "Check result: no usable pip found. Only Python 2.7/3.5/3.6/3.7 and their matching pip are supported. Please try again, or press ctrl + c to quit"
+        fi
+    fi
+  done
+}
+
+function checkLinuxAVX(){
+  while true
+  do
+    if [[ "$AVX" != "" ]];then
+        AVX="avx"
+        break
+    else
+        if [ "$CUDA" == "8" -a "$CUDNN" == "7" ] || [ "$GPU" == "cpu" ];then
+            AVX="noavx"
+            break
+        else
+            echo "Step 6. Checking for AVX"
+            echo
+            echo "Check result: AVX not found. We only provide packages for CPU, or for GPU configured as CUDA 8 with cuDNN 7"
+            break
+        fi
+    fi
+  done
+}
+
+function PipLinuxInstall(){
+  wheel_cpu_release="http://paddle-wheel.bj.bcebos.com/${release_version}-${GPU}-${AVX}-${math}/paddlepaddle-${release_version}-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
+  wheel_gpu_release="http://paddle-wheel.bj.bcebos.com/${release_version}-gpu-cuda${CUDA}-cudnn${CUDNN}-${AVX}-${math}/paddlepaddle_gpu-${release_version}.post${CUDA}${CUDNN}-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
+  wheel_gpu_release_noavx="http://paddle-wheel.bj.bcebos.com/${release_version}-gpu-cuda${CUDA}-cudnn${CUDNN}-${AVX}-${math}/paddlepaddle_gpu-${release_version}-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
+  wheel_cpu_develop="http://paddle-wheel.bj.bcebos.com/latest-cpu-${AVX}-${math}/paddlepaddle-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
+  wheel_gpu_develop="http://paddle-wheel.bj.bcebos.com/latest-gpu-cuda${CUDA}-cudnn${CUDNN}-${AVX}-${math}/paddlepaddle_gpu-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
+
+
+  if [[ "$paddle_version" == "2" ]];then
+    if [[ "$GPU" == "gpu" ]];then
+        if [[ ${AVX} == "avx" ]];then
+            rm -rf `echo $wheel_gpu_release|awk -F '/' '{print $NF}'`
+            wget -q $wheel_gpu_release
+            if [ "$?" == "0" ];then
+                $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release
+            else
+                echo "Failed to download the paddlepaddle whl package"
+                exit 1
+            fi
+        else
+            rm -rf `echo $wheel_gpu_release_noavx|awk -F '/' '{print $NF}'`
+            wget -q $wheel_gpu_release_noavx
+            if [ "$?" == "0" ];then
+                $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release_noavx
+            else
+                echo "Failed to download the paddlepaddle whl package"
+                exit 1
+            fi
+        fi
+    else
+        rm -rf `echo $wheel_cpu_release|awk -F '/' '{print $NF}'`
+        wget -q $wheel_cpu_release
+        if [ "$?" == "0" ];then
+            $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_release
+        else
+            echo "Failed to download the paddlepaddle whl package"
+            exit 1
+        fi
+    fi
+  else
+    if [[ "$GPU" == "gpu" ]];then
+        rm -rf `echo $wheel_gpu_develop|awk -F '/' '{print $NF}'`
+        wget -q $wheel_gpu_develop
+        if [ "$?" == "0" ];then
+            $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_develop
+        else
+            echo "Failed to download the paddlepaddle whl package"
+            exit 1
+        fi
+    else
+        rm -rf `echo $wheel_cpu_develop|awk -F '/' '{print $NF}'`
+        wget -q $wheel_cpu_develop
+        if [ "$?" == "0" ];then
+            $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_develop
+        else
+            echo "Failed to download the paddlepaddle whl package"
+            exit 1
+        fi
+    fi
+  fi
+}
+
+
+function checkLinuxGPU(){
+  read -n1 -p "About to check whether this machine has a GPU; press Enter to continue..."
+  echo
+  AVX=`cat /proc/cpuinfo |grep avx|tail -1|grep avx`
+  which nvidia-smi >/dev/null 2>&1
+  if [ "$?" != "0" ];then
+      GPU='cpu'
+      echo "No GPU was found on this machine, or PaddlePaddle does not yet support this GPU model"
+  else
+      GPU='gpu'
+      echo "A GPU was found on this machine; now confirming the CUDA and cuDNN versions..."
+ echo + fi + if [ "$GPU" == 'gpu' ];then + checkLinuxCUDA + checkLinuxCUDNN + fi +} + +function linux(){ +gpu_list=( +"GeForce 410M" +"GeForce 610M" +"GeForce 705M" +"GeForce 710M" +"GeForce 800M" +"GeForce 820M" +"GeForce 830M" +"GeForce 840M" +"GeForce 910M" +"GeForce 920M" +"GeForce 930M" +"GeForce 940M" +"GeForce GT 415M" +"GeForce GT 420M" +"GeForce GT 430" +"GeForce GT 435M" +"GeForce GT 440" +"GeForce GT 445M" +"GeForce GT 520" +"GeForce GT 520M" +"GeForce GT 520MX" +"GeForce GT 525M" +"GeForce GT 540M" +"GeForce GT 550M" +"GeForce GT 555M" +"GeForce GT 610" +"GeForce GT 620" +"GeForce GT 620M" +"GeForce GT 625M" +"GeForce GT 630" +"GeForce GT 630M" +"GeForce GT 635M" +"GeForce GT 640" +"GeForce GT 640 (GDDR5)" +"GeForce GT 640M" +"GeForce GT 640M LE" +"GeForce GT 645M" +"GeForce GT 650M" +"GeForce GT 705" +"GeForce GT 720" +"GeForce GT 720M" +"GeForce GT 730" +"GeForce GT 730M" +"GeForce GT 735M" +"GeForce GT 740" +"GeForce GT 740M" +"GeForce GT 745M" +"GeForce GT 750M" +"GeForce GTS 450" +"GeForce GTX 1050" +"GeForce GTX 1060" +"GeForce GTX 1070" +"GeForce GTX 1080" +"GeForce GTX 1080 Ti" +"GeForce GTX 460" +"GeForce GTX 460M" +"GeForce GTX 465" +"GeForce GTX 470" +"GeForce GTX 470M" +"GeForce GTX 480" +"GeForce GTX 480M" +"GeForce GTX 485M" +"GeForce GTX 550 Ti" +"GeForce GTX 560M" +"GeForce GTX 560 Ti" +"GeForce GTX 570" +"GeForce GTX 570M" +"GeForce GTX 580" +"GeForce GTX 580M" +"GeForce GTX 590" +"GeForce GTX 650" +"GeForce GTX 650 Ti" +"GeForce GTX 650 Ti BOOST" +"GeForce GTX 660" +"GeForce GTX 660M" +"GeForce GTX 660 Ti" +"GeForce GTX 670" +"GeForce GTX 670M" +"GeForce GTX 670MX" +"GeForce GTX 675M" +"GeForce GTX 675MX" +"GeForce GTX 680" +"GeForce GTX 680M" +"GeForce GTX 680MX" +"GeForce GTX 690" +"GeForce GTX 750" +"GeForce GTX 750 Ti" +"GeForce GTX 760" +"GeForce GTX 760M" +"GeForce GTX 765M" +"GeForce GTX 770" +"GeForce GTX 770M" +"GeForce GTX 780" +"GeForce GTX 780M" +"GeForce GTX 780 Ti" +"GeForce GTX 850M" +"GeForce GTX 860M" +"GeForce GTX 870M" +"GeForce GTX 880M" +"GeForce GTX 950" +"GeForce GTX 950M" +"GeForce GTX 960" +"GeForce GTX 960M" +"GeForce GTX 965M" +"GeForce GTX 970" +"GeForce GTX 970M" +"GeForce GTX 980" +"GeForce GTX 980M" +"GeForce GTX 980 Ti" +"GeForce GTX TITAN" +"GeForce GTX TITAN Black" +"GeForce GTX TITAN X" +"GeForce GTX TITAN Z" +"Jetson TK1" +"Jetson TX1" +"Jetson TX2" +"Mobile Products" +"NVIDIA NVS 310" +"NVIDIA NVS 315" +"NVIDIA NVS 510" +"NVIDIA NVS 810" +"NVIDIA TITAN V" +"NVIDIA TITAN X" +"NVIDIA TITAN Xp" +"NVS 4200M" +"NVS 5200M" +"NVS 5400M" +"Quadro 410" +"Quadro GP100" +"Quadro K1100M" +"Quadro K1200" +"Quadro K2000" +"Quadro K2000D" +"Quadro K2100M" +"Quadro K2200" +"Quadro K2200M" +"Quadro K3100M" +"Quadro K4000" +"Quadro K4100M" +"Quadro K420" +"Quadro K4200" +"Quadro K4200M" +"Quadro K5000" +"Quadro K500M" +"Quadro K5100M" +"Quadro K510M" +"Quadro K5200" +"Quadro K5200M" +"Quadro K600" +"Quadro K6000" +"Quadro K6000M" +"Quadro K610M" +"Quadro K620" +"Quadro K620M" +"Quadro M1000M" +"Quadro M1200" +"Quadro M2000" +"Quadro M2000M" +"Quadro M2200" +"Quadro M3000M" +"Quadro M4000" +"Quadro M4000M" +"Quadro M5000" +"Quadro M5000M" +"Quadro M500M" +"Quadro M520" +"Quadro M5500M" +"Quadro M6000" +"Quadro M6000 24GB" +"Quadro M600M" +"Quadro M620" +"Quadro Mobile Products" +"Quadro P1000" +"Quadro P2000" +"Quadro P3000" +"Quadro P400" +"Quadro P4000" +"Quadro P5000" +"Quadro P600" +"Quadro P6000" +"Quadro Plex 7000" +"Tegra K1" +"Tegra X1" +"Tesla C2050/C2070" +"Tesla C2075" +"Tesla Data Center Products" +"Tesla K10" +"Tesla K20" 
+"Tesla K40" +"Tesla K80" +"Tesla M40" +"Tesla M60" +"Tesla P100" +"Tesla P4" +"Tesla P40" +"Tesla V100") + + echo "Step 2. 检测GPU型号和CUDA/cuDNN版本" + echo + checkLinuxGPU + echo + echo "Step 3. 检测数学库" + echo + checkLinuxMathLibrary + echo + echo "Step 4. 选择要安装的PaddlePaddle版本" + echo + checkLinuxPaddleVersion + echo + echo "Step 5. 检测pip版本" + echo + checkLinuxPip + echo + checkLinuxAVX + echo "*********************2. 开始安装*****************************" + PipLinuxInstall +} + +function checkMacPython2(){ + while true + do + read -p " + => 未能在常规路径下找到Python2,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载安装Python2(注意Python版本不能低于2.7.15) + 如希望自定义Python路径,请输入路径:" python_root + echo + python_version=`$python_root --version 2>&1 1>&1` + if [ $? == "0" ];then + : + else + python_version="" + fi + check_python=`echo $python_version | grep "Python 2"` + if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ] ;then + python_version="" + elif [ -n "$check_python" ];then + while true + do + read -p " + => 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: " use_python + echo + use_python=`echo $use_python | tr 'A-Z' 'a-z'` + if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then + use_python="y" + break + elif [ "$use_python" == "n" ];then + python_root="" + break + else + echo "输入错误,请重新输入(y/n)" + fi + done + if [ "$use_python" == "y" ];then + break + fi + else + echo "您输入Python的不是Python2" + python_version="" + fi + done +} + +function checkMacPython3(){ + while true + do + read -p " + => 未能在常规路径下找到Python3,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载Python3 + 如希望自定义Python路径,请输入路径:" python_root + python_version=`$python_root --version 2>&1 1>&1` + if [ $? == "0" ];then + : + else + python_version="" + fi + check_python=`echo $python_version | grep "Python 3"` + if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ] ;then + python_version="" + elif [ -n "$check_python" ] ;then + while true + do + read -p " + => 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: " use_python + echo + use_python=`echo $use_python | tr 'A-Z' 'a-z'` + if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then + use_python="y" + break + elif [ "$use_python" == "n" ];then + python_root="" + break + else + echo "输入错误,请重新输入(y/n)" + fi + done + if [ "$use_python" == "y" ];then + break + fi + else + echo "您输入Python的不是Python3" + python_version="" + fi + done +} + +function checkMacPaddleVersion(){ + while true + do + read -n1 -p "Step 2. 选择PaddlePaddle的版本,请按回车键继续..." + echo + read -p " + 1. 开发版:对应Github上develop分支,如您需要开发、或希望使用PaddlePaddle最新功能,请选用此版本 + 2. 稳定版(推荐):如您无特殊开发需求,建议使用此版本,目前最新的版本号为 ${release_version} + + => 请输入数字1或2。如输入其他字符或直接回车,将会默认选择【 2. 稳定版 】 。请在这里输入并回车:" paddle_version + if [ "$paddle_version" == "1" ]||[ "$paddle_version" == "2" ];then + echo + echo "您选择了数字【"$paddle_version" 】" + echo + break + else + paddle_version="2" + echo + echo "您选择了数字【2】" + echo + break + fi + done +} + +function checkMacPythonVersion(){ + while true + do + read -n1 -p "Step 3. 选择Python版本,请按回车键继续..." + read -p " + 2. 使用python 2.x + 3. 使用python 3.x + + => 请输入数字2或3。如输入其他字符或直接回车,将会默认使用【Python 2 】。请在这里输入并回车:" python_V + echo + if [ "$python_V" == "" ];then + python_V="2" + fi + echo "您选择了数字【"$python_V"】,正在寻找符合您要求的Python版本,请按回车键继续..." 
+        echo
+        if [ "$python_V" == "2" ];then
+            python_root=`which python2.7`
+            if [ "$python_root" == "" ];then
+                python_root=`which python`
+            fi
+            python_version=`$python_root --version 2>&1 1>&1`
+            if [ $? == "0" ];then
+                :
+            else
+                python_version=""
+            fi
+            if [ "$python_root" == "" ]||[ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ]||[ "$python_root" == "/usr/bin/python2.7" -a "$python_version" == "Python 2.7.10" ];then
+                checkMacPython2
+            fi
+            while true
+            do
+                read -p "
+    => Found $python_version in your environment. Enter y to use this version, or n to supply a custom Python path. Enter (y/n) here and press Enter:" use_python
+                echo
+                use_python=`echo $use_python | tr 'A-Z' 'a-z'`
+                if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
+                    break
+                elif [ "$use_python" == "n" ];then
+                    python_root=""
+                    checkMacPython2
+                    break
+                else
+                    echo "Invalid input, please enter y or n"
+                fi
+            done
+
+        elif [ "$python_V" == "3" ];then
+            python_root=`which python3`
+            python_version=`$python_root --version 2>&1 1>&1`
+            if [ $? == "0" ];then
+                :
+            else
+                python_version=""
+            fi
+            if [ "$python_root" == "" ]||[ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ];then
+                checkMacPython3
+            fi
+            while true
+            do
+                read -p "
+    => Found $python_version in your environment. Enter y to use this version, or n to supply a custom Python path. Enter (y/n) here and press Enter:" use_python
+                echo
+                use_python=`echo $use_python | tr 'A-Z' 'a-z'`
+                if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
+                    break
+                elif [ "$use_python" == "n" ];then
+                    checkMacPython3
+                    break
+                else
+                    echo "Invalid input, please enter y or n"
+                fi
+            done
+        else
+            :
+        fi
+
+
+        if [ "$python_V" == "2" ]||[ "$python_V" == "3" ];then
+            python_brief_version=`$python_root -m pip -V |awk -F "[ |)]" '{print $6}'|sed 's#\.##g'`
+            if [[ $python_brief_version == "27" ]];then
+                uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27"`
+                if [[ $uncode == "" ]];then
+                    uncode="mu"
+                else
+                    uncode="m"
+                fi
+            fi
+            version_list=`echo "${python_list[@]}" | grep "$python_brief_version" `
+            if [ "$version_list" != "" ];then
+                break
+            else
+                echo "No usable pip or pip3 was found. PaddlePaddle currently supports Python 2.7/3.5/3.6/3.7 and their matching pip; please try again, or press ctrl + c to quit"
+            fi
+        else
+            echo "Invalid input, please try again"
+        fi
+    done
+}
+
+function checkMacAVX(){
+    read -n1 -p "Step 4. Checking whether this Mac supports the AVX instruction set; press Enter to continue..."
+    echo
+    if [[ $AVX != "" ]];then
+        AVX="avx"
+        echo "Check result: supported"
+    else
+        read -n1 -p "Check result: not supported. Unfortunately PaddlePaddle does not yet ship no-AVX packages for macOS; you can install the no-AVX build of PaddlePaddle on Linux instead. Press Enter to exit..."
+        exit
+    fi
+    echo
+}
+
+function checkMacGPU(){
+    read -n1 -p "Step 5. Choosing the CPU/GPU version; press Enter to continue..."
+    echo
+    if [[ $GPU != "" ]];then
+        echo "GPU packages of PaddlePaddle are not yet available for macOS; the CPU version will be installed"
+    else
+        echo "GPU packages of PaddlePaddle are not yet available for macOS; the CPU version will be installed"
+        GPU=cpu
+    fi
+    echo
+}
+
+function macos() {
+    path='http://paddlepaddle.org/download?url='
+    AVX=`sysctl -a | grep cpu | grep AVX1.0 | tail -1 | grep AVX`
+
+    while true
+    do
+        checkMacPaddleVersion
+        checkMacPythonVersion
+        checkMacAVX
+        checkMacGPU
+
+
+        echo "*********************2. Starting installation*****************************"
+        echo
+        read -n1 -p "About to download and install PaddlePaddle; press Enter to continue..."
+        echo
+        if [[ $paddle_version == "2" ]];then
+            $python_root -m pip install paddlepaddle
+            if [ $? == "0" ];then
== "0" ];then + echo "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器" + break + else + rm $whl_cpu_release + echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用" + echo"" + echo "==========================================================================================" + echo"" + exit 1 + fi + else + if [ -f $whl_cpu_develop ];then + $python_root -m pip install $whl_cpu_develop + if [ $? == "0" ];then + rm -rf $whl_cpu_develop + echo "安装成功!小提示:可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器" + break + else + echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用" + echo"" + echo "==========================================================================================" + echo"" + exit 1 + fi + else + wget ${path}$whl_cpu_develop -O $whl_cpu_develop + if [ $? == "0" ];then + $python_root -m pip install $whl_cpu_develop + if [ $? == "0" ];then + rm $wheel_cpu_develop + echo "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器" + break + else + rm $whl_cpu_release + echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用" + echo"" + echo "==========================================================================================" + echo"" + exit 1 + fi + else + rm $whl_cpu_develop + echo "未能正常安装PaddlePaddle,请检查您的网络 或者确认您是否安装有 wget,或者ctrl + c退出后反馈至https://github.com/PaddlePaddle/Paddle/issues" + echo"" + echo "==========================================================================================" + echo"" + exit 1 + fi + fi + fi + done +} + +function main() { + echo "*********************************" + echo "欢迎使用PaddlePaddle快速安装脚本" + echo "*********************************" + echo + echo "如果您在安装过程中遇到任何问题,请在https://github.com/PaddlePaddle/Paddle/issues反馈,我们的工作人员将会帮您答疑解惑" + echo + echo "本安装包将帮助您在Linux或Mac系统下安装PaddlePaddle,包括 1)安装前的准备和 2)开始安装 两部分" + echo + read -n1 -p "请按回车键进行下一步..." + echo + echo + echo "*********************1. 安装前的准备*****************************" + echo + echo "Step 1. 正在检测您的操作系统信息..." 
+    echo
+    SYSTEM=`uname -s`
+    if [ "$SYSTEM" == "Darwin" ];then
+        echo "Your system is: macOS"
+        echo
+        macos
+    else
+        echo "Your system is: Linux"
+        echo
+        OS=`cat /etc/issue|awk 'NR==1 {print $1}'`
+        if [ "$OS" == "\S" ] || [ "$OS" == "CentOS" ] || [ "$OS" == "Ubuntu" ];then
+            linux
+        else
+            echo "Your system is not supported by this installer. To install PaddlePaddle on Windows, please follow the Windows installation guide on the PaddlePaddle website"
+        fi
+    fi
+}
+main
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 4cdf96efbd..bcc997ff45 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -54,7 +54,7 @@ ELSE(WIN32)
     DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 ENDIF()

-set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS})
+set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS} ${external_project_dependencies})
 add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})

 set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 53746afdb2..fe2ae67ec6 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -25,4 +25,5 @@ import paddle.reader
 import paddle.dataset
 import paddle.batch
 import paddle.compat
+import paddle.distributed
 batch = batch.batch
diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py
new file mode 100644
index 0000000000..d0c32e2609
--- /dev/null
+++ b/python/paddle/distributed/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tools/run_mp.py b/python/paddle/distributed/launch.py
similarity index 83%
rename from tools/run_mp.py
rename to python/paddle/distributed/launch.py
index 2485400ab8..03c4078775 100644
--- a/tools/run_mp.py
+++ b/python/paddle/distributed/launch.py
@@ -37,7 +37,7 @@ default_envs = {

 GPUS = 8

-def start_procs(gpus, cmd, log_dir):
+def start_procs(gpus, entrypoint, entrypoint_args, log_dir):
     procs = []
     log_fns = []
     os.system("mkdir -p %s" % log_dir)
@@ -73,12 +73,11 @@ def start_procs(gpus, cmd, log_dir):
             "PADDLE_TRAINER_ENDPOINTS": all_nodes_devices_endpoints
         })

-        print("starting process ", i, cmd, curr_env)
+        print("starting process ", i, entrypoint, entrypoint_args, curr_env)
         fn = open("%s/workerlog.%d" % (log_dir, i), "w")
         log_fns.append(fn)
-        procs.append(
-            subprocess.Popen(
-                cmd.strip().split(" "), stdout=fn, stderr=fn, env=curr_env))
+        cmd = [sys.executable, "-u", entrypoint] + entrypoint_args
+        procs.append(subprocess.Popen(cmd, stdout=fn, stderr=fn, env=curr_env))

     for i in range(gpus):
         try:
@@ -89,7 +88,8 @@ def start_procs(gpus, cmd, log_dir):
             pass


-def main():
+def parse_args():
+
     parser = argparse.ArgumentParser(
         description='''start paddle training using multi-process mode.
NOTE: your train program ***must*** run as distributed nccl2 mode, @@ -108,21 +108,27 @@ POD_IP (current node ip address, not needed for local training) type=int, default=8, help='start number of processes for every gpu') - parser.add_argument( - '--cmd', - type=str, - default="", - help='command to run for each process, e.g. python train.py --lr 0.1') parser.add_argument( '--log_dir', type=str, default="mylog", help='directory to put logs per process.') - args = parser.parse_args() - if args.cmd == "": - parser.print_help() - exit(0) - start_procs(args.gpus, args.cmd, args.log_dir) + parser.add_argument( + 'entrypoint_script', + type=str, + help="The entrypoint script to be launched in parallel," + "followed by all the arguments for each process," + "e.g. train.py --lr 0.1") + parser.add_argument('entrypoint_args', nargs=argparse.REMAINDER) + return parser.parse_args() + + +def main(): + args = parse_args() + + # launch multiple training process + start_procs(args.gpus, args.entrypoint_script, args.entrypoint_args, + args.log_dir) if __name__ == "__main__": diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 564882bd2a..aa1f85734d 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -158,9 +158,9 @@ def __bootstrap__(): 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', 'sync_nccl_allreduce', 'limit_of_tmp_allocation', - 'times_excess_than_required_tmp_allocation' + 'times_excess_than_required_tmp_allocation', + 'enable_inplace_whitelist' ] - core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)]) core.init_glog(sys.argv[0]) diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index a35a4c5983..ef02429428 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -174,6 +174,11 @@ class CompiledProgram(object): self._exec_strategy.num_threads = cpu_num * 2 trainers_endpoints = self._program._trainers_endpoints + + # FIXME(dzhwinter): enable_inplace should be after memory_optimize + # if turn on python memory optimize, turn off the inplace_pass. + self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True + if self._build_strategy.num_trainers > 1 and trainers_endpoints: assert self._build_strategy.num_trainers == len( trainers_endpoints), "num_trainers == len(end_points)" diff --git a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py index f2b7ac8375..5854cadb58 100644 --- a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py +++ b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py @@ -22,7 +22,7 @@ This API is still under active development and may change drastically. from __future__ import print_function -import contextlib +from ...wrapped_decorator import signature_safe_contextmanager import numpy as np import six @@ -419,7 +419,7 @@ class TrainingDecoder(object): self._state_cell = state_cell self._state_cell._enter_decoder(self) - @contextlib.contextmanager + @signature_safe_contextmanager def block(self): """ Define the behavior of the decoder for each RNN time step. @@ -613,7 +613,7 @@ class BeamSearchDecoder(object): self._word_dim = word_dim self._input_var_dict = input_var_dict - @contextlib.contextmanager + @signature_safe_contextmanager def block(self): """ Define the behavior of the decoder for each RNN time step. 
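Two notes on the hunks above. First, launch.py now takes a positional entrypoint script plus `argparse.REMAINDER` arguments instead of a single `--cmd` string, and starts each worker as `sys.executable -u entrypoint args...`; the module is presumably invoked along the lines of `python -m paddle.distributed.launch --log_dir mylog train.py --lr 0.1` (flags per the parser defined above). Second, the `signature_safe_contextmanager` imports that begin here and recur through the rest of this diff swap `contextlib.contextmanager` for a signature-preserving wrapper: a plain `contextlib.contextmanager` exposes `(*args, **kwds)` to `inspect`, which breaks signature-based tooling. A minimal sketch of such a wrapper, assuming the third-party `decorator` package (an illustration of the idea, not necessarily Paddle's wrapped_decorator module verbatim):

    import contextlib
    import decorator


    def wrap_decorator(decorator_func):
        # decorator.decorator copies the wrapped function's signature
        # onto the wrapper, unlike functools.wraps alone.
        @decorator.decorator
        def __impl__(func, *args, **kwargs):
            wrapped_func = decorator_func(func)
            return wrapped_func(*args, **kwargs)

        return __impl__


    signature_safe_contextmanager = wrap_decorator(contextlib.contextmanager)

It is used exactly like `contextlib.contextmanager`, but `inspect.signature` on the decorated function still reports the real parameters.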
diff --git a/python/paddle/fluid/contrib/inferencer.py b/python/paddle/fluid/contrib/inferencer.py index b8d5f4ffea..4f37129234 100644 --- a/python/paddle/fluid/contrib/inferencer.py +++ b/python/paddle/fluid/contrib/inferencer.py @@ -14,7 +14,7 @@ from __future__ import print_function -import contextlib +from ..wrapped_decorator import signature_safe_contextmanager from .. import core @@ -105,7 +105,7 @@ class Inferencer(object): return results - @contextlib.contextmanager + @signature_safe_contextmanager def _prog_and_scope_guard(self): with framework.program_guard(main_program=self.inference_program): with executor.scope_guard(self.scope): diff --git a/python/paddle/fluid/contrib/trainer.py b/python/paddle/fluid/contrib/trainer.py index 8569e486f9..d27b808438 100644 --- a/python/paddle/fluid/contrib/trainer.py +++ b/python/paddle/fluid/contrib/trainer.py @@ -14,7 +14,7 @@ from __future__ import print_function -import contextlib +from ..wrapped_decorator import signature_safe_contextmanager import os import errno import shutil @@ -453,7 +453,7 @@ class Trainer(object): io.save_inference_model(param_path, feeded_var_names, target_vars, exe) - @contextlib.contextmanager + @signature_safe_contextmanager def _prog_and_scope_guard(self): with framework.program_guard( main_program=self.train_program, diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index d3ff14a179..8815911eae 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -17,7 +17,7 @@ from __future__ import print_function import os import multiprocessing import numpy as np -import contextlib +from .wrapped_decorator import signature_safe_contextmanager import six from .framework import Program, default_main_program, Variable from . import core @@ -49,7 +49,7 @@ def _switch_scope(scope): return ex -@contextlib.contextmanager +@signature_safe_contextmanager def scope_guard(scope): """ Change the global/default scope instance by Python `with` statement. All diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index dcb20704fe..ef304b1110 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -18,6 +18,7 @@ import collections from collections import defaultdict from collections import Iterable import contextlib +from .wrapped_decorator import signature_safe_contextmanager import os import re import traceback @@ -112,7 +113,7 @@ class NameScope(object): _name_scope = NameScope() -@contextlib.contextmanager +@signature_safe_contextmanager def name_scope(prefix=None): """ Generate hierarchical name prefix for the operators. @@ -1922,6 +1923,19 @@ class Program(object): self._trainers_endpoints = [] # the distributed lookup table names self._distributed_lookup_table = None + # @deprecated(the python memory optimize transpiler is deprecated) + # whether the program is optimized by memory_optimize_transpiler + self.__is_mem_optimized = False + + @property + def _is_mem_optimized(self): + # if the program is optimized, operator input/outputs + # maybe same, which conflict with save_inference_model. 
+ return self.__is_mem_optimized + + @_is_mem_optimized.setter + def _is_mem_optimized(self, target): + self.__is_mem_optimized = target @property def op_role(self): @@ -1941,7 +1955,7 @@ class Program(object): return self._current_role @op_role.setter - def set_op_role(self, role): + def op_role(self, role): self._current_role = role @property @@ -1959,7 +1973,7 @@ class Program(object): def set_op_role_var(self, var_name): self._op_role_var = [var_name] - @contextlib.contextmanager + @signature_safe_contextmanager def _optimized_guard(self, param_and_grads): """ A with guard to set :code:`Optimization` :code:`OpRole` and @@ -1989,7 +2003,7 @@ class Program(object): self._op_role_var = tmp_var self._current_role = tmp_role - @contextlib.contextmanager + @signature_safe_contextmanager def _lr_schedule_guard(self, is_with_opt=False): """ A with guard to set :code:`LRSched` :code:`OpRole` and @@ -2643,7 +2657,7 @@ def switch_startup_program(program): return prev_program -@contextlib.contextmanager +@signature_safe_contextmanager def program_guard(main_program, startup_program=None): """ Change the global main program and startup program with `with` statement. @@ -2708,7 +2722,7 @@ def _get_var(name, program=None): return program.global_block().var(name) -@contextlib.contextmanager +@signature_safe_contextmanager def _imperative_guard(tracer): global _imperative_tracer_ tmp_trace = _imperative_tracer_ @@ -2719,7 +2733,7 @@ def _imperative_guard(tracer): _imperative_tracer_ = tmp_trace -@contextlib.contextmanager +@signature_safe_contextmanager def _imperative_place_guard(place): global _imperative_current_expected_place_ tmp_place = _imperative_current_expected_place_ diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py index ff3984b11f..d4525233cc 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/imperative/base.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import contextlib +from ..wrapped_decorator import signature_safe_contextmanager import numpy as np from paddle.fluid import core @@ -24,7 +24,7 @@ def enabled(): return framework._in_imperative_mode() -@contextlib.contextmanager +@signature_safe_contextmanager def guard(place=None): train = framework.Program() startup = framework.Program() diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 5be21ff7f7..e8341be286 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -16,7 +16,7 @@ from __future__ import print_function from . import framework import numpy as np -import contextlib +from .wrapped_decorator import signature_safe_contextmanager from .core import VarDesc from . import unique_name @@ -49,7 +49,7 @@ def force_init_on_cpu(): return _force_init_on_cpu_ -@contextlib.contextmanager +@signature_safe_contextmanager def init_on_cpu(): """ Force the variable to be inited on CPU. 
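The framework.py hunk above also renames the `op_role` setter from `set_op_role` to `op_role`. In Python, the function decorated with `@prop.setter` must reuse the property's own name; otherwise the class gains a second descriptor under the new name while the original property stays read-only. A standalone illustration (a hypothetical class, not Paddle code):

    class Config(object):
        def __init__(self):
            self._role = 0

        @property
        def role(self):
            return self._role

        # Wrong: `@role.setter` on a function named `set_role` creates a
        # separate attribute `set_role`; `role` itself keeps only its
        # getter, so `cfg.role = 1` raises AttributeError.
        #
        # Right: bind the setter to the property's name.
        @role.setter
        def role(self, value):
            self._role = value


    cfg = Config()
    cfg.role = 1  # now dispatches to the setter
    assert cfg.role == 1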
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 6b1d4cc34f..a2abbf36c0 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -16,14 +16,16 @@ from __future__ import print_function

 import os
 import errno
+import warnings
 import time
 import shutil
 import six
 from functools import reduce

+from paddle.fluid import layers
 from paddle.fluid.executor import Executor
 from paddle.fluid.evaluator import Evaluator
-from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable
+from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable, program_guard
 from . import core

 __all__ = [
@@ -930,6 +932,24 @@ def save_inference_model(dirname,

     if main_program is None:
         main_program = default_main_program()
+    if main_program._is_mem_optimized:
+        warnings.warn(
+            "save_inference_model must be called before memory_optimize. \
+             memory_optimize modifies the original program, which makes it \
+             unsuitable for saving as an inference model; \
+             the original program is saved instead.",
+            RuntimeWarning)
+
+    # fix the bug that an activation op's output used as a target would be
+    # pruned. This will affect the inference performance.
+    # TODO(Superjomn) add an IR pass to remove 1-scale ops.
+    with program_guard(main_program):
+        uniq_target_vars = []
+        for var in target_vars:
+            if isinstance(var, Variable):
+                var = layers.scale(var, 1.)
+            uniq_target_vars.append(var)
+        target_vars = uniq_target_vars

     # when a pserver and a trainer running on the same machine, mkdir may conflict
     try:
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index a172141b3a..7d1636774c 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -302,7 +302,8 @@ class LayerHelper(object):
         if default_initializer is None and attr.initializer is None:
             if isinstance(dtype, core.VarDesc.VarType):
                 if dtype != core.VarDesc.VarType.FP32 and \
-                    dtype != core.VarDesc.VarType.FP64:
+                    dtype != core.VarDesc.VarType.FP64 and \
+                    dtype != core.VarDesc.VarType.FP16:
                     raise TypeError(
                         "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
                     )
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index a7494aacea..3a6753b01f 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -13,7 +13,7 @@
 # limitations under the License.

 from __future__ import print_function
-import contextlib
+from ..wrapped_decorator import signature_safe_contextmanager
 from .layer_function_generator import autodoc, templatedoc
 from .tensor import assign, fill_constant
@@ -1532,7 +1532,7 @@ class DynamicRNN(object):
                 outputs={'Out': [x_reordered]})
             return shrink_memory(x_reordered, self.step_idx, self.lod_rank_table)

-    @contextlib.contextmanager
+    @signature_safe_contextmanager
     def block(self):
         """
         The block for user to define operators in RNN.
See the class docstring
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 0602d7a194..3b43ae0b9c 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -49,6 +49,7 @@ __all__ = [
     'box_coder',
     'polygon_box_transform',
     'yolov3_loss',
+    'box_clip',
     'multiclass_nms',
 ]

@@ -396,10 +397,10 @@ def box_coder(prior_box,
                               input is image feature map, they are close to
                               the origin of the coordinate system. [xmax, ymax]
                               is the right bottom coordinate of the anchor box.
-        prior_box_var(Variable|list): prior_box_var supports two types of input.
-                              One is variable with shape [M, 4] holds M group.
-                              The other one is list consist of 4 elements
-                              shared by all boxes.
+        prior_box_var(Variable|list|None): prior_box_var supports two types
+                              of input. One is a Variable with shape [M, 4]
+                              that holds M groups of variance. The other is a
+                              list of 4 elements shared by all boxes.
         target_box(Variable): This input can be a 2-D LoDTensor with shape
                               [N, 4] when code_type is 'encode_center_size'.
                               This input also can be a 3-D Tensor with shape
@@ -2055,6 +2056,54 @@ def generate_proposals(scores,
     return rpn_rois, rpn_roi_probs


+def box_clip(input, im_info, name=None):
+    """
+    Clip the box into the size given by im_info.
+    For each input box, the formula is given as follows:
+
+    .. code-block:: text
+
+        xmin = max(min(xmin, im_w - 1), 0)
+        ymin = max(min(ymin, im_h - 1), 0)
+        xmax = max(min(xmax, im_w - 1), 0)
+        ymax = max(min(ymax, im_h - 1), 0)
+
+    where im_w and im_h are computed from im_info:
+
+    .. code-block:: text
+
+        im_h = round(height / scale)
+        im_w = round(width / scale)
+
+    Args:
+        input(Variable): The input box, the last dimension is 4.
+        im_info(Variable): The information of the image with shape [N, 3],
+                           with layout (height, width, scale). height and
+                           width are the input size, and scale is the ratio
+                           of the input size to the original size.
+        name (str): The name of this layer. It is optional.
+
+    Returns:
+        Variable: The clipped tensor Variable.
+
+    Examples:
+        .. code-block:: python
+
+            boxes = fluid.layers.data(
+                name='data', shape=[8, 4], dtype='float32', lod_level=1)
+            im_info = fluid.layers.data(name='im_info', shape=[3])
+            out = fluid.layers.box_clip(
+                input=boxes, im_info=im_info)
+    """
+
+    helper = LayerHelper("box_clip", **locals())
+    output = helper.create_variable_for_type_inference(dtype=input.dtype)
+    inputs = {"Input": input, "ImInfo": im_info}
+    helper.append_op(type="box_clip", inputs=inputs, outputs={"Output": output})
+
+    return output
+
+
 def multiclass_nms(bboxes,
                    scores,
                    score_threshold,
@@ -2132,9 +2181,11 @@ def multiclass_nms(bboxes,
             (After version 1.3, when no boxes detected, the lod is changed
              from {0} to {1})

+
     Examples:
         .. code-block:: python

+
             boxes = fluid.layers.data(name='bboxes', shape=[81, 4],
                                       dtype='float32', lod_level=1)
             scores = fluid.layers.data(name='scores', shape=[81],
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 1762bd3e34..b88be66906 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -13,7 +13,7 @@
 # limitations under the License.
from __future__ import print_function -import contextlib +from ..wrapped_decorator import signature_safe_contextmanager import multiprocessing import os import six @@ -1116,7 +1116,7 @@ class Preprocessor(object): def _is_completed(self): return self.sub_block and self.source_var_names and self.sink_var_names - @contextlib.contextmanager + @signature_safe_contextmanager def block(self): self.status = Preprocessor.IN_SUB_BLOCK self.sub_block = self.main_prog._create_block() diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index e0e781a322..fbd04f1eb4 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -15,7 +15,7 @@ from __future__ import print_function from collections import defaultdict -from contextlib import contextmanager +from .wrapped_decorator import signature_safe_contextmanager from paddle.fluid.framework import Program, Variable, name_scope, default_main_program from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table @@ -1610,7 +1610,7 @@ class ModelAverage(Optimizer): }, stop_gradient=True) - @contextmanager + @signature_safe_contextmanager def apply(self, executor, need_restore=True): """Apply average values to parameters of current model. """ diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index a07ff6ac69..22212ae9a2 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -146,6 +146,10 @@ class ParallelExecutor(object): # step4: get main_program, scope, local_scopes main = main_program if main_program \ else framework.default_main_program() + # FIXME(dzhwinter): enable_inplace should be after memory_optimize + # if turn on python memory optimize, turn off the inplace_pass. + if build_strategy.enable_inplace is None: + build_strategy.enable_inplace = False if main._is_mem_optimized else True scope = scope if scope is not None else executor.global_scope() if share_vars_from and not isinstance(share_vars_from, diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index e05885f5f5..d5670dbc82 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -15,7 +15,7 @@ from __future__ import print_function from . import core -from contextlib import contextmanager +from .wrapped_decorator import signature_safe_contextmanager import os import six @@ -35,7 +35,7 @@ NVPROF_CONFIG = [ ] -@contextmanager +@signature_safe_contextmanager def cuda_profiler(output_file, output_mode=None, config=None): """The CUDA profiler. This fuctions is used to profile CUDA program by CUDA runtime application @@ -217,7 +217,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): core.disable_profiler(key_map[sorted_key], profile_path) -@contextmanager +@signature_safe_contextmanager def profiler(state, sorted_key=None, profile_path='/tmp/profile'): """The profiler interface. Different from cuda_profiler, this profiler can be used to profile both CPU diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py index 076a942cdd..aa581f23a1 100644 --- a/python/paddle/fluid/recordio_writer.py +++ b/python/paddle/fluid/recordio_writer.py @@ -15,14 +15,14 @@ from __future__ import print_function import os -import contextlib +from .wrapped_decorator import signature_safe_contextmanager from . 
import core __all__ = [ 'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files' ] -@contextlib.contextmanager +@signature_safe_contextmanager def create_recordio_writer(filename, compressor=core.RecordIOWriter.Compressor.Snappy, max_num_records=1000): diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 77dfa1cb51..0d39a139ee 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -482,6 +482,17 @@ class TestYoloDetection(unittest.TestCase): self.assertIsNotNone(loss) +class TestBoxClip(unittest.TestCase): + def test_box_clip(self): + program = Program() + with program_guard(program): + input_box = layers.data( + name='input_box', shape=[7, 4], dtype='float32', lod_level=1) + im_info = layers.data(name='im_info', shape=[3], dtype='float32') + out = layers.box_clip(input_box, im_info) + self.assertIsNotNone(out) + + class TestMulticlassNMS(unittest.TestCase): def test_multiclass_nms(self): program = Program() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 699181d01d..4b26bacce9 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -110,6 +110,10 @@ py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executo if(NOT APPLE) py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) endif() +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + # change the timeout from 600 to 900, because in debug mode, this test need more time. + set_tests_properties(test_image_classification_resnet PROPERTIES TIMEOUT 900) +endif() if (WITH_NGRAPH) add_subdirectory(ngraph) diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py new file mode 100644 index 0000000000..84b9198dbf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py @@ -0,0 +1,53 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest + + +class TestNGRAPHAccuracyOp(OpTest): + def setUp(self): + self.op_type = "accuracy" + self.dtype = np.float32 + self.init_dtype() + n = 128 + infer = np.random.random((n, 1)).astype(self.dtype) + indices = np.random.randint(0, 2, (n, 1)) + label = np.random.randint(0, 2, (n, 1)) + self.inputs = {'Out': infer, 'Indices': indices, "Label": label} + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break + self.outputs = { + 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype), + 'Correct': np.array([num_correct]).astype("int64"), + 'Total': np.array([n]).astype("int64") + } + self._cpu_only = True + + def init_dtype(self): + pass + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py new file mode 100644 index 0000000000..511173af5e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py @@ -0,0 +1,37 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +from paddle.fluid.tests.unittests.test_batch_norm_op import TestBatchNormOpTraining, TestBatchNormOpInference + + +class TestNGRAPHBatchNormOpTraining(TestBatchNormOpTraining): + def init_kernel_type(self): + super(TestNGRAPHBatchNormOpTraining, self).init_kernel_type() + + +class TestNGRAPHBatchNormOpInference(TestBatchNormOpInference): + def init_kernel_type(self): + super(TestNGRAPHBatchNormOpInference, self).init_kernel_type() + + +class TestNGRAPHBatchNormOpWithReluInference(TestBatchNormOpInference): + def init_kernel_type(self): + super(TestNGRAPHBatchNormOpWithReluInference, self).init_kernel_type() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py new file mode 100644 index 0000000000..dbc8557b4e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py @@ -0,0 +1,76 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1 + + +class TestNGRAPH(TestConv2dOp): + def setUp(self): + super(TestNGRAPH, self).setUp() + self._cpu_only = True + + def init_kernel_type(self): + super(TestNGRAPH, self).init_kernel_type() + + +class TestNGRAPHWithPad(TestWithPad): + def setUp(self): + super(TestNGRAPHWithPad, self).setUp() + self._cpu_only = True + + def init_kernel_type(self): + super(TestNGRAPHWithPad, self).init_kernel_type() + + +class TestNGRAPHWithStride(TestWithStride): + def setUp(self): + super(TestNGRAPHWithStride, self).setUp() + self._cpu_only = True + + def init_kernel_type(self): + super(TestNGRAPHWithStride, self).init_kernel_type() + + +class TestNGRAPHWithGroup(TestWithGroup): + def setUp(self): + super(TestNGRAPHWithGroup, self).setUp() + self._cpu_only = True + + def init_kernel_type(self): + super(TestNGRAPHWithGroup, self).init_kernel_type() + + +class TestNGRAPHWith1x1(TestWith1x1): + def setUp(self): + super(TestNGRAPHWith1x1, self).setUp() + self._cpu_only = True + + def init_kernel_type(self): + super(TestNGRAPHWith1x1, self).init_kernel_type() + + +class TestNGRAPHWithInput1x1Filter1x1(TestWithInput1x1Filter1x1): + def setUp(self): + super(TestNGRAPHWithInput1x1Filter1x1, self).setUp() + self._cpu_only = True + + def init_kernel_type(self): + super(TestNGRAPHWithInput1x1Filter1x1, self).init_kernel_type() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py index 67722db89b..67f749bfee 100644 --- a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py +++ b/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py @@ -14,73 +14,16 @@ from __future__ import print_function import unittest -from paddle.fluid.tests.unittests.test_elementwise_add_op import * +from paddle.fluid.tests.unittests.test_elementwise_add_op import TestElementwiseAddOp class TestNGRAPHElementwiseAddOp(TestElementwiseAddOp): - def init_input_output(self): - super(TestNGRAPHElementwiseAddOp, self).init_input_output() - - -class TestNGRAPHElementwiseAddOp_scalar(TestElementwiseAddOp_scalar): - def init_input_output(self): - super(TestNGRAPHElementwiseAddOp_scalar, self).init_input_output() - - -class TestNGRAPHElementwiseAddOp_scalar2(TestElementwiseAddOp_scalar2): - def init_input_output(self): - super(TestNGRAPHElementwiseAddOp_scalar2, self).init_input_output() - - -class TestNGRAPHElementwiseAddOp_Vector(TestElementwiseAddOp_Vector): - def init_input_output(self): - super(TestNGRAPHElementwiseAddOp_Vector, self).init_input_output() - - -class TesNGRAPHtElementwiseAddOp_broadcast_0(TestElementwiseAddOp_broadcast_0): - def init_input_output(self): - super(TesNGRAPHtElementwiseAddOp_broadcast_0, self).init_input_output() - - -class TestNGRAPHElementwiseAddOp_broadcast_1(TestElementwiseAddOp_broadcast_1): - def init_input_output(self): - super(TestNGRAPHElementwiseAddOp_broadcast_1, self).init_input_output() + def setUp(self): + super(TestNGRAPHElementwiseAddOp, self).setUp() + self._cpu_only = True - -class TestNGRAPHElementwiseAddOp_broadcast_2(TestElementwiseAddOp_broadcast_2): def init_input_output(self): - super(TestNGRAPHElementwiseAddOp_broadcast_2, self).init_input_output() - - -class 
TestNGRAPHElementwiseAddOp_broadcast_3(TestElementwiseAddOp_broadcast_3): - def init_input_output(self): - super(TestNGRAPHElementwiseAddOp_broadcast_3, self).init_input_output() - - -class TestNGRAPHElementwiseAddOp_broadcast_4(TestElementwiseAddOp_broadcast_4): - def init_input_output(self): - super(TestNGRAPHElementwiseAddOp_broadcast_4, self).init_input_output() - - -class TestNGRAPHElementwiseAddOp_rowwise_add_0( - TestElementwiseAddOp_rowwise_add_0): - def init_input_output(self): - super(TestNGRAPHElementwiseAddOp_rowwise_add_0, - self).init_input_output() - - -class TestNGRAPHElementwiseAddOp_rowwise_add_1( - TestElementwiseAddOp_rowwise_add_1): - def init_input_output(self): - super(TestNGRAPHElementwiseAddOp_rowwise_add_1, - self).init_input_output() - - -class TestNGRAPHElementwiseAddOp_channelwise_add( - TestElementwiseAddOp_channelwise_add): - def init_input_output(self): - super(TestNGRAPHElementwiseAddOp_channelwise_add, - self).init_input_output() + super(TestNGRAPHElementwiseAddOp, self).init_input_output() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py index 5535427ea8..11881ac6e5 100644 --- a/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py +++ b/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py @@ -14,17 +14,13 @@ from __future__ import print_function import unittest -from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp, TestFP16MeanOp +from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp class TestNGRAPHMeanOp(TestMeanOp): def setUp(self): super(TestNGRAPHMeanOp, self).setUp() - - -class TestNGRAPHFP16MeanOp(TestFP16MeanOp): - def setUp(self): - super(TestNGRAPHFP16MeanOp, self).setUp() + self._cpu_only = True if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py index 6aba62f7c0..a916c8d450 100644 --- a/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py +++ b/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py @@ -15,27 +15,38 @@ from __future__ import print_function import unittest -from paddle.fluid.tests.unittests.test_mul_op import TestMulOp, TestMulOp2, TestFP16MulOp1, TestFP16MulOp2 +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest + + +class TestNGRAPHMulOp(OpTest): + def setUp(self): + self.op_type = "mul" + self.dtype = np.float32 + self.init_dtype_type() + self.inputs = { + 'X': np.random.random((2, 4)).astype(self.dtype), + 'Y': np.random.random((4, 4)).astype(self.dtype) + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + self._cpu_only = True - -class TestNGRAPHMulOp(TestMulOp): def init_dtype_type(self): pass + def test_check_output(self): + self.check_output() -class TestNGRAPHMulOp2(TestMulOp2): - def init_dtype_type(self): - pass + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) -class TestNGRAPHFP16MulOp1(TestFP16MulOp1): - def init_dtype_type(self): - pass - - -class TestNGRAPHFP16MulOp2(TestFP16MulOp2): - def init_dtype_type(self): - pass + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) if __name__ == "__main__": diff --git 
a/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py index 95e592e8ec..96a2b72d8a 100644 --- a/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py +++ b/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py @@ -14,35 +14,59 @@ from __future__ import print_function -from paddle.fluid.tests.unittests.test_pool2d_op import * +from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5 class TestNGRAPHPool2D_Op(TestPool2D_Op): + def setUp(self): + super(TestNGRAPHPool2D_Op, self).setUp() + self._cpu_only = True + def init_test_case(self): super(TestNGRAPHPool2D_Op, self).init_test_case() class TestNGRAPHCase1(TestCase1): + def setUp(self): + super(TestNGRAPHCase1, self).setUp() + self._cpu_only = True + def init_test_case(self): super(TestNGRAPHCase1, self).init_test_case() class TestNGRAPHCase2(TestCase2): + def setUp(self): + super(TestNGRAPHCase2, self).setUp() + self._cpu_only = True + def init_test_case(self): super(TestNGRAPHCase2, self).init_test_case() class TestNGRAPHCase3(TestCase3): + def setUp(self): + super(TestNGRAPHCase3, self).setUp() + self._cpu_only = True + def init_pool_type(self): super(TestNGRAPHCase3, self).init_pool_type() class TestNGRAPHCase4(TestCase4): + def setUp(self): + super(TestNGRAPHCase4, self).setUp() + self._cpu_only = True + def init_pool_type(self): super(TestNGRAPHCase4, self).init_pool_type() class TestNGRAPHCase5(TestCase5): + def setUp(self): + super(TestNGRAPHCase5, self).setUp() + self._cpu_only = True + def init_pool_type(self): super(TestNGRAPHCase5, self).init_pool_type() diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py index b42a1f73fa..4da5ca4583 100644 --- a/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py +++ b/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py @@ -13,25 +13,23 @@ # limitations under the License. 
from __future__ import print_function import unittest -from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows, TestScaleFp16Op, TestScaleFp16OpSelectedRows +from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows class TestNGRAPHScaleOp(TestScaleOp): - def init_dtype_type(self): - pass + def setUp(self): + super(TestNGRAPHScaleOp, self).setUp() + self._cpu_only = True - -class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows): def init_dtype_type(self): pass -class TestNGRAPHScaleFp16Op(TestScaleFp16Op): - def init_dtype_type(self): - pass - +class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows): + def setUp(self): + super(TestNGRAPHScaleOpSelectedRows, self).setUp() + self._cpu_only = True -class TestNGRAPHScaleFp16OpSelectedRows(TestScaleFp16OpSelectedRows): def init_dtype_type(self): pass diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py index 3a0171087d..fa68df1adf 100644 --- a/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py +++ b/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py @@ -20,21 +20,25 @@ from paddle.fluid.tests.unittests.test_top_k_op import TestTopkOp, TestTopkOp3d, class TestNGRAPHTopkOp(TestTopkOp): def setUp(self): super(TestNGRAPHTopkOp, self).setUp() + self._cpu_only = True class TestNGRAPHTopkOp2(TestTopkOp2): def setUp(self): super(TestNGRAPHTopkOp2, self).setUp() + self._cpu_only = True class TestNGRAPHTopkOp3(TestTopkOp3): def setUp(self): super(TestNGRAPHTopkOp3, self).setUp() + self._cpu_only = True class TestNGRAPHTopkOp4(TestTopkOp4): def setUp(self): super(TestNGRAPHTopkOp4, self).setUp() + self._cpu_only = True if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index fdacd241f9..c429c8af7d 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -40,7 +40,8 @@ class TestParallelExecutorBase(unittest.TestCase): seed=None, use_parallel_executor=True, use_reduce=False, - use_ir_memory_optimize=False, + use_ir_memory_optimize=True, + enable_inplace=True, fuse_elewise_add_act_ops=False, fuse_relu_depthwise_conv=False, optimizer=fluid.optimizer.Adam, @@ -60,63 +61,65 @@ class TestParallelExecutorBase(unittest.TestCase): main.random_seed = seed loss = method(use_feed=feed_dict is not None) - if optimizer: optimizer().minimize(loss) if memory_opt: fluid.memory_optimize(main) - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup) - exec_strategy = fluid.ExecutionStrategy() - exec_strategy.allow_op_delay = allow_op_delay - if use_fast_executor: - exec_strategy.use_experimental_executor = True - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ - if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce - build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops - build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv - build_strategy.memory_optimize = use_ir_memory_optimize - build_strategy.enable_sequential_execution = enable_sequential_execution - if use_cuda and core.is_compiled_with_cuda(): - build_strategy.remove_unnecessary_lock = True - if use_parallel_executor: - binary = 
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index fdacd241f9..c429c8af7d 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -40,7 +40,8 @@ class TestParallelExecutorBase(unittest.TestCase):
                                   seed=None,
                                   use_parallel_executor=True,
                                   use_reduce=False,
-                                  use_ir_memory_optimize=False,
+                                  use_ir_memory_optimize=True,
+                                  enable_inplace=True,
                                   fuse_elewise_add_act_ops=False,
                                   fuse_relu_depthwise_conv=False,
                                   optimizer=fluid.optimizer.Adam,
@@ -60,63 +61,65 @@ class TestParallelExecutorBase(unittest.TestCase):
             main.random_seed = seed
 
             loss = method(use_feed=feed_dict is not None)
-
             if optimizer:
                 optimizer().minimize(loss)
 
             if memory_opt:
                 fluid.memory_optimize(main)
 
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(startup)
-            exec_strategy = fluid.ExecutionStrategy()
-            exec_strategy.allow_op_delay = allow_op_delay
-            if use_fast_executor:
-                exec_strategy.use_experimental_executor = True
-            build_strategy = fluid.BuildStrategy()
-            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
-                if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
-            build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
-            build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
-            build_strategy.memory_optimize = use_ir_memory_optimize
-            build_strategy.enable_sequential_execution = enable_sequential_execution
-            if use_cuda and core.is_compiled_with_cuda():
-                build_strategy.remove_unnecessary_lock = True
-            if use_parallel_executor:
-                binary = compiler.CompiledProgram(main).with_data_parallel(
-                    loss_name=loss.name,
-                    build_strategy=build_strategy,
-                    exec_strategy=exec_strategy)
-            else:
-                binary = compiler.CompiledProgram(main)
-
-            if batch_size is not None:
-                batch_size *= fluid.core.get_cuda_device_count(
-                ) if use_cuda else int(
-                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-            begin = time.time()
-            first_loss, = run_executor(
-                exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
-
-            for i in range(iter):
-                run_executor(
-                    exe=exe, binary=binary, feed=feed_dict, fetch_list=[])
-
-            last_loss, = run_executor(
-                exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
-            end = time.time()
-
-            if batch_size is not None:
-                print("%.4f Instance per second" % (
-                    (batch_size * iter + 2) / (end - begin)))
-
-            avg_last_loss_val = np.array(last_loss).mean()
-            avg_first_loss_val = np.array(first_loss).mean()
-            if math.isnan(float(avg_last_loss_val)) or math.isnan(
-                    float(avg_first_loss_val)):
-                sys.exit("got NaN loss, training failed.")
-
-            print(first_loss, last_loss)
-            # self.assertGreater(first_loss[0], last_loss[0])
-            return first_loss, last_loss
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup)
+        exec_strategy = fluid.ExecutionStrategy()
+        exec_strategy.allow_op_delay = allow_op_delay
+        if use_fast_executor:
+            exec_strategy.use_experimental_executor = True
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
+            if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
+        build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
+        build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
+        build_strategy.memory_optimize = use_ir_memory_optimize
+        # Python memory optimization conflicts with the inplace pass;
+        # running IR graph memory optimization after the inplace pass is the correct way.
+        build_strategy.enable_inplace = False if memory_opt else enable_inplace
+        build_strategy.enable_sequential_execution = enable_sequential_execution
+
+        if use_cuda and core.is_compiled_with_cuda():
+            build_strategy.remove_unnecessary_lock = True
+        if use_parallel_executor:
+            binary = compiler.CompiledProgram(main).with_data_parallel(
+                loss_name=loss.name,
+                build_strategy=build_strategy,
+                exec_strategy=exec_strategy)
+        else:
+            binary = compiler.CompiledProgram(main)
+
+        if batch_size is not None:
+            batch_size *= fluid.core.get_cuda_device_count(
+            ) if use_cuda else int(
+                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+        begin = time.time()
+        first_loss, = run_executor(
+            exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
+
+        for i in range(iter):
+            run_executor(exe=exe, binary=binary, feed=feed_dict, fetch_list=[])
+
+        last_loss, = run_executor(
+            exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
+        end = time.time()
+
+        if batch_size is not None:
+            print("%.4f Instance per second" % (
+                (batch_size * iter + 2) / (end - begin)))
+
+        avg_last_loss_val = np.array(last_loss).mean()
+        avg_first_loss_val = np.array(first_loss).mean()
+        if math.isnan(float(avg_last_loss_val)) or math.isnan(
+                float(avg_first_loss_val)):
+            sys.exit("got NaN loss, training failed.")
+
+        print(first_loss, last_loss)
+        # self.assertGreater(first_loss[0], last_loss[0])
+        return first_loss, last_loss
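The rewrite above also dedents the executor setup out of the `with` scope and wires in the new `enable_inplace` knob, which is forced off whenever the Python-side `memory_opt` transpiler runs. A minimal sketch of that mutual exclusion from a caller's perspective (the flag values are illustrative):

```python
import paddle.fluid as fluid

# Illustrative values; the point is the exclusion on the last line.
memory_opt = False       # Python-level fluid.memory_optimize()
enable_inplace = True    # IR-level inplace pass

build_strategy = fluid.BuildStrategy()
build_strategy.memory_optimize = True  # IR-graph memory optimization
# The Python transpiler rewrites variable lifetimes itself, so inplace
# reuse is only safe when it did not run.
build_strategy.enable_inplace = False if memory_opt else enable_inplace
```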
diff --git a/python/paddle/fluid/tests/unittests/test_box_clip_op.py b/python/paddle/fluid/tests/unittests/test_box_clip_op.py
new file mode 100644
index 0000000000..b2b0598f31
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_box_clip_op.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+import math
+from op_test import OpTest
+import copy
+
+
+def box_clip(input_box, im_info, output_box):
+    im_w = round(im_info[1] / im_info[2])
+    im_h = round(im_info[0] / im_info[2])
+    output_box[:, :, 0] = np.maximum(
+        np.minimum(input_box[:, :, 0], im_w - 1), 0)
+    output_box[:, :, 1] = np.maximum(
+        np.minimum(input_box[:, :, 1], im_h - 1), 0)
+    output_box[:, :, 2] = np.maximum(
+        np.minimum(input_box[:, :, 2], im_w - 1), 0)
+    output_box[:, :, 3] = np.maximum(
+        np.minimum(input_box[:, :, 3], im_h - 1), 0)
+
+
+def batch_box_clip(input_boxes, im_info, lod):
+    n = input_boxes.shape[0]
+    m = input_boxes.shape[1]
+    output_boxes = np.zeros((n, m, 4), dtype=np.float32)
+    cur_offset = 0
+    for i in range(len(lod)):
+        box_clip(input_boxes[cur_offset:(cur_offset + lod[i]), :, :],
+                 im_info[i, :],
+                 output_boxes[cur_offset:(cur_offset + lod[i]), :, :])
+        cur_offset += lod[i]
+    return output_boxes
+
+
+class TestBoxClipOp(OpTest):
+    def test_check_output(self):
+        self.check_output()
+
+    def setUp(self):
+        self.op_type = "box_clip"
+        lod = [[1, 2, 3]]
+        input_boxes = np.random.random((6, 10, 4)) * 5
+        im_info = np.array([[5, 8, 1.], [6, 6, 1.], [7, 5, 1.]])
+        output_boxes = batch_box_clip(input_boxes, im_info, lod[0])
+
+        self.inputs = {
+            'Input': (input_boxes.astype('float32'), lod),
+            'ImInfo': im_info.astype('float32'),
+        }
+        self.outputs = {'Output': output_boxes}
+
+
+if __name__ == '__main__':
+    unittest.main()
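The reference `box_clip` above derives the image width and height from `im_info` (height, width, scale) and clamps each coordinate to `[0, side - 1]`. A quick hand check on a single overflowing box, inlining the same arithmetic:

```python
import numpy as np

# One image of height 5, width 8, scale 1.0; one box crossing both edges.
box = np.array([[[-2.0, 1.0, 9.5, 7.0]]])  # shape (1, 1, 4): x1, y1, x2, y2
im_info = np.array([5.0, 8.0, 1.0])        # height, width, scale

im_w = round(im_info[1] / im_info[2])      # 8
im_h = round(im_info[0] / im_info[2])      # 5
out = np.zeros_like(box)
out[:, :, 0] = np.maximum(np.minimum(box[:, :, 0], im_w - 1), 0)  # -2.0 -> 0.0
out[:, :, 1] = np.maximum(np.minimum(box[:, :, 1], im_h - 1), 0)  #  1.0 -> 1.0
out[:, :, 2] = np.maximum(np.minimum(box[:, :, 2], im_w - 1), 0)  #  9.5 -> 7.0
out[:, :, 3] = np.maximum(np.minimum(box[:, :, 3], im_h - 1), 0)  #  7.0 -> 4.0
```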
"DecodeCenterSize" box_normalized = False diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py index 754d5fd409..603c8e7488 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py @@ -16,12 +16,10 @@ import os import unittest os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" -from test_parallel_executor_transformer import TestTransformer - - -class EagerDeletionTestTransformer(TestTransformer): - pass +os.environ[ + 'RECORDIO_FILENAME'] = '/tmp/eager_deletion_transformer.wmt16.recordio' +from test_parallel_executor_transformer import TestTransformer if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_inference_model_io.py b/python/paddle/fluid/tests/unittests/test_inference_model_io.py index 9962702f69..9c9f863307 100644 --- a/python/paddle/fluid/tests/unittests/test_inference_model_io.py +++ b/python/paddle/fluid/tests/unittests/test_inference_model_io.py @@ -25,6 +25,7 @@ import paddle.fluid.layers as layers import paddle.fluid.optimizer as optimizer from paddle.fluid.framework import Program, program_guard from paddle.fluid.io import save_inference_model, load_inference_model +from paddle.fluid.transpiler import memory_optimize class TestBook(unittest.TestCase): @@ -82,9 +83,36 @@ class TestBook(unittest.TestCase): self.assertEqual(feed_var_names, ["x", "y"]) self.assertEqual(len(fetch_vars), 1) - self.assertEqual(str(fetch_vars[0]), str(avg_cost)) + print("fetch %s" % str(fetch_vars[0])) + self.assertTrue("scale" in str(fetch_vars[0])) self.assertEqual(expected, actual) +class TestSaveInferenceModel(unittest.TestCase): + def test_save_inference_model(self): + MODEL_DIR = "./tmp/inference_model2" + init_program = Program() + program = Program() + + # fake program without feed/fetch + with program_guard(program, init_program): + x = layers.data(name='x', shape=[2], dtype='float32') + y = layers.data(name='y', shape=[1], dtype='float32') + + y_predict = layers.fc(input=x, size=1, act=None) + + cost = layers.square_error_cost(input=y_predict, label=y) + avg_cost = layers.mean(cost) + + place = core.CPUPlace() + exe = executor.Executor(place) + exe.run(init_program, feed={}, fetch_list=[]) + + memory_optimize(program, print_log=True) + self.assertEqual(program._is_mem_optimized, True) + # will print warning message + save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, program) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py new file mode 100644 index 0000000000..4e196758ef --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py @@ -0,0 +1,76 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py
new file mode 100644
index 0000000000..4e196758ef
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from parallel_executor_test_base import TestParallelExecutorBase
+
+
+def fc_with_batchnorm(use_feed):
+    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    hidden = img
+    for _ in range(3):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='tanh',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+
+    hidden = fluid.layers.batch_norm(input=hidden)
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestIrInplace(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+
+    def _fc_with_batchnorm(self,
+                           ir_memory_optimize,
+                           enable_inplace,
+                           memory_opt=False):
+
+        if not core.is_compiled_with_cuda():
+            return
+        np.random.seed(5)
+        img = np.random.random(size=[32, 784]).astype(np.float32)
+        label = np.ones(shape=[32, 1], dtype='int64')
+        first_loss, last_loss = self.check_network_convergence(
+            fc_with_batchnorm,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=True,
+            memory_opt=memory_opt,
+            use_ir_memory_optimize=ir_memory_optimize,
+            enable_inplace=enable_inplace)
+        return np.mean(last_loss)
+
+    def test_fc_with_batchnorm(self, delta=1e-3):
+        loss00 = self._fc_with_batchnorm(False, False)
+        loss10 = self._fc_with_batchnorm(True, False)
+        loss01 = self._fc_with_batchnorm(False, True)
+        loss11 = self._fc_with_batchnorm(True, True)
+        self.assertAlmostEqual(loss00, loss10, delta=delta)
+        self.assertAlmostEqual(loss00, loss01, delta=delta)
+        self.assertAlmostEqual(loss00, loss11, delta=delta)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
index e7a56bb638..9548598d75 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
@@ -200,7 +200,7 @@ class TestResnet(TestParallelExecutorBase):
                                 model,
                                 use_cuda,
                                 iter=20,
-                                delta2=1e-6):
+                                delta2=1e-5):
         if use_cuda and not core.is_compiled_with_cuda():
             return
 
@@ -228,7 +228,7 @@ class TestResnet(TestParallelExecutorBase):
             optimizer=optimizer)
 
         for loss in zip(all_reduce_first_loss, reduce_first_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
         for loss in zip(all_reduce_last_loss, reduce_last_loss):
             self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
 
@@ -258,17 +258,17 @@ class TestResnet(TestParallelExecutorBase):
             enable_sequential_execution=True)
 
         for loss in zip(all_reduce_first_loss, all_reduce_first_loss_seq):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
         for loss in zip(all_reduce_last_loss, all_reduce_last_loss_seq):
             self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
 
         for loss in zip(reduce_first_loss, reduce_first_loss_seq):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
         for loss in zip(reduce_last_loss, reduce_last_loss_seq):
             self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
 
         for loss in zip(all_reduce_first_loss_seq, reduce_first_loss_seq):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
         for loss in zip(all_reduce_last_loss_seq, reduce_last_loss_seq):
             self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
 
@@ -277,7 +277,7 @@ class TestResnet(TestParallelExecutorBase):
                    use_cuda=True,
                    use_reduce=False,
                    iter=20,
-                   delta2=1e-6):
+                   delta2=1e-5):
         if use_cuda and not core.is_compiled_with_cuda():
             return
 
@@ -308,7 +308,7 @@ class TestResnet(TestParallelExecutorBase):
             optimizer=optimizer)
 
         self.assertAlmostEquals(
-            np.mean(parallel_first_loss), single_first_loss[0], delta=1e-6)
+            np.mean(parallel_first_loss), single_first_loss[0], delta=1e-5)
         self.assertAlmostEquals(
             np.mean(parallel_last_loss), single_last_loss[0], delta=delta2)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
index 3827743908..aacc1c3ecd 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
@@ -24,7 +24,7 @@ import paddle.fluid.core as core
 import paddle.dataset.wmt16 as wmt16
 import os
 
-WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio"
+WMT16_RECORDIO_FILE = os.environ.get('RECORDIO_FILENAME', '/tmp/wmt16.recordio')
 
 
 class ModelHyperParams(object):
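The transformer tests now resolve their RecordIO path through the `RECORDIO_FILENAME` environment variable, so variants such as the eager-deletion test can write to their own file instead of racing on `/tmp/wmt16.recordio`. The pattern in isolation (the variant path below is just an example):

```python
import os

# A test variant exports its own path *before* importing the shared
# module, because the module reads the variable at import time.
os.environ['RECORDIO_FILENAME'] = '/tmp/my_variant.wmt16.recordio'

# Shared module code then falls back to the common default:
WMT16_RECORDIO_FILE = os.environ.get('RECORDIO_FILENAME',
                                     '/tmp/wmt16.recordio')
```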
diff --git a/python/paddle/fluid/tests/unittests/test_peak_gpumem_monitor.py b/python/paddle/fluid/tests/unittests/test_peak_gpumem_monitor.py
new file mode 100644
index 0000000000..3673fd10c4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_peak_gpumem_monitor.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import os
+os.environ['FLAGS_benchmark'] = 'True'
+
+import numpy
+import paddle.fluid.core as core
+from paddle.fluid.executor import Executor
+from paddle.fluid.layers import mul, data
+
+
+class TestPeakMemoryMonitoring(unittest.TestCase):
+    def test_mul(self):
+
+        a = data(name='a', shape=[784], dtype='float32')
+        b = data(
+            name='b',
+            shape=[784, 100],
+            dtype='float32',
+            append_batch_size=False)
+        out = mul(x=a, y=b)
+
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+
+            a_np = numpy.random.random((100, 784)).astype('float32')
+            b_np = numpy.random.random((784, 100)).astype('float32')
+            self.assertEqual(0, core.get_mem_usage(0))
+            exe = Executor(place)
+            outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out])
+            out = outs[0]
+            # disable this assert since ctest will ignore the os.environ setting
+            # self.assertGreater(core.get_mem_usage(0), 0)
+
+            raised = False
+            try:
+                core.print_mem_usage()
+            except Exception:
+                raised = True
+            self.assertFalse(raised, 'Exception raised')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py
index 143d187edc..905b7d6fe7 100644
--- a/python/paddle/fluid/tests/unittests/transformer_model.py
+++ b/python/paddle/fluid/tests/unittests/transformer_model.py
@@ -17,6 +17,7 @@ from __future__ import print_function
 from functools import partial
 import numpy as np
 
+import os
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 from paddle.fluid.layers.io import open_recordio_file
@@ -408,7 +409,7 @@ def transformer(
         trg_pad_idx,
         pos_pad_idx, ):
     file_obj = open_recordio_file(
-        filename='/tmp/wmt16.recordio',
+        filename=os.environ.get('RECORDIO_FILENAME', '/tmp/wmt16.recordio'),
         shapes=[
             [batch_size * max_length, 1],
             [batch_size * max_length, 1],
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index e5d48d3d19..52c1aea288 100755
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -540,6 +540,7 @@ def memory_optimize(input_program,
     if skip_opt_set is not None:
         skip_opt_set = set(map(to_name_str, skip_opt_set))
     cfgs = _get_cfgs(input_program)
+    input_program._is_mem_optimized = True
     for cfg in cfgs:
         cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level)
 
@@ -559,5 +560,6 @@ def release_memory(input_program, skip_opt_set=None):
         None
     """
     cfgs = _get_cfgs(input_program)
+    input_program._is_mem_optimized = True
     for cfg in cfgs:
         cfg.release_memory(skip_opt_set=skip_opt_set)
diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py
index b9957a699e..324257c13f 100644
--- a/python/paddle/fluid/unique_name.py
+++ b/python/paddle/fluid/unique_name.py
@@ -15,7 +15,7 @@
 from __future__ import print_function
 
 import collections
-import contextlib
+from .wrapped_decorator import signature_safe_contextmanager
 import six
 import sys
 
@@ -68,7 +68,7 @@ def switch(new_generator=None):
     return old
 
 
-@contextlib.contextmanager
+@signature_safe_contextmanager
 def guard(new_generator=None):
     if isinstance(new_generator, six.string_types):
         new_generator = UniqueNameGenerator(new_generator)
diff --git a/python/paddle/fluid/wrapped_decorator.py b/python/paddle/fluid/wrapped_decorator.py
new file mode 100644
index 0000000000..7e7dbff656
--- /dev/null
+++ b/python/paddle/fluid/wrapped_decorator.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import decorator
+import contextlib
+
+__all__ = ['wrap_decorator', 'signature_safe_contextmanager']
+
+
+def wrap_decorator(decorator_func):
+    @decorator.decorator
+    def __impl__(func, *args, **kwargs):
+        wrapped_func = decorator_func(func)
+        return wrapped_func(*args, **kwargs)
+
+    return __impl__
+
+
+signature_safe_contextmanager = wrap_decorator(contextlib.contextmanager)
diff --git a/python/requirements.txt b/python/requirements.txt
index 03d5e33e88..5a70f1aa3f 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -11,3 +11,4 @@ graphviz
 six
 funcsigs
 pyyaml
+decorator
diff --git a/python/setup.py.in b/python/setup.py.in
index f93f0cd130..a7c1e91f9c 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -100,6 +100,7 @@ packages=['paddle',
           'paddle.utils',
           'paddle.dataset',
           'paddle.reader',
+          'paddle.distributed',
          'paddle.fluid',
           'paddle.fluid.imperative',
           'paddle.fluid.proto',
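`wrapped_decorator.py` exists because a plain `contextlib.contextmanager` hides the decorated function's argument list from `inspect.getargspec` on Python 2, which breaks tooling that checks API signatures. Building the wrapper with the `decorator` package (hence the new `requirements.txt` entry) rebuilds the function so the original signature survives. A small self-contained check of the behavior, assuming only the new module:

```python
import inspect

from paddle.fluid.wrapped_decorator import signature_safe_contextmanager


@signature_safe_contextmanager
def guard(new_generator=None):
    # Trivial body; we only care about the callable's metadata.
    yield new_generator


# The decorator package preserves the original argument list, so
# introspection still reports ['new_generator'] (a functools.wraps-based
# contextmanager would report (*args, **kwargs) under Python 2).
print(inspect.getargspec(guard))
```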