diff --git a/.travis.yml b/.travis.yml index 9dd5f48164..bf6a41d13c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -56,7 +56,7 @@ script: export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh export DOCS_DIR=`pwd` cd .. - curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/v2 + curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/ notifications: email: on_success: change diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index e2b7ef8d54..8af2765f58 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -77,7 +77,8 @@ IF(NOT ${CBLAS_FOUND}) INSTALL_DIR ${CBLAS_INSTALL_DIR} BUILD_IN_SOURCE 1 BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS} - INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX= + INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX= + && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig UPDATE_COMMAND "" CONFIGURE_COMMAND "" ) @@ -100,11 +101,6 @@ IF(NOT ${CBLAS_FOUND}) \"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\" )" ) - INSTALL(CODE "execute_process( - COMMAND rm -r ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/cmake - ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/pkgconfig - )" - ) ENDIF() ENDIF(NOT ${CBLAS_FOUND}) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 356da582d1..d0b5eaec2e 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -186,7 +186,9 @@ function(cc_library TARGET_NAME) add_library(${TARGET_NAME} SHARED ${cc_library_SRCS}) else() add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) + find_fluid_modules(${TARGET_NAME}) endif() + if(cc_library_DEPS) # Don't need link libwarpctc.so if("${cc_library_DEPS};" MATCHES "warpctc;") @@ -263,7 +265,8 @@ function(nv_library TARGET_NAME) if (nv_library_SHARED OR nv_library_shared) # build *.so cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS}) else() - cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS}) + cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS}) + find_fluid_modules(${TARGET_NAME}) endif() if (nv_library_DEPS) add_dependencies(${TARGET_NAME} ${nv_library_DEPS}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 4471df36b0..6b2237b858 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -1,9 +1,22 @@ +set_property(GLOBAL PROPERTY FLUID_MODULES "") +# find all fluid modules is used for paddle fluid static library +function(find_fluid_modules TARGET_NAME) + get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) + string(FIND "${__target_path}" "fluid" pos) + if(pos GREATER 1) + get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) + set(fluid_modules ${fluid_modules} ${TARGET_NAME}) + set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}") + endif() +endfunction(find_fluid_modules) + # make package for paddle fluid shared and static library function(copy TARGET) set(options "") set(oneValueArgs "") set(multiValueArgs SRCS DSTS DEPS) cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE) list(LENGTH copy_lib_SRCS copy_lib_SRCS_len) list(LENGTH copy_lib_DSTS copy_lib_DSTS_len) @@ -42,13 +55,21 @@ copy(glog_lib DSTS ${dst_dir} ${dst_dir}/lib ) -IF(NOT PROTOBUF_FOUND) +if(NOT PROTOBUF_FOUND) set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf") copy(protobuf_lib - SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LITE_LIBRARY} + SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY} DSTS ${dst_dir} ${dst_dir}/lib ) -ENDIF(NOT PROTOBUF_FOUND) +endif() + +if(NOT CBLAS_FOUND) + set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/openblas") + copy(openblas_lib + SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include + DSTS ${dst_dir} ${dst_dir} + ) +endif() # paddle fluid module set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") @@ -66,8 +87,8 @@ copy(memory_lib ) set(module "inference") -copy(inference_lib DEPENDS paddle_fluid_shared - SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.so +copy(inference_lib DEPS paddle_fluid_shared paddle_fluid + SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* DSTS ${dst_dir}/${module} ${dst_dir}/${module} ) @@ -83,6 +104,4 @@ copy(string_lib DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat ) -add_custom_target(inference_lib_dist DEPENDS - inference_lib framework_lib memory_lib platform_lib string_lib - gflags_lib glog_lib protobuf_lib eigen3_lib) +add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) diff --git a/doc/v2/howto/optimization/pprof_1.png b/doc/fluid/howto/optimization/pprof_1.png similarity index 100% rename from doc/v2/howto/optimization/pprof_1.png rename to doc/fluid/howto/optimization/pprof_1.png diff --git a/doc/v2/howto/optimization/pprof_2.png b/doc/fluid/howto/optimization/pprof_2.png similarity index 100% rename from doc/v2/howto/optimization/pprof_2.png rename to doc/fluid/howto/optimization/pprof_2.png diff --git a/doc/fluid/howto/optimization/timeline.jpeg b/doc/fluid/howto/optimization/timeline.jpeg new file mode 100644 index 0000000000..38ec3f80c9 Binary files /dev/null and b/doc/fluid/howto/optimization/timeline.jpeg differ diff --git a/doc/fluid/howto/optimization/timeline.md b/doc/fluid/howto/optimization/timeline.md new file mode 100644 index 0000000000..9d9565a3e6 --- /dev/null +++ b/doc/fluid/howto/optimization/timeline.md @@ -0,0 +1,27 @@ +## how to use timeline tool to do profile + +1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number. + + ```python + with profiler.profiler('All', 'total', '/tmp/profile') as prof: + for pass_id in range(pass_num): + for batch_id, data in enumerate(train_reader()): + exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[], + use_program_cache=True) + ... + ``` + +1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`, it will generate another +file `/tmp/timeline` by default. You can change the path by cmd parameter, please take a look at +[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details. + +1. Open chrome and visit , use `load` button to load the generated `timeline` file. + + ![chrome tracing](./tracing.jpeg) + +1. The resulting timeline should be like: + + + ![chrome timeline](./timeline.jpeg) diff --git a/doc/fluid/howto/optimization/tracing.jpeg b/doc/fluid/howto/optimization/tracing.jpeg new file mode 100644 index 0000000000..3a49fc4f8a Binary files /dev/null and b/doc/fluid/howto/optimization/tracing.jpeg differ diff --git a/doc/fluid/read_source.md b/doc/fluid/read_source.md index edf46aff8c..bb6d4563f5 100644 --- a/doc/fluid/read_source.md +++ b/doc/fluid/read_source.md @@ -2,17 +2,17 @@ Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/book -Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework +Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework -Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators +Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators -Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory +Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory -Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform +Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/platform # Compile Time -The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). +The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto). ```python x = fluid.layers.data(name='x', shape=[13], dtype='float32') @@ -29,10 +29,10 @@ sgd_optimizer.minimize(avg_cost) - Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#) - Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/layers) - Every Layer has one or more operators and variables/parameters - - All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other worth-looking files: - - Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h) - - Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h) - - Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h) + - All the operators are defined at [`paddle/fluid/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators). Other worth-looking files: + - Base class: [`paddle/fluid/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h) + - Operator Registration: [`paddle/fluid/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_registry.h) + - Operator Lookup: [`paddle/fluid/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_info.h) - Optimizer: `fluid.optimizer.SGD`. It does the following - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/backward.py)] - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/optimizer.py)] @@ -55,13 +55,13 @@ exe.run(fluid.default_main_program(), fetch_list=[avg_cost]) ``` -- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h) - - The device handle are at [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h) -- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)] +- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h) + - The device handle are at [paddle/fluid/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h) +- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.cc)] - Feeds the data: `feed=feeder.feed(data)` - Evaluates all the operators - Fetches the result: `fetch_list=[avg_cost]` - Other worth looking files: - - Scope: [paddle/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/scope.h). Where all the variables live - - Variable: [paddle/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h). Where all the data (most likely tensors) live - - Tensor: [paddle/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h). Where we allocate memory through [`paddle/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory) + - Scope: [paddle/fluid/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/scope.h). Where all the variables live + - Variable: [paddle/fluid/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h). Where all the data (most likely tensors) live + - Tensor: [paddle/fluid/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/tensor.h). Where we allocate memory through [`paddle/fluid/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory) diff --git a/doc/v2/build_and_install/pip_install_cn.rst b/doc/v2/build_and_install/pip_install_cn.rst index 8e4165da6b..ddcd42a0c6 100644 --- a/doc/v2/build_and_install/pip_install_cn.rst +++ b/doc/v2/build_and_install/pip_install_cn.rst @@ -39,7 +39,7 @@ PaddlePaddle可以使用常用的Python包管理工具 "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "暂无" - "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "暂无" + "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" diff --git a/doc/v2/build_and_install/pip_install_en.rst b/doc/v2/build_and_install/pip_install_en.rst index 0d4c925b6e..e08c84703b 100644 --- a/doc/v2/build_and_install/pip_install_en.rst +++ b/doc/v2/build_and_install/pip_install_en.rst @@ -42,7 +42,7 @@ If the links below shows up the login form, just click "Log in as guest" to star "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available" - "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available" + "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" diff --git a/doc/v2/howto/index_cn.rst b/doc/v2/howto/index_cn.rst index 0c534f107b..b0268907bc 100644 --- a/doc/v2/howto/index_cn.rst +++ b/doc/v2/howto/index_cn.rst @@ -1,11 +1,37 @@ 进阶使用 ======== +PaddlePaddle支持用户灵活地设置各种命令行参数,以实现对模型训练或预测流程的控制。使用方式请参考: + .. toctree:: :maxdepth: 1 cmd_parameter/index_cn.rst + +PaddlePaddle支持在fabric集群、MPI集群、kubernetes集群上分布式训练任务,具体环境配置和使用说明请参考: + +.. toctree:: + :maxdepth: 1 + cluster/index_cn.rst + +PaddlePaddle提供了用于预测的C-API,关于C-API的使用,我们提供了如下指南: + +.. toctree:: + :maxdepth: 1 + capi/index_cn.rst + +PaddlePaddle支持多种灵活和高效的循环神经网络,具体配置使用方式请参考: + +.. toctree:: + :maxdepth: 1 + rnn/index_cn.rst + +关于如何使用内置的定时工具、nvprof 或 nvvp 来运行性能分析和调优,请参考: + +.. toctree:: + :maxdepth: 1 + optimization/gpu_profiling_cn.rst diff --git a/doc/v2/howto/rnn/index_cn.rst b/doc/v2/howto/rnn/index_cn.rst index bcc8c2f46e..2032fb9e29 100644 --- a/doc/v2/howto/rnn/index_cn.rst +++ b/doc/v2/howto/rnn/index_cn.rst @@ -1,10 +1,34 @@ RNN模型 =========== +循环神经网络(RNN)是对序列数据建模的重要工具。PaddlePaddle提供了灵活的接口以支持复杂循环神经网络的构建。 +这里将分为以下四个部分详细介绍如何使用PaddlePaddle搭建循环神经网络。 + +第一部分由浅入深的展示了使用PaddlePaddle搭建循环神经网络的全貌:首先以简单的循环神经网络(vanilla RNN)为例, +说明如何封装配置循环神经网络组件;然后更进一步的通过序列到序列(sequence to sequence)模型,逐步讲解如何构建完整而复杂的循环神经网络模型。 .. toctree:: :maxdepth: 1 rnn_config_cn.rst + +Recurrent Group是PaddlePaddle中实现复杂循环神经网络的关键,第二部分阐述了PaddlePaddle中Recurrent Group的相关概念和原理, +对Recurrent Group接口进行了详细说明。另外,对双层RNN(对应的输入为双层序列)及Recurrent Group在其中的使用进行了介绍。 + +.. toctree:: + :maxdepth: 1 + recurrent_group_cn.md + +第三部分对双层序列进行了解释说明,列出了PaddlePaddle中支持双层序列作为输入的Layer,并对其使用进行了逐一介绍。 + +.. toctree:: + :maxdepth: 1 + hierarchical_layer_cn.rst + +第四部分以PaddlePaddle的双层RNN单元测试中的网络配置为示例,辅以效果相同的单层RNN网络配置作为对比,讲解了多种情况下双层RNN的使用。 + +.. toctree:: + :maxdepth: 1 + hrnn_rnn_api_compare_cn.rst diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 554cd58916..c0523f3c79 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -53,6 +53,7 @@ struct CastDataType { auto* context = static_cast(ctx_); trans(*context, in_begin, in_end, out_begin, CastDataTypeFunctor()); + context->Wait(); #endif } else { PADDLE_THROW("Unsupported place!"); diff --git a/paddle/fluid/framework/data_type_transform_test.cc b/paddle/fluid/framework/data_type_transform_test.cc index c992cba9a3..6b9a8f5e28 100644 --- a/paddle/fluid/framework/data_type_transform_test.cc +++ b/paddle/fluid/framework/data_type_transform_test.cc @@ -50,13 +50,13 @@ TEST(DataTypeTransform, CPUTransform) { TransDataType(kernel_fp32, kernel_fp64, in, &out); double* out_data_double = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_double[i], static_cast(i / 3)); + EXPECT_EQ(out_data_double[i], static_cast(i / 3)); } TransDataType(kernel_fp32, kernel_int32, in, &out); int* out_data_int = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_int[i], static_cast(i / 3)); + EXPECT_EQ(out_data_int[i], static_cast(i / 3)); } } @@ -76,31 +76,31 @@ TEST(DataTypeTransform, CPUTransform) { TransDataType(kernel_fp16, kernel_fp32, in, &out); float* out_data_float = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_float[i], static_cast(ptr[i])); + EXPECT_EQ(out_data_float[i], static_cast(ptr[i])); } TransDataType(kernel_fp16, kernel_fp64, in, &out); double* out_data_double = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_double[i], static_cast(ptr[i])); + EXPECT_EQ(out_data_double[i], static_cast(ptr[i])); } TransDataType(kernel_fp16, kernel_int32, in, &out); int* out_data_int = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_int[i], static_cast(ptr[i])); + EXPECT_EQ(out_data_int[i], static_cast(ptr[i])); } TransDataType(kernel_fp16, kernel_int64, in, &out); int64_t* out_data_int64 = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_int64[i], static_cast(ptr[i])); + EXPECT_EQ(out_data_int64[i], static_cast(ptr[i])); } TransDataType(kernel_fp16, kernel_bool, in, &out); bool* out_data_bool = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_bool[i], static_cast(ptr[i])); + EXPECT_EQ(out_data_bool[i], static_cast(ptr[i])); } // transform float to float16 @@ -112,7 +112,7 @@ TEST(DataTypeTransform, CPUTransform) { TransDataType(kernel_fp32, kernel_fp16, in, &out); ptr = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(ptr[i].x, static_cast(in_data_float[i]).x); + EXPECT_EQ(ptr[i].x, static_cast(in_data_float[i]).x); } // transform double to float16 @@ -124,7 +124,7 @@ TEST(DataTypeTransform, CPUTransform) { TransDataType(kernel_fp64, kernel_fp16, in, &out); ptr = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(ptr[i].x, static_cast(in_data_double[i]).x); + EXPECT_EQ(ptr[i].x, static_cast(in_data_double[i]).x); } // transform int to float16 @@ -136,7 +136,7 @@ TEST(DataTypeTransform, CPUTransform) { TransDataType(kernel_int32, kernel_fp16, in, &out); ptr = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(ptr[i].x, static_cast(in_data_int[i]).x); + EXPECT_EQ(ptr[i].x, static_cast(in_data_int[i]).x); } // transform int64 to float16 @@ -148,7 +148,7 @@ TEST(DataTypeTransform, CPUTransform) { TransDataType(kernel_int64, kernel_fp16, in, &out); ptr = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(ptr[i].x, static_cast(in_data_int64[i]).x); + EXPECT_EQ(ptr[i].x, static_cast(in_data_int64[i]).x); } // transform bool to float16 @@ -160,7 +160,7 @@ TEST(DataTypeTransform, CPUTransform) { TransDataType(kernel_bool, kernel_fp16, in, &out); ptr = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(ptr[i].x, static_cast(in_data_bool[i]).x); + EXPECT_EQ(ptr[i].x, static_cast(in_data_bool[i]).x); } } } diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu index 3939bc5e75..de389ddabc 100644 --- a/paddle/fluid/framework/data_type_transform_test.cu +++ b/paddle/fluid/framework/data_type_transform_test.cu @@ -49,15 +49,16 @@ TEST(DataTypeTransform, GPUTransform) { float arr[6] = {0, 1, 2, 3, 4, 5}; int data_number = sizeof(arr) / sizeof(arr[0]); memcpy(in_ptr, arr, sizeof(arr)); - TensorCopy(in, gpu_place, context, &in_gpu); + TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); TransDataType(kernel_fp32, kernel_fp64, in_gpu, &out_gpu); TensorCopy(out_gpu, cpu_place, context, &out); context.Wait(); double* out_data_double = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_double[i], static_cast(arr[i])); + EXPECT_EQ(out_data_double[i], static_cast(arr[i])); } TransDataType(kernel_fp32, kernel_int32, in_gpu, &out_gpu); @@ -66,7 +67,7 @@ TEST(DataTypeTransform, GPUTransform) { int* out_data_int = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_int[i], static_cast(arr[i])); + EXPECT_EQ(out_data_int[i], static_cast(arr[i])); } } @@ -83,6 +84,7 @@ TEST(DataTypeTransform, GPUTransform) { int data_number = sizeof(arr) / sizeof(arr[0]); memcpy(ptr, arr, sizeof(arr)); TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); // transform from float16 to other data types TransDataType(kernel_fp16, kernel_fp32, in_gpu, &out_gpu); @@ -91,7 +93,7 @@ TEST(DataTypeTransform, GPUTransform) { float* out_data_float = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_float[i], static_cast(ptr[i])); + EXPECT_EQ(out_data_float[i], static_cast(ptr[i])); } TransDataType(kernel_fp16, kernel_fp64, in_gpu, &out_gpu); @@ -100,7 +102,7 @@ TEST(DataTypeTransform, GPUTransform) { double* out_data_double = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_double[i], static_cast(ptr[i])); + EXPECT_EQ(out_data_double[i], static_cast(ptr[i])); } TransDataType(kernel_fp16, kernel_int32, in_gpu, &out_gpu); @@ -109,7 +111,7 @@ TEST(DataTypeTransform, GPUTransform) { int* out_data_int = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_int[i], static_cast(ptr[i])); + EXPECT_EQ(out_data_int[i], static_cast(ptr[i])); } TransDataType(kernel_fp16, kernel_int64, in_gpu, &out_gpu); @@ -118,7 +120,7 @@ TEST(DataTypeTransform, GPUTransform) { int64_t* out_data_int64 = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_int64[i], static_cast(ptr[i])); + EXPECT_EQ(out_data_int64[i], static_cast(ptr[i])); } TransDataType(kernel_fp16, kernel_bool, in_gpu, &out_gpu); @@ -127,7 +129,7 @@ TEST(DataTypeTransform, GPUTransform) { bool* out_data_bool = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_bool[i], static_cast(ptr[i])); + EXPECT_EQ(out_data_bool[i], static_cast(ptr[i])); } // transform float to float16 @@ -137,13 +139,14 @@ TEST(DataTypeTransform, GPUTransform) { } TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu); TensorCopy(out_gpu, cpu_place, context, &out); context.Wait(); ptr = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(ptr[i].x, static_cast(in_data_float[i]).x); + EXPECT_EQ(ptr[i].x, static_cast(in_data_float[i]).x); } // transform double to float16 @@ -154,13 +157,14 @@ TEST(DataTypeTransform, GPUTransform) { } TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); TransDataType(kernel_fp64, kernel_fp16, in_gpu, &out_gpu); TensorCopy(out_gpu, cpu_place, context, &out); context.Wait(); ptr = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(ptr[i].x, static_cast(in_data_double[i]).x); + EXPECT_EQ(ptr[i].x, static_cast(in_data_double[i]).x); } // transform int to float16 @@ -170,13 +174,14 @@ TEST(DataTypeTransform, GPUTransform) { } TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); TransDataType(kernel_int32, kernel_fp16, in_gpu, &out_gpu); TensorCopy(out_gpu, cpu_place, context, &out); context.Wait(); ptr = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(ptr[i].x, static_cast(in_data_int[i]).x); + EXPECT_EQ(ptr[i].x, static_cast(in_data_int[i]).x); } // transform int64 to float16 @@ -187,13 +192,14 @@ TEST(DataTypeTransform, GPUTransform) { } TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); TransDataType(kernel_int64, kernel_fp16, in_gpu, &out_gpu); TensorCopy(out_gpu, cpu_place, context, &out); context.Wait(); ptr = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(ptr[i].x, static_cast(in_data_int64[i]).x); + EXPECT_EQ(ptr[i].x, static_cast(in_data_int64[i]).x); } // transform bool to float16 @@ -203,13 +209,14 @@ TEST(DataTypeTransform, GPUTransform) { } TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); TransDataType(kernel_bool, kernel_fp16, in_gpu, &out_gpu); TensorCopy(out_gpu, cpu_place, context, &out); context.Wait(); ptr = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(ptr[i].x, static_cast(in_data_bool[i]).x); + EXPECT_EQ(ptr[i].x, static_cast(in_data_bool[i]).x); } } } diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 961e3e22f2..f8e7d0d990 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -125,8 +125,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, for (auto& op_desc : block.AllOps()) { auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); - VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); + VLOG(4) << place_ << " " << op->DebugStringEx(local_scope); op->Run(*local_scope, place_); + VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); if (FLAGS_benchmark) { VLOG(2) << "Memory used after operator " + op->Type() + " running: " diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 9b465b85b0..8b7533ce71 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -187,7 +187,6 @@ bool TensorContainsInf(const framework::Tensor& tensor) { void TensorToStream(std::ostream& os, const Tensor& tensor, const platform::DeviceContext& dev_ctx) { - // TODO(typhoonzero): serialize to ostream { // the 1st field, uint32_t version constexpr uint32_t version = 0; os.write(reinterpret_cast(&version), sizeof(version)); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index bdb147955c..17ccca8cdc 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -5,7 +5,8 @@ cc_library(paddle_fluid_api DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) # Create static library -cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) +get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) +cc_library(paddle_fluid DEPS ${fluid_modules}) # Create shared library cc_library(paddle_fluid_shared SHARED diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 80eb988967..52e9c0baa6 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -22,14 +22,14 @@ namespace paddle { namespace inference { void ReadBinaryFile(const std::string& filename, std::string& contents) { - VLOG(3) << "loading model from " << filename; - std::ifstream inputfs(filename, std::ios::in | std::ios::binary); - inputfs.seekg(0, std::ios::end); + std::ifstream fin(filename, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); + fin.seekg(0, std::ios::end); contents.clear(); - contents.resize(inputfs.tellg()); - inputfs.seekg(0, std::ios::beg); - inputfs.read(&contents[0], contents.size()); - inputfs.close(); + contents.resize(fin.tellg()); + fin.seekg(0, std::ios::beg); + fin.read(&contents[0], contents.size()); + fin.close(); } bool IsPersistable(const framework::VarDesc* var) { @@ -97,6 +97,7 @@ std::unique_ptr Load(framework::Executor& executor, const std::string& dirname) { std::string model_filename = dirname + "/__model__"; std::string program_desc_str; + VLOG(3) << "loading model from " << model_filename; ReadBinaryFile(model_filename, program_desc_str); std::unique_ptr main_program( diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc index d6fc51301b..e9a27171f1 100644 --- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc +++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc @@ -17,10 +17,13 @@ limitations under the License. */ #include "paddle/fluid/inference/tests/test_helper.h" DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_int32(batch_size, 1, "Batch size of input data"); +DEFINE_int32(repeat, 1, "Running the inference program repeat times"); TEST(inference, image_classification) { - if (FLAGS_dirname.empty()) { - LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model " + "--batch_size=1 --repeat=1"; } LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; @@ -29,13 +32,11 @@ TEST(inference, image_classification) { // 0. Call `paddle::framework::InitDevices()` initialize all the devices // In unittests, this is done in paddle/testing/paddle_gtest_main.cc - int64_t batch_size = 1; - paddle::framework::LoDTensor input; // Use normilized image pixels as input data, // which should be in the range [0.0, 1.0]. SetupTensor(input, - {batch_size, 3, 32, 32}, + {FLAGS_batch_size, 3, 32, 32}, static_cast(0), static_cast(1)); std::vector cpu_feeds; @@ -46,7 +47,9 @@ TEST(inference, image_classification) { cpu_fetchs1.push_back(&output1); // Run inference on CPU - TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << "--- CPU Runs: ---"; + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat); LOG(INFO) << output1.dims(); #ifdef PADDLE_WITH_CUDA @@ -55,7 +58,9 @@ TEST(inference, image_classification) { cpu_fetchs2.push_back(&output2); // Run inference on CUDA GPU - TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << "--- GPU Runs: ---"; + TestInference( + dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat); LOG(INFO) << output2.dims(); CheckError(output1, output2); diff --git a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc index 99bee94cb8..1fb0f9e777 100644 --- a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc +++ b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc @@ -17,10 +17,13 @@ limitations under the License. */ #include "paddle/fluid/inference/tests/test_helper.h" DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_int32(batch_size, 1, "Batch size of input data"); +DEFINE_int32(repeat, 1, "Running the inference program repeat times"); TEST(inference, recognize_digits) { - if (FLAGS_dirname.empty()) { - LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model " + "--batch_size=1 --repeat=1"; } LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; @@ -29,77 +32,39 @@ TEST(inference, recognize_digits) { // 0. Call `paddle::framework::InitDevices()` initialize all the devices // In unittests, this is done in paddle/testing/paddle_gtest_main.cc - int64_t batch_size = 1; - paddle::framework::LoDTensor input; // Use normilized image pixels as input data, // which should be in the range [-1.0, 1.0]. SetupTensor(input, - {batch_size, 1, 28, 28}, + {FLAGS_batch_size, 1, 28, 28}, static_cast(-1), static_cast(1)); std::vector cpu_feeds; cpu_feeds.push_back(&input); - paddle::framework::LoDTensor output1; - std::vector cpu_fetchs1; - cpu_fetchs1.push_back(&output1); + for (auto is_combined : {false, true}) { + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); - // Run inference on CPU - TestInference(dirname, cpu_feeds, cpu_fetchs1); - LOG(INFO) << output1.dims(); + // Run inference on CPU + LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---"; + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined); + LOG(INFO) << output1.dims(); #ifdef PADDLE_WITH_CUDA - paddle::framework::LoDTensor output2; - std::vector cpu_fetchs2; - cpu_fetchs2.push_back(&output2); + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); - // Run inference on CUDA GPU - TestInference(dirname, cpu_feeds, cpu_fetchs2); - LOG(INFO) << output2.dims(); + // Run inference on CUDA GPU + LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---"; + TestInference( + dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined); + LOG(INFO) << output2.dims(); - CheckError(output1, output2); + CheckError(output1, output2); #endif -} - -TEST(inference, recognize_digits_combine) { - if (FLAGS_dirname.empty()) { - LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; } - - LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; - std::string dirname = FLAGS_dirname; - - // 0. Call `paddle::framework::InitDevices()` initialize all the devices - // In unittests, this is done in paddle/testing/paddle_gtest_main.cc - - paddle::framework::LoDTensor input; - // Use normilized image pixels as input data, - // which should be in the range [-1.0, 1.0]. - SetupTensor( - input, {1, 1, 28, 28}, static_cast(-1), static_cast(1)); - std::vector cpu_feeds; - cpu_feeds.push_back(&input); - - paddle::framework::LoDTensor output1; - std::vector cpu_fetchs1; - cpu_fetchs1.push_back(&output1); - - // Run inference on CPU - TestInference( - dirname, cpu_feeds, cpu_fetchs1); - LOG(INFO) << output1.dims(); - -#ifdef PADDLE_WITH_CUDA - paddle::framework::LoDTensor output2; - std::vector cpu_fetchs2; - cpu_fetchs2.push_back(&output2); - - // Run inference on CUDA GPU - TestInference( - dirname, cpu_feeds, cpu_fetchs2); - LOG(INFO) << output2.dims(); - - CheckError(output1, output2); -#endif } diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 49518e50d8..0f5fe6d0aa 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -15,6 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/profiler.h" template void SetupTensor(paddle::framework::LoDTensor& input, @@ -87,31 +88,60 @@ void CheckError(paddle::framework::LoDTensor& output1, EXPECT_EQ(count, 0U) << "There are " << count << " different elements."; } -template +template void TestInference(const std::string& dirname, const std::vector& cpu_feeds, - std::vector& cpu_fetchs) { + std::vector& cpu_fetchs, + const int repeat = 1, + const bool is_combined = false) { // 1. Define place, executor, scope auto place = Place(); auto executor = paddle::framework::Executor(place); auto* scope = new paddle::framework::Scope(); + // Profile the performance + paddle::platform::ProfilerState state; + if (paddle::platform::is_cpu_place(place)) { + state = paddle::platform::ProfilerState::kCPU; + } else { +#ifdef PADDLE_WITH_CUDA + state = paddle::platform::ProfilerState::kCUDA; + // The default device_id of paddle::platform::CUDAPlace is 0. + // Users can get the device_id using: + // int device_id = place.GetDeviceId(); + paddle::platform::SetDeviceId(0); +#else + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); +#endif + } + + // Enable the profiler + paddle::platform::EnableProfiler(state); + // 2. Initialize the inference_program and load parameters std::unique_ptr inference_program; - if (IsCombined) { - // All parameters are saved in a single file. - // Hard-coding the file names of program and parameters in unittest. - // The file names should be consistent with that used in Python API - // `fluid.io.save_inference_model`. - std::string prog_filename = "__model_combined__"; - std::string param_filename = "__params_combined__"; - inference_program = paddle::inference::Load(executor, - *scope, - dirname + "/" + prog_filename, - dirname + "/" + param_filename); - } else { - // Parameters are saved in separate files sited in the specified `dirname`. - inference_program = paddle::inference::Load(executor, *scope, dirname); + { + paddle::platform::RecordEvent record_event( + "init_program", + paddle::platform::DeviceContextPool::Instance().Get(place)); + + if (is_combined) { + // All parameters are saved in a single file. + // Hard-coding the file names of program and parameters in unittest. + // The file names should be consistent with that used in Python API + // `fluid.io.save_inference_model`. + std::string prog_filename = "__model_combined__"; + std::string param_filename = "__params_combined__"; + inference_program = + paddle::inference::Load(executor, + *scope, + dirname + "/" + prog_filename, + dirname + "/" + param_filename); + } else { + // Parameters are saved in separate files sited in the specified + // `dirname`. + inference_program = paddle::inference::Load(executor, *scope, dirname); + } } // 3. Get the feed_target_names and fetch_target_names @@ -134,7 +164,21 @@ void TestInference(const std::string& dirname, } // 6. Run the inference program - executor.Run(*inference_program, scope, feed_targets, fetch_targets); + { + // Run repeat times to profile the performance + for (int i = 0; i < repeat; ++i) { + paddle::platform::RecordEvent record_event( + "run_inference", + paddle::platform::DeviceContextPool::Instance().Get(place)); + + executor.Run(*inference_program, scope, feed_targets, fetch_targets); + } + } + + // Disable the profiler and print the timing information + paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, + "profiler.txt"); + paddle::platform::ResetProfiler(); delete scope; } diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt index 0581bd2ac5..94395ccfbc 100644 --- a/paddle/fluid/operators/detail/CMakeLists.txt +++ b/paddle/fluid/operators/detail/CMakeLists.txt @@ -1,3 +1,6 @@ if(WITH_DISTRIBUTE) - grpc_library(sendrecvop_grpc SRCS sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) + grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(test_serde.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_test(serde_test SRCS test_serde.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc) endif() diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.cc b/paddle/fluid/operators/detail/bytebuffer_stream.cc new file mode 100644 index 0000000000..a9488156e0 --- /dev/null +++ b/paddle/fluid/operators/detail/bytebuffer_stream.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// NOTE: This file was originally created by tensorflow +// (https://github.com/tensorflow/tensorflow/) we borrow this +// file and did some modifications so that we can send gRPC +// requests without too much copying of the tensor data. + +#include "bytebuffer_stream.h" + +namespace paddle { +namespace operators { +namespace detail { + +GrpcByteBufferSource::GrpcByteBufferSource() {} + +bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) { + cur_ = -1; + left_ = 0; + ptr_ = nullptr; + byte_count_ = 0; + bool ok = src.Dump(&slices_).ok(); + if (!ok) { + slices_.clear(); + } + return ok; +} + +bool GrpcByteBufferSource::Next(const void** data, int* size) { + // Use loop instead of if in case buffer contained empty slices. + while (left_ == 0) { + // Advance to next slice. + cur_++; + if (cur_ >= slices_.size()) { + return false; + } + const ::grpc::Slice& s = slices_[cur_]; + left_ = s.size(); + ptr_ = reinterpret_cast(s.begin()); + } + + *data = ptr_; + *size = left_; + byte_count_ += left_; + ptr_ += left_; + left_ = 0; + return true; +} + +void GrpcByteBufferSource::BackUp(int count) { + ptr_ -= count; + left_ += count; + byte_count_ -= count; +} + +bool GrpcByteBufferSource::Skip(int count) { + const void* data; + int size; + while (Next(&data, &size)) { + if (size >= count) { + BackUp(size - count); + return true; + } + // size < count; + count -= size; + } + // error or we have too large count; + return false; +} + +google::protobuf::int64 GrpcByteBufferSource::ByteCount() const { + return byte_count_; +} + +} // namespace detail +} // namespace operators +} // namespace paddle \ No newline at end of file diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.h b/paddle/fluid/operators/detail/bytebuffer_stream.h new file mode 100644 index 0000000000..099deb12d0 --- /dev/null +++ b/paddle/fluid/operators/detail/bytebuffer_stream.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// NOTE: This file was originally created by tensorflow +// (https://github.com/tensorflow/tensorflow/) we borrow this +// file and did some modifications so that we can send gRPC +// requests without too much copying of the tensor data. + +#pragma once + +#include +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" + +namespace paddle { +namespace operators { +namespace detail { + +// A ZeroCopyInputStream that reads from a grpc::ByteBuffer. +class GrpcByteBufferSource + : public ::google::protobuf::io::ZeroCopyInputStream { + public: + GrpcByteBufferSource(); + bool Init(const ::grpc::ByteBuffer& src); // Can be called multiple times. + bool Next(const void** data, int* size) override; + void BackUp(int count) override; + bool Skip(int count) override; + ::google::protobuf::int64 ByteCount() const override; + + private: + std::vector<::grpc::Slice> slices_; + size_t cur_; // Current slice index. + int left_; // Number of bytes in slices_[cur_] left to yield. + const char* ptr_; // Address of next byte in slices_[cur_] to yield. + ::google::protobuf::int64 byte_count_; +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/proto_encoder_helper.h b/paddle/fluid/operators/detail/proto_encoder_helper.h new file mode 100644 index 0000000000..4a7bfb8bd5 --- /dev/null +++ b/paddle/fluid/operators/detail/proto_encoder_helper.h @@ -0,0 +1,147 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// NOTE: This file was originally created by tensorflow +// (https://github.com/tensorflow/tensorflow/) we borrow this +// file and did some modifications so that we can send gRPC +// requests without too much copying of the tensor data. + +#pragma once + +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace detail { + +char* EncodeVarint32(char* dst, uint32_t v) { + // Operate on characters as unsigneds + unsigned char* ptr = reinterpret_cast(dst); + static const int B = 128; + if (v < (1 << 7)) { + *(ptr++) = v; + } else if (v < (1 << 14)) { + *(ptr++) = v | B; + *(ptr++) = v >> 7; + } else if (v < (1 << 21)) { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = v >> 14; + } else if (v < (1 << 28)) { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = v >> 21; + } else { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = (v >> 21) | B; + *(ptr++) = v >> 28; + } + return reinterpret_cast(ptr); +} + +char* EncodeVarint64(char* dst, uint64_t v) { + static const int B = 128; + unsigned char* ptr = reinterpret_cast(dst); + while (v >= B) { + *(ptr++) = (v & (B - 1)) | B; + v >>= 7; + } + *(ptr++) = static_cast(v); + return reinterpret_cast(ptr); +} + +int VarintLength(uint64_t v) { + int len = 1; + while (v >= 128) { + v >>= 7; + len++; + } + return len; +} + +class ProtoEncodeHelper { + public: + ProtoEncodeHelper(char* buf, int max_size) + : base_(buf), p_(buf), limit_(base_ + max_size) {} + + ~ProtoEncodeHelper() { + // Make sure callers didn't do operations that went over max_size promised + PADDLE_ENFORCE_LE(p_, limit_); + } + + const char* data() const { return base_; } + size_t size() const { return p_ - base_; } + + void WriteUint64(int tag, uint64_t v) { + Encode32(combine(tag, WIRETYPE_VARINT)); + Encode64(v); + } + void WriteBool(int tag, bool v) { + Encode32(combine(tag, WIRETYPE_VARINT)); + EncodeBool(v); + } + void WriteString(int tag, const std::string& v) { + Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); + Encode32(v.size()); + EncodeBytes(v.data(), v.size()); + } + void WriteVarlengthBeginning(int tag, uint32_t len) { + Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); + Encode32(len); + } + void WriteRawBytes(const std::string& v) { EncodeBytes(v.data(), v.size()); } + + private: + // Note: this module's behavior must match the protocol buffer wire encoding + // format. + enum { + WIRETYPE_VARINT = 0, + WIRETYPE_LENGTH_DELIMITED = 2, + }; + static uint32_t combine(uint32_t tag, uint32_t type) { + return ((tag << 3) | type); + } + inline void Encode32(uint32_t v) { + if (v < 128) { + // Fast path for single-byte values. Many of the calls will use a + // constant value for v, so the comparison will get optimized away + // when Encode32 is inlined into the caller. + *p_ = v; + p_++; + } else { + p_ = EncodeVarint32(p_, v); + } + } + void Encode64(uint64_t v) { p_ = EncodeVarint64(p_, v); } + void EncodeBool(bool v) { + *p_ = (v ? 1 : 0); // Equal to varint32 encoding of 0 or 1 + p_++; + } + void EncodeBytes(const char* bytes, int N) { + memcpy(p_, bytes, N); + p_ += N; + } + + char* base_; + char* p_; + char* limit_; // Just for CHECKs +}; + +} // detail +} // operators +} // paddle diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto index 8f962b4c69..b0215d4a80 100644 --- a/paddle/fluid/operators/detail/send_recv.proto +++ b/paddle/fluid/operators/detail/send_recv.proto @@ -33,10 +33,34 @@ enum VarType { } message VariableMessage { + enum Type { + // Pod Types + BOOL = 0; + INT16 = 1; + INT32 = 2; + INT64 = 3; + FP16 = 4; + FP32 = 5; + FP64 = 6; + } + + message LodData { repeated int64 lod_data = 1; } + string varname = 1; // TODO(Yancey1989): reference framework::proto::VarDesc::VarType VarType type = 2; - bytes serialized = 3; + // bool persistable is not needed for sending. + // tensor info: + Type data_type = 3; + repeated int64 dims = 4; + + // lod details: + int64 lod_level = 5; + repeated LodData lod = 6; + // tensor data + bytes serialized = 7; + // selected_rows data + bytes rows = 8; } message VoidMessage {} diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc index 169fd40fd9..64d181f408 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.cc +++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc @@ -13,6 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/detail/bytebuffer_stream.h" +#include "paddle/fluid/operators/detail/proto_encoder_helper.h" namespace paddle { namespace operators { @@ -63,6 +68,242 @@ void DeserializeFromMessage(const sendrecv::VariableMessage& msg, } } +void SerializeToByteBuffer(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, + ::grpc::ByteBuffer* msg) { + using VarMsg = sendrecv::VariableMessage; + sendrecv::VariableMessage request; + std::string header; + request.AppendToString(&header); + // When using GPU, need to free the copied CPU buffer + // when the ByteBuffer destroies + // TODO(typhoonzero): add unref here, if we have dependent + // parallelism execution, need to know when to free the tensor. + DestroyCallback destroy_callback = [](void* backing) {}; + + void* buf = malloc(1024); + void* payload; + size_t payload_size; + ProtoEncodeHelper e((char*)buf, 1024); + e.WriteString(VarMsg::kVarnameFieldNumber, name); + if (var->IsType()) { + e.WriteUint64(VarMsg::kTypeFieldNumber, 0); + } else if (var->IsType()) { + e.WriteUint64(VarMsg::kTypeFieldNumber, 1); + } + + switch (framework::ToVarType(var->Type())) { + case framework::proto::VarType_Type_LOD_TENSOR: { + auto tensor = var->Get(); + e.WriteUint64(VarMsg::kDataTypeFieldNumber, + framework::ToDataType(tensor.type())); + for (auto& dim : framework::vectorize(tensor.dims())) { + e.WriteUint64(VarMsg::kDimsFieldNumber, dim); + } + auto lod = tensor.lod(); // std::vector> + if (lod.size() > 0) { + e.WriteUint64(VarMsg::kLodLevelFieldNumber, lod.size()); + + for (auto& each : lod) { + e.WriteVarlengthBeginning(VarMsg::kLodFieldNumber, + 2 + // tag + varintlength of submessage + 1 + // kLodDataFieldNumber + each.size()); + // auto copied from GPU + for (auto& d : each) { + e.WriteUint64(VarMsg::LodData::kLodDataFieldNumber, d); + } + } + } + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE(platform::is_gpu_place(tensor.place())); + platform::CPUPlace cpu; + auto& gpu_dev_ctx = + static_cast(ctx); + auto copy_size = tensor.memory_size(); + payload = memory::Alloc(cpu, copy_size); + memory::Copy(cpu, payload, + boost::get(tensor.place()), + reinterpret_cast(tensor.data()), + copy_size, gpu_dev_ctx.stream()); + destroy_callback = [](void* backing) { + std::cout << "destroy payload" << std::endl; + platform::CPUPlace cpu; + memory::Free(cpu, backing); + }; +#endif + } else { + payload = tensor.data(); + } + payload_size = tensor.memory_size(); + + std::string tmp(reinterpret_cast(payload), payload_size); + for (int i = 0; i < tmp.size(); ++i) { + printf("%02X ", tmp.data()[i]); + } + printf("\n"); + e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size); + } break; + case framework::proto::VarType_Type_SELECTED_ROWS: { + // TODO(typhoonzero): selectedrows implement should not use unique_ptr + auto* slr = var->GetMutable(); + e.WriteUint64(VarMsg::kDataTypeFieldNumber, + framework::ToDataType(slr->value().type())); + for (auto& dim : framework::vectorize(slr->value().dims())) { + e.WriteUint64(VarMsg::kDimsFieldNumber, dim); + } + e.WriteUint64(VarMsg::kLodLevelFieldNumber, 0); + auto* tensor = slr->mutable_value(); + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + platform::CPUPlace cpu; + auto& gpu_dev_ctx = + static_cast(ctx); + auto copy_size = tensor->memory_size(); + payload = memory::Alloc(cpu, copy_size); + memory::Copy(cpu, payload, + boost::get(tensor->place()), + reinterpret_cast(tensor->data()), + copy_size, gpu_dev_ctx.stream()); + ctx.Wait(); + float* ttt = reinterpret_cast(payload); + for (int i = 0; i < copy_size / 4; i++) { + std::cout << "copied to cpu: " << ttt[i] << std::endl; + } + destroy_callback = [](void* backing) { + std::cout << "destroy..." << std::endl; + // platform::CPUPlace cpu; + // memory::Free(cpu, backing); + }; +#endif + } else { + payload = slr->mutable_value()->data(); + } + payload_size = tensor->memory_size(); + e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size); + } break; + default: + PADDLE_THROW("Serialize does not support type: %s", + typeid(var->Type()).name()); + break; + } + // steal reference of tensor data + ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows + int num_slices = 2; // only SelectedRows have rows buffer + slices[0] = ::grpc::Slice(e.size()); + memcpy(const_cast(slices[0].begin()), e.data(), e.size()); + slices[1] = ::grpc::Slice( + grpc_slice_new_with_user_data(payload, payload_size, destroy_callback, + static_cast(payload)), + ::grpc::Slice::STEAL_REF); + + if (framework::ToVarType(var->Type()) == + framework::proto::VarType_Type_SELECTED_ROWS) { + auto* slr = var->GetMutable(); + + ProtoEncodeHelper e2((char*)buf, 128); + // NOTE: rows is of type int64_t + size_t rows_memory_size = + slr->rows().capacity() * framework::SizeOfType(typeid(int64_t)); + e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); + slices[2] = ::grpc::Slice(e2.size()); + memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); + + slices[3] = ::grpc::Slice( + grpc_slice_new_with_user_data( + const_cast( + reinterpret_cast(slr->rows().data())), + rows_memory_size, + [](void* backing) { + // TODO(typhoonzero): add unref here, same as above. + }, + const_cast( + reinterpret_cast(slr->rows().data()))), + ::grpc::Slice::STEAL_REF); + num_slices = 4; + } + + ::grpc::ByteBuffer tmp(&slices[0], num_slices); + msg->Swap(&tmp); +} + +void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, + const platform::DeviceContext& ctx, + framework::Variable* var) { + sendrecv::VariableMessage meta; + GrpcByteBufferSource source; + source.Init(msg); + ::google::protobuf::io::CodedInputStream input(&source); + // do zerocopy parsing + PADDLE_ENFORCE(meta.ParseFromCodedStream(&input)); + PADDLE_ENFORCE(input.ConsumedEntireMessage()); + // dims is needed by both tensor and selectedrows + std::vector vecdims; + for (auto& d : meta.dims()) { + vecdims.push_back(d); + } + framework::DDim dims = framework::make_ddim(vecdims); + + if (meta.type() == sendrecv::LOD_TENSOR) { + auto* tensor = var->GetMutable(); + tensor->Resize(dims); + void* tensor_data = tensor->mutable_data( + ctx.GetPlace(), + paddle::operators::detail::ToTypeIndex(meta.data_type())); + framework::LoD lod; + for (int i = 0; i < meta.lod_level(); ++i) { + framework::Vector v; + for (int j = 0; j < meta.lod(i).lod_data_size(); ++j) { + v.push_back(meta.lod(i).lod_data(j)); + } + lod.push_back(v); + } + tensor->set_lod(lod); + // How to avoid copying and use the message buffer directly? + // Maybe need to find a way to release all memory except tensor content. + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + platform::CPUPlace cpu; + auto& gpu_dev_ctx = static_cast(ctx); + memory::Copy(boost::get(tensor->place()), + tensor_data, cpu, + reinterpret_cast(meta.serialized().data()), + meta.serialized().size(), gpu_dev_ctx.stream()); +#endif + } else { + memcpy(tensor_data, + reinterpret_cast(meta.serialized().data()), + meta.serialized().size()); + } + } else if (meta.type() == sendrecv::SELECTED_ROWS) { + auto* slr = var->GetMutable(); + auto* tensor = slr->mutable_value(); + int64_t* rows_data = slr->mutable_rows()->data(); + tensor->Resize(dims); + void* tensor_data = tensor->mutable_data( + ctx.GetPlace(), + paddle::operators::detail::ToTypeIndex(meta.data_type())); + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + platform::CPUPlace cpu; + auto& gpu_dev_ctx = static_cast(ctx); + memory::Copy(boost::get(tensor->place()), + tensor_data, cpu, + reinterpret_cast(meta.serialized().data()), + meta.serialized().size(), gpu_dev_ctx.stream()); +#endif + } else { + memcpy(tensor_data, + reinterpret_cast(meta.serialized().data()), + meta.serialized().size()); + } + // copy rows CPU data, GPU data will be copied lazly + memcpy(rows_data, reinterpret_cast(meta.rows().data()), + meta.rows().size()); + } +} + } // namespace detail } // namespace operators -} // namespace paddle +} // namespace paddle \ No newline at end of file diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/detail/sendrecvop_utils.h index 670d0e1624..65704db5ae 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.h +++ b/paddle/fluid/operators/detail/sendrecvop_utils.h @@ -33,6 +33,14 @@ namespace detail { #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" +typedef void (*DestroyCallback)(void*); + +inline int64_t GetTimestamp() { + return std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); +} + void SerializeToMessage(const std::string& name, const framework::Variable* var, const platform::DeviceContext& ctx, sendrecv::VariableMessage* msg); @@ -40,6 +48,32 @@ void SerializeToMessage(const std::string& name, const framework::Variable* var, void DeserializeFromMessage(const sendrecv::VariableMessage& msg, const platform::DeviceContext& ctx, framework::Variable* var); + +void SerializeToByteBuffer(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, + ::grpc::ByteBuffer* msg); + +void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, + const platform::DeviceContext& ctx, + framework::Variable* var); + +inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) { + switch (type) { + case sendrecv::VariableMessage::FP32: + return typeid(float); // NOLINT + case sendrecv::VariableMessage::FP64: + return typeid(double); // NOLINT + case sendrecv::VariableMessage::INT32: + return typeid(int); // NOLINT + case sendrecv::VariableMessage::INT64: + return typeid(int64_t); // NOLINT + case sendrecv::VariableMessage::BOOL: + return typeid(bool); // NOLINT + default: + PADDLE_THROW("Not support type %d", type); + } +} + } // namespace detail } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/test_serde.cc b/paddle/fluid/operators/detail/test_serde.cc new file mode 100644 index 0000000000..8054c89ecf --- /dev/null +++ b/paddle/fluid/operators/detail/test_serde.cc @@ -0,0 +1,195 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace math = paddle::operators::math; +namespace memory = paddle::memory; + +void RunSerdeTestTensor(platform::Place place) { + // serialize var to ByteBuffer + framework::Variable var; + auto* tensor = var.GetMutable(); + tensor->Resize(framework::make_ddim({4, 8, 4, 2})); + framework::LoD lod; + lod.push_back(framework::Vector({1, 3, 8})); + tensor->set_lod(lod); + int tensor_numel = 4 * 8 * 4 * 2; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + float* orig_tensor_data = tensor->mutable_data(place); + math::set_constant(ctx, tensor, 31.9); + + ::grpc::ByteBuffer msg; + operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); + EXPECT_GT(msg.Length(), 0); + + // deserialize + std::vector<::grpc::Slice> slices; + (void)msg.Dump(&slices); + std::string tmp; + for (const auto& s : slices) { + tmp.append(reinterpret_cast(s.begin()), s.size()); + } + sendrecv::VariableMessage varmsg; + EXPECT_TRUE(varmsg.ParseFromString(tmp)); + EXPECT_EQ(varmsg.varname(), "myvar"); + EXPECT_EQ(varmsg.type(), 0); + EXPECT_EQ(varmsg.dims()[0], 4); + EXPECT_EQ(varmsg.dims()[1], 8); + EXPECT_EQ(varmsg.dims()[2], 4); + EXPECT_EQ(varmsg.dims()[3], 2); + EXPECT_EQ(varmsg.lod_level(), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); + EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); + + const float* tensor_data = + reinterpret_cast(varmsg.serialized().data()); + for (int i = 0; i < varmsg.serialized().size(); ++i) { + printf("%02X ", varmsg.serialized().data()[i]); + } + printf("\n"); + for (int i = 0; i < tensor_numel; ++i) { + std::cout << "#####tensor data: " << tensor_data[i] << std::endl; + EXPECT_EQ(tensor_data[i], orig_tensor_data[i]); + std::cout << "test end 1 " << std::endl; + } + std::cout << "tensor data end " << std::endl; + + // deserialize zero-copy + framework::Variable var2; + operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); + auto tensor2 = var2.Get(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2.data()); + } + + EXPECT_EQ(varmsg.lod_level(), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); + EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); + for (int i = 0; i < tensor_numel; ++i) + EXPECT_EQ(tensor_data2[i], orig_tensor_data[i]); +} + +void RunSerdeTestSelectedRows(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + // serialize var to ByteBuffer + framework::Variable var; + auto* slr = var.GetMutable(); + auto* tensor = slr->mutable_value(); + auto* rows = slr->mutable_rows(); + + tensor->Resize(framework::make_ddim({2, 10})); + int tensor_numel = 2 * 10; + float* orig_tensor_data = tensor->mutable_data(place); + math::set_constant(ctx, tensor, 32.7); + rows->push_back(3); + rows->push_back(10); + + ::grpc::ByteBuffer msg; + operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); + EXPECT_GT(msg.Length(), 0); + + // deserialize + std::vector<::grpc::Slice> slices; + (void)msg.Dump(&slices); + std::string tmp; + for (const auto& s : slices) { + tmp.append(reinterpret_cast(s.begin()), s.size()); + } + sendrecv::VariableMessage varmsg; + EXPECT_TRUE(varmsg.ParseFromString(tmp)); + + EXPECT_EQ(varmsg.varname(), "myvar"); + EXPECT_EQ(varmsg.type(), 1); + + const float* tensor_data = + reinterpret_cast(varmsg.serialized().data()); + const int64_t* rows_data = + reinterpret_cast(varmsg.rows().data()); + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_EQ(tensor_data[i], orig_tensor_data[i]); + } + EXPECT_EQ(rows_data[0], 3); + EXPECT_EQ(rows_data[1], 10); + // deserialize zero-copy + framework::Variable var2; + operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); + + auto* slr2 = var2.GetMutable(); + auto* tensor2 = slr2->mutable_value(); + auto* rows2 = slr2->mutable_rows(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(*tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2->data()); + } + const int64_t* rows_data2 = rows2->data(); + + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_EQ(tensor_data2[i], orig_tensor_data[i]); + } + EXPECT_EQ(rows_data2[0], 3); + EXPECT_EQ(rows_data2[1], 10); +} + +// TEST(SelectedRows, CPU) { +// platform::CPUPlace place; +// RunSerdeTestSelectedRows(place); +// } + +// TEST(SelectedRows, GPU) { +// platform::CUDAPlace place; +// RunSerdeTestSelectedRows(place); +// } + +TEST(Tensor, CPU) { + platform::CPUPlace place; + RunSerdeTestTensor(place); +} + +TEST(Tensor, GPU) { + platform::CUDAPlace place; + RunSerdeTestTensor(place); +} \ No newline at end of file diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc index 9b8ca92537..73c84c2fe0 100644 --- a/paddle/fluid/operators/detection_map_op.cc +++ b/paddle/fluid/operators/detection_map_op.cc @@ -71,7 +71,7 @@ class DetectionMAPOp : public framework::OperatorWithKernel { return framework::OpKernelType( framework::ToDataType( ctx.Input("DetectRes")->type()), - ctx.device_context()); + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h index 637f8368f8..a009e9dfce 100644 --- a/paddle/fluid/operators/detection_map_op.h +++ b/paddle/fluid/operators/detection_map_op.h @@ -144,6 +144,15 @@ class DetectionMAPOpKernel : public framework::OpKernel { } } + inline void ClipBBox(const Box& bbox, Box* clipped_bbox) const { + T one = static_cast(1.0); + T zero = static_cast(0.0); + clipped_bbox->xmin = std::max(std::min(bbox.xmin, one), zero); + clipped_bbox->ymin = std::max(std::min(bbox.ymin, one), zero); + clipped_bbox->xmax = std::max(std::min(bbox.xmax, one), zero); + clipped_bbox->ymax = std::max(std::min(bbox.ymax, one), zero); + } + void GetBoxes(const framework::LoDTensor& input_label, const framework::LoDTensor& input_detect, std::vector>>& gt_boxes, @@ -360,7 +369,9 @@ class DetectionMAPOpKernel : public framework::OpKernel { size_t max_idx = 0; auto score = pred_boxes[i].first; for (size_t j = 0; j < matched_bboxes.size(); ++j) { - T overlap = JaccardOverlap(pred_boxes[i].second, matched_bboxes[j]); + Box& pred_box = pred_boxes[i].second; + ClipBBox(pred_box, &pred_box); + T overlap = JaccardOverlap(pred_box, matched_bboxes[j]); if (overlap > max_overlap) { max_overlap = overlap; max_idx = j; diff --git a/paddle/fluid/operators/elementwise_div_op.h b/paddle/fluid/operators/elementwise_div_op.h index 6bcc577456..95649ac46e 100644 --- a/paddle/fluid/operators/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise_div_op.h @@ -41,77 +41,14 @@ class ElementwiseDivKernel : public framework::OpKernel { }; template -struct ElementwiseDivGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { - auto y_e = framework::EigenVector::Flatten(*y); - auto z_e = framework::EigenVector::Flatten(*z); - auto dz_e = framework::EigenVector::Flatten(*dz); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e / y_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = -1.0 * dz_e * z_e / y_e; - } - } -}; - -template -struct ElementwiseDivBroadCastGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) - .broadcast(Eigen::DSizes(pre, 1)) - .reshape(Eigen::DSizes(x_e.size())); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e / y_e_bcast; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast)) - .reshape(Eigen::DSizes(pre, n)) - .sum(Eigen::array{{0}}); - } - } +struct DivGradDX { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; } }; template -struct ElementwiseDivBroadCast2GradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, - Post post) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) - .broadcast(Eigen::DSizes(pre, 1, post)) - .reshape(Eigen::DSizes(x_e.size())); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e / y_e_bcast; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast)) - .reshape(Eigen::DSizes(pre, n, post)) - .sum(Eigen::array{{0, 2}}); - } +struct DivGradDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return -dout * x / (y * y); } }; @@ -128,10 +65,8 @@ class ElementwiseDivGradKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - ElementwiseGradCompute, - ElementwiseDivBroadCastGradFunctor, - ElementwiseDivBroadCast2GradFunctor>( - ctx, x, y, out, dout, axis, dx, dy); + ElemwiseGradCompute, DivGradDY>( + ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX(), DivGradDY()); } }; diff --git a/paddle/fluid/operators/elementwise_max_op.h b/paddle/fluid/operators/elementwise_max_op.h index ab3a3d5827..527a18ee3b 100644 --- a/paddle/fluid/operators/elementwise_max_op.h +++ b/paddle/fluid/operators/elementwise_max_op.h @@ -41,76 +41,16 @@ class ElementwiseMaxKernel : public framework::OpKernel { }; template -struct ElementwiseMaxGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = (x_e > y_e).template cast() * dz_e; - } - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (x_e <= y_e).template cast() * dz_e; - } +struct MaxGradDx { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * (x > y); } }; template -struct ElementwiseMaxBroadCastGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) - .broadcast(Eigen::DSizes(pre, 1)) - .reshape(Eigen::DSizes(x_e.size())); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = (x_e > y_e_bcast).template cast() * dz_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = ((x_e <= y_e_bcast).template cast() * dz_e) - .reshape(Eigen::DSizes(pre, n)) - .sum(Eigen::array{{0}}); - } - } -}; - -template -struct ElementwiseMaxBroadCast2GradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, - Post post) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) - .broadcast(Eigen::DSizes(pre, 1, post)) - .reshape(Eigen::DSizes(x_e.size())); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = (x_e > y_e_bcast).template cast() * dz_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = ((x_e <= y_e_bcast).template cast() * dz_e) - .reshape(Eigen::DSizes(pre, n, post)) - .sum(Eigen::array{{0, 2}}); - } +struct MaxGradDy { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * (x <= y); } }; @@ -127,12 +67,9 @@ class ElementwiseMaxGradKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - ElementwiseGradCompute, - ElementwiseMaxBroadCastGradFunctor, - ElementwiseMaxBroadCast2GradFunctor>( - ctx, x, y, out, dout, axis, dx, dy); + ElemwiseGradCompute, MaxGradDy>( + ctx, *x, *y, *out, *dout, axis, dx, dy, MaxGradDx(), MaxGradDy()); } }; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise_min_op.h b/paddle/fluid/operators/elementwise_min_op.h index f0eec9d246..d4e5831463 100644 --- a/paddle/fluid/operators/elementwise_min_op.h +++ b/paddle/fluid/operators/elementwise_min_op.h @@ -41,76 +41,16 @@ class ElementwiseMinKernel : public framework::OpKernel { }; template -struct ElementwiseMinGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = (x_e < y_e).template cast() * dz_e; - } - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (x_e >= y_e).template cast() * dz_e; - } +struct MinGradDx { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * (x < y); } }; template -struct ElementwiseMinBroadCastGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) - .broadcast(Eigen::DSizes(pre, 1)) - .reshape(Eigen::DSizes(x_e.size())); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = (x_e < y_e_bcast).template cast() * dz_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = ((x_e >= y_e_bcast).template cast() * dz_e) - .reshape(Eigen::DSizes(pre, n)) - .sum(Eigen::array{{0}}); - } - } -}; - -template -struct ElementwiseMinBroadCast2GradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, - Post post) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) - .broadcast(Eigen::DSizes(pre, 1, post)) - .reshape(Eigen::DSizes(x_e.size())); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = (x_e < y_e_bcast).template cast() * dz_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = ((x_e >= y_e_bcast).template cast() * dz_e) - .reshape(Eigen::DSizes(pre, n, post)) - .sum(Eigen::array{{0, 2}}); - } +struct MinGradDy { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * (x >= y); } }; @@ -127,12 +67,9 @@ class ElementwiseMinGradKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - ElementwiseGradCompute, - ElementwiseMinBroadCastGradFunctor, - ElementwiseMinBroadCast2GradFunctor>( - ctx, x, y, out, dout, axis, dx, dy); + ElemwiseGradCompute, MinGradDy>( + ctx, *x, *y, *out, *dout, axis, dx, dy, MinGradDx(), MinGradDy()); } }; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise_mul_op.h b/paddle/fluid/operators/elementwise_mul_op.h index e2b59b3112..dc73cb6f23 100644 --- a/paddle/fluid/operators/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise_mul_op.h @@ -40,14 +40,15 @@ class ElementwiseMulKernel : public framework::OpKernel { }; template -struct IdentityGrad_DX { +struct MulGradDX { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; } }; template -struct IdentityGrad_DY { +struct MulGradDY { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } }; + template class ElementwiseMulGradKernel : public framework::OpKernel { public: @@ -61,10 +62,8 @@ class ElementwiseMulGradKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - ElemwiseGradCompute, - IdentityGrad_DY>(ctx, *x, *y, *out, *dout, axis, dx, - dy, IdentityGrad_DX(), - IdentityGrad_DY()); + ElemwiseGradCompute, MulGradDY>( + ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX(), MulGradDY()); } }; } // namespace operators diff --git a/paddle/fluid/operators/elementwise_sub_op.h b/paddle/fluid/operators/elementwise_sub_op.h index a8fc242ed7..fe088b8203 100644 --- a/paddle/fluid/operators/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise_sub_op.h @@ -40,61 +40,13 @@ class ElementwiseSubKernel : public framework::OpKernel { }; template -struct ElementwiseSubGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { - auto dz_e = framework::EigenVector::Flatten(*dz); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e; - } - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (-1.0) * dz_e; - } - } +struct SubGradDX { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } }; template -struct ElementwiseSubBroadCastGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { - auto dz_e = framework::EigenVector::Flatten(*dz); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (-1.0) * - dz_e.reshape(Eigen::DSizes(pre, n)) - .sum(Eigen::array{{0}}); - } - } -}; - -template -struct ElementwiseSubBroadCast2GradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, - Post post) { - auto dz_e = framework::EigenVector::Flatten(*dz); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (-1.0) * - dz_e.reshape(Eigen::DSizes(pre, n, post)) - .sum(Eigen::array{{0, 2}}); - } - } +struct SubGradDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; } }; template @@ -110,12 +62,9 @@ class ElementwiseSubGradKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - ElementwiseGradCompute, - ElementwiseSubBroadCastGradFunctor, - ElementwiseSubBroadCast2GradFunctor>( - ctx, x, y, out, dout, axis, dx, dy); + ElemwiseGradCompute, SubGradDY>( + ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX(), SubGradDY()); } }; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index f7f33917d7..35d251f71a 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -15,11 +15,23 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/math/math_function_impl.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { namespace math { +using float16 = paddle::platform::float16; + +template <> +void gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const float16* B, const float16 beta, + float16* C) { + PADDLE_THROW("float16 GEMM not supported on CPU"); +} + template <> void gemm( const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, @@ -46,6 +58,15 @@ void gemm( beta, C, ldc); } +template <> +void gemm( + const platform::CPUDeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const int lda, const float16* B, + const int ldb, const float16 beta, float16* C, const int ldc) { + PADDLE_THROW("float16 GEMM not supported on CPU"); +} + template <> void gemm( const platform::CPUDeviceContext& context, const bool transA, @@ -68,6 +89,15 @@ void gemm( lda, B, ldb, beta, C, ldc); } +template <> +void matmul( + const platform::CPUDeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float16 alpha, + framework::Tensor* matrix_out, float16 beta) { + PADDLE_THROW("float16 matmul not supported on CPU"); +} + template <> void matmul( const platform::CPUDeviceContext& context, @@ -126,6 +156,15 @@ void matmul( matrix_b.data(), beta, matrix_out->data()); } +template <> +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const float16* B, const float16 beta, + float16* C, const int batchCount, const int strideA, const int strideB) { + PADDLE_THROW("float16 batched_gemm not supported on CPU"); +} + #ifdef PADDLE_WITH_MKLML // Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize. template <> diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index f8d0349ac5..36655508be 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -16,11 +16,40 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function_impl.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { namespace math { +using float16 = paddle::platform::float16; + +template <> +void gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const float16* B, const float16 beta, + float16* C) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + const half h_alpha = static_cast(alpha); + const half h_beta = static_cast(beta); + const half* h_A = reinterpret_cast(A); + const half* h_B = reinterpret_cast(B); + half* h_C = reinterpret_cast(C); + + PADDLE_ENFORCE(platform::dynload::cublasHgemm( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, + h_A, lda, &h_beta, h_C, N)); +} + template <> void gemm( const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, @@ -60,6 +89,28 @@ void gemm( lda, &beta, C, N)); } +template <> +void gemm( + const platform::CUDADeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const int lda, const float16* B, + const int ldb, const float16 beta, float16* C, const int ldc) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T; + + const half h_alpha = static_cast(alpha); + const half h_beta = static_cast(beta); + const half* h_A = reinterpret_cast(A); + const half* h_B = reinterpret_cast(B); + half* h_C = reinterpret_cast(C); + + PADDLE_ENFORCE(platform::dynload::cublasHgemm( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, + h_A, lda, &h_beta, h_C, ldc)); +} + template <> void gemm( const platform::CUDADeviceContext& context, const bool transA, @@ -90,6 +141,35 @@ void gemm( lda, &beta, C, ldc)); } +template <> +void matmul( + const platform::CUDADeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float16 alpha, + framework::Tensor* matrix_out, float16 beta) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); + + PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) && + platform::is_gpu_place(matrix_b.place()) && + platform::is_gpu_place(matrix_out->place()), + "Matrix must all be in CUDAPlace"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; + + gemm( + context, transA, transB, M, N, K, alpha, matrix_a.data(), + matrix_b.data(), beta, matrix_out->data()); +} + template <> void matmul( const platform::CUDADeviceContext& context, @@ -148,6 +228,34 @@ void matmul( matrix_b.data(), beta, matrix_out->data()); } +template <> +void batched_gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const float16* B, const float16 beta, + float16* C, const int batchCount, const int strideA, const int strideB) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int strideC = M * N; + + const half h_alpha = static_cast(alpha); + const half h_beta = static_cast(beta); + const half* h_A = reinterpret_cast(A); + const half* h_B = reinterpret_cast(B); + half* h_C = reinterpret_cast(C); + + PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, + strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount)); +} + template <> void batched_gemm( const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu index 207d6a87bc..442e62d563 100644 --- a/paddle/fluid/operators/math/math_function_test.cu +++ b/paddle/fluid/operators/math/math_function_test.cu @@ -14,30 +14,41 @@ #include "gtest/gtest.h" #include "paddle/fluid/operators/math/math_function.h" -TEST(math_function, notrans_mul_trans) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input1_gpu; - paddle::framework::Tensor input2_gpu; - paddle::framework::Tensor out_gpu; - paddle::framework::Tensor out; - - auto* cpu_place = new paddle::platform::CPUPlace(); - float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); +void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size, + const std::vector& data) { + PADDLE_ENFORCE_EQ(size, data.size()); + for (size_t i = 0; i < data.size(); ++i) { + in_ptr[i] = paddle::platform::float16(data[i]); + } +} + +TEST(math_function, notrans_mul_trans_fp32) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor out_gpu; + Tensor out; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); + + float* input1_ptr = input1.mutable_data({2, 3}, cpu_place); float arr[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr, 6 * sizeof(float)); - auto* gpu_place = new paddle::platform::CUDAPlace(0); - paddle::platform::CUDADeviceContext context(*gpu_place); - - paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu); - paddle::framework::TensorCopy(input1, *gpu_place, context, &input2_gpu); + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input1, gpu_place, context, &input2_gpu); - out_gpu.mutable_data({2, 2}, *gpu_place); + out_gpu.mutable_data({2, 2}, gpu_place); - paddle::operators::math::matmul( + paddle::operators::math::matmul( context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); - paddle::framework::TensorCopy(out_gpu, *cpu_place, context, &out); + TensorCopy(out_gpu, cpu_place, context, &out); float* out_ptr = out.data(); context.Wait(); @@ -45,33 +56,71 @@ TEST(math_function, notrans_mul_trans) { EXPECT_EQ(out_ptr[1], 14); EXPECT_EQ(out_ptr[2], 14); EXPECT_EQ(out_ptr[3], 50); - delete gpu_place; } -TEST(math_function, trans_mul_notrans) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input1_gpu; - paddle::framework::Tensor input2_gpu; - paddle::framework::Tensor out_gpu; - paddle::framework::Tensor out; +TEST(math_function, notrans_mul_trans_fp16) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor out_gpu; + Tensor out; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); + + float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); + fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); + + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input1, gpu_place, context, &input2_gpu); + + out_gpu.mutable_data({2, 2}, gpu_place); + + paddle::operators::math::matmul( + context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu, + float16(0)); + + TensorCopy(out_gpu, cpu_place, context, &out); + + float16* out_ptr = out.data(); + context.Wait(); + EXPECT_EQ(static_cast(out_ptr[0]), 5); + EXPECT_EQ(static_cast(out_ptr[1]), 14); + EXPECT_EQ(static_cast(out_ptr[2]), 14); + EXPECT_EQ(static_cast(out_ptr[3]), 50); +} + +TEST(math_function, trans_mul_notrans_fp32) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor out_gpu; + Tensor out; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); - auto* cpu_place = new paddle::platform::CPUPlace(); - float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float* input1_ptr = input1.mutable_data({2, 3}, cpu_place); float arr[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr, 6 * sizeof(float)); - auto* gpu_place = new paddle::platform::CUDAPlace(0); - paddle::platform::CUDADeviceContext context(*gpu_place); + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input1, gpu_place, context, &input2_gpu); - paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu); - paddle::framework::TensorCopy(input1, *gpu_place, context, &input2_gpu); - - out_gpu.mutable_data({3, 3}, *gpu_place); + out_gpu.mutable_data({3, 3}, gpu_place); paddle::operators::math::matmul( context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); - paddle::framework::TensorCopy(out_gpu, *cpu_place, context, &out); + TensorCopy(out_gpu, cpu_place, context, &out); float* out_ptr = out.data(); context.Wait(); @@ -84,45 +133,88 @@ TEST(math_function, trans_mul_notrans) { EXPECT_EQ(out_ptr[6], 15); EXPECT_EQ(out_ptr[7], 22); EXPECT_EQ(out_ptr[8], 29); - delete gpu_place; } -TEST(math_function, gemm_notrans_cublas) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input2; - paddle::framework::Tensor input3; - paddle::framework::Tensor input1_gpu; - paddle::framework::Tensor input2_gpu; - paddle::framework::Tensor input3_gpu; +TEST(math_function, trans_mul_notrans_fp16) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor out_gpu; + Tensor out; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); + + float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); + fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); + + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input1, gpu_place, context, &input2_gpu); + + out_gpu.mutable_data({3, 3}, gpu_place); + + paddle::operators::math::matmul( + context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu, + float16(0)); + + TensorCopy(out_gpu, cpu_place, context, &out); + + float16* out_ptr = out.data(); + context.Wait(); + EXPECT_EQ(static_cast(out_ptr[0]), 9); + EXPECT_EQ(static_cast(out_ptr[1]), 12); + EXPECT_EQ(static_cast(out_ptr[2]), 15); + EXPECT_EQ(static_cast(out_ptr[3]), 12); + EXPECT_EQ(static_cast(out_ptr[4]), 17); + EXPECT_EQ(static_cast(out_ptr[5]), 22); + EXPECT_EQ(static_cast(out_ptr[6]), 15); + EXPECT_EQ(static_cast(out_ptr[7]), 22); + EXPECT_EQ(static_cast(out_ptr[8]), 29); +} + +TEST(math_function, gemm_notrans_cublas_fp32) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input2; + Tensor input3; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor input3_gpu; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); int m = 2; int n = 3; int k = 3; - auto* cpu_place = new paddle::platform::CPUPlace(); - float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float* input1_ptr = input1.mutable_data({2, 3}, cpu_place); float arr1[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr1, 6 * sizeof(float)); - float* input2_ptr = input2.mutable_data({3, 4}, *cpu_place); + float* input2_ptr = input2.mutable_data({3, 4}, cpu_place); float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; memcpy(input2_ptr, arr2, 12 * sizeof(float)); - float* input3_ptr = input3.mutable_data({2, 4}, *cpu_place); + float* input3_ptr = input3.mutable_data({2, 4}, cpu_place); float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - auto* gpu_place = new paddle::platform::CUDAPlace(0); - paddle::platform::CUDADeviceContext context(*gpu_place); - - paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu); - paddle::framework::TensorCopy(input2, *gpu_place, context, &input2_gpu); - paddle::framework::TensorCopy(input3, *gpu_place, context, &input3_gpu); + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input2, gpu_place, context, &input2_gpu); + TensorCopy(input3, gpu_place, context, &input3_gpu); float* a = input1_gpu.data(); float* b = input2_gpu.data(); - float* c = input3_gpu.mutable_data(*gpu_place); + float* c = input3_gpu.mutable_data(gpu_place); paddle::operators::math::gemm( context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); - paddle::framework::TensorCopy(input3_gpu, *cpu_place, context, &input3); + TensorCopy(input3_gpu, cpu_place, context, &input3); // numpy code: // a = np.arange(6).reshape(2, 3) @@ -139,47 +231,105 @@ TEST(math_function, gemm_notrans_cublas) { EXPECT_EQ(input3_ptr[5], 73); EXPECT_EQ(input3_ptr[6], 86); EXPECT_EQ(input3_ptr[7], 99); - delete gpu_place; } -TEST(math_function, gemm_trans_cublas) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input2; - paddle::framework::Tensor input3; - paddle::framework::Tensor input1_gpu; - paddle::framework::Tensor input2_gpu; - paddle::framework::Tensor input3_gpu; +TEST(math_function, gemm_notrans_cublas_fp16) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input2; + Tensor input3; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor input3_gpu; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); + + int m = 2; + int n = 3; + int k = 3; + float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); + fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); + float16* input2_ptr = input2.mutable_data({3, 4}, cpu_place); + fill_fp16_data(input2_ptr, input2.numel(), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + float16* input3_ptr = input3.mutable_data({2, 4}, cpu_place); + fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); + + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input2, gpu_place, context, &input2_gpu); + TensorCopy(input3, gpu_place, context, &input3_gpu); + float16* a = input1_gpu.data(); + float16* b = input2_gpu.data(); + float16* c = input3_gpu.mutable_data(gpu_place); + + paddle::operators::math::gemm( + context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1), + c + 1, 4); + + TensorCopy(input3_gpu, cpu_place, context, &input3); + + // numpy code: + // a = np.arange(6).reshape(2, 3) + // b = np.arange(12).reshape(3, 4)[:, 1:] + // c = np.arange(8).reshape(2, 4)[:, 1:] + // out = np.arange(8).reshape(2, 4) + // out[:, 1:] = np.dot(a, b) + c + context.Wait(); + EXPECT_EQ(static_cast(input3_ptr[0]), 0); + EXPECT_EQ(static_cast(input3_ptr[1]), 24); + EXPECT_EQ(static_cast(input3_ptr[2]), 28); + EXPECT_EQ(static_cast(input3_ptr[3]), 32); + EXPECT_EQ(static_cast(input3_ptr[4]), 4); + EXPECT_EQ(static_cast(input3_ptr[5]), 73); + EXPECT_EQ(static_cast(input3_ptr[6]), 86); + EXPECT_EQ(static_cast(input3_ptr[7]), 99); +} + +TEST(math_function, gemm_trans_cublas_fp32) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input2; + Tensor input3; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor input3_gpu; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); int m = 2; int n = 3; int k = 3; - auto* cpu_place = new paddle::platform::CPUPlace(); - float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float* input1_ptr = input1.mutable_data({2, 3}, cpu_place); float arr1[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr1, 6 * sizeof(float)); - float* input2_ptr = input2.mutable_data({4, 3}, *cpu_place); + float* input2_ptr = input2.mutable_data({4, 3}, cpu_place); float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}; memcpy(input2_ptr, arr2, 12 * sizeof(float)); - float* input3_ptr = input3.mutable_data({2, 4}, *cpu_place); + float* input3_ptr = input3.mutable_data({2, 4}, cpu_place); float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - auto* gpu_place = new paddle::platform::CUDAPlace(0); - paddle::platform::CUDADeviceContext context(*gpu_place); - - paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu); - paddle::framework::TensorCopy(input2, *gpu_place, context, &input2_gpu); - paddle::framework::TensorCopy(input3, *gpu_place, context, &input3_gpu); + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input2, gpu_place, context, &input2_gpu); + TensorCopy(input3, gpu_place, context, &input3_gpu); float* a = input1_gpu.data(); float* b = input2_gpu.data(); - float* c = input3_gpu.mutable_data(*gpu_place); + float* c = input3_gpu.mutable_data(gpu_place); paddle::operators::math::gemm( context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); - paddle::framework::TensorCopy(input3_gpu, *cpu_place, context, &input3); - context.Wait(); + TensorCopy(input3_gpu, cpu_place, context, &input3); + context.Wait(); EXPECT_EQ(input3_ptr[0], 0); EXPECT_EQ(input3_ptr[1], 24); EXPECT_EQ(input3_ptr[2], 28); @@ -188,27 +338,81 @@ TEST(math_function, gemm_trans_cublas) { EXPECT_EQ(input3_ptr[5], 73); EXPECT_EQ(input3_ptr[6], 86); EXPECT_EQ(input3_ptr[7], 99); - delete gpu_place; +} + +TEST(math_function, gemm_trans_cublas_fp16) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input2; + Tensor input3; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor input3_gpu; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); + + int m = 2; + int n = 3; + int k = 3; + float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); + fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); + float16* input2_ptr = input2.mutable_data({4, 3}, cpu_place); + fill_fp16_data(input2_ptr, input2.numel(), + {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}); + float16* input3_ptr = input3.mutable_data({2, 4}, cpu_place); + fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); + + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input2, gpu_place, context, &input2_gpu); + TensorCopy(input3, gpu_place, context, &input3_gpu); + float16* a = input1_gpu.data(); + float16* b = input2_gpu.data(); + float16* c = input3_gpu.mutable_data(gpu_place); + + paddle::operators::math::gemm( + context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1), + c + 1, 4); + + TensorCopy(input3_gpu, cpu_place, context, &input3); + + context.Wait(); + EXPECT_EQ(static_cast(input3_ptr[0]), 0); + EXPECT_EQ(static_cast(input3_ptr[1]), 24); + EXPECT_EQ(static_cast(input3_ptr[2]), 28); + EXPECT_EQ(static_cast(input3_ptr[3]), 32); + EXPECT_EQ(static_cast(input3_ptr[4]), 4); + EXPECT_EQ(static_cast(input3_ptr[5]), 73); + EXPECT_EQ(static_cast(input3_ptr[6]), 86); + EXPECT_EQ(static_cast(input3_ptr[7]), 99); } template void GemvTest(int m, int n, bool trans) { - paddle::framework::Tensor mat_a; - paddle::framework::Tensor vec_b; - paddle::framework::Tensor vec_c; - auto* cpu_place = new paddle::platform::CPUPlace(); - - T* data_a = mat_a.mutable_data({m, n}, *cpu_place); - T* data_b = vec_b.mutable_data({trans ? m : n}, *cpu_place); - T* data_c = vec_c.mutable_data({trans ? n : m}, *cpu_place); - - auto* gpu_place = new paddle::platform::CUDAPlace(0); - paddle::framework::Tensor g_mat_a; - paddle::framework::Tensor g_vec_b; - paddle::framework::Tensor g_vec_c; - T* g_data_a = g_mat_a.mutable_data(mat_a.dims(), *gpu_place); - T* g_data_b = g_vec_b.mutable_data(vec_b.dims(), *gpu_place); - T* g_data_c = g_vec_c.mutable_data(vec_c.dims(), *gpu_place); + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor mat_a; + Tensor vec_b; + Tensor vec_c; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); + + T* data_a = mat_a.mutable_data({m, n}, cpu_place); + T* data_b = vec_b.mutable_data({trans ? m : n}, cpu_place); + T* data_c = vec_c.mutable_data({trans ? n : m}, cpu_place); + + Tensor g_mat_a; + Tensor g_vec_b; + Tensor g_vec_c; + T* g_data_a = g_mat_a.mutable_data(mat_a.dims(), gpu_place); + T* g_data_b = g_vec_b.mutable_data(vec_b.dims(), gpu_place); + T* g_data_c = g_vec_c.mutable_data(vec_c.dims(), gpu_place); for (int i = 0; i < mat_a.numel(); ++i) { data_a[i] = static_cast(i); @@ -217,16 +421,14 @@ void GemvTest(int m, int n, bool trans) { data_b[i] = static_cast(i); } - paddle::platform::CUDADeviceContext context(*gpu_place); - paddle::framework::TensorCopy(mat_a, *gpu_place, context, &g_mat_a); - paddle::framework::TensorCopy(vec_b, *gpu_place, context, &g_vec_b); + TensorCopy(mat_a, gpu_place, context, &g_mat_a); + TensorCopy(vec_b, gpu_place, context, &g_vec_b); - paddle::operators::math::gemv( + paddle::operators::math::gemv( context, trans, static_cast(m), static_cast(n), 1., g_data_a, g_data_b, 0., g_data_c); - paddle::framework::TensorCopy(g_vec_c, paddle::platform::CPUPlace(), context, - &vec_c); + TensorCopy(g_vec_c, cpu_place, context, &vec_c); if (!trans) { for (int i = 0; i < m; ++i) { diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index e2fd8e90b6..78e00d5420 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -199,20 +199,29 @@ class DeviceTracerImpl : public DeviceTracer { return; } std::lock_guard l(trace_mu_); - cpu_records_.push_back( - CPURecord{anno, start_ns, end_ns, - std::hash{}(std::this_thread::get_id())}); + cpu_records_.push_back(CPURecord{anno, start_ns, end_ns, 0}); } void AddMemRecords(const std::string &name, uint64_t start_ns, uint64_t end_ns, uint32_t device_id, uint32_t stream_id, uint32_t correlation_id, uint64_t bytes) { + // 0 means timestamp information could not be collected for the kernel. + if (start_ns == 0 || end_ns == 0) { + VLOG(3) << name << " cannot be traced"; + return; + } + std::lock_guard l(trace_mu_); mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id, stream_id, correlation_id, bytes}); } void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, uint32_t stream_id, uint32_t correlation_id) { + // 0 means timestamp information could not be collected for the kernel. + if (start == 0 || end == 0) { + VLOG(3) << correlation_id << " cannot be traced"; + return; + } std::lock_guard l(trace_mu_); kernel_records_.push_back( KernelRecord{start, end, device_id, stream_id, correlation_id}); @@ -285,10 +294,10 @@ class DeviceTracerImpl : public DeviceTracer { event->set_device_id(r.device_id); event->mutable_memcopy()->set_bytes(r.bytes); } - std::string profile_str; - google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str); std::ofstream profile_f; profile_f.open(profile_path, std::ios::out | std::ios::trunc); + std::string profile_str; + profile_pb.SerializeToString(&profile_str); profile_f << profile_str; profile_f.close(); return profile_pb; diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 580ed9bb57..fa9041134d 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -68,6 +68,8 @@ extern void *cublas_dso_handle; __macro(cublasDgemv_v2); \ __macro(cublasSgemm_v2); \ __macro(cublasDgemm_v2); \ + __macro(cublasHgemm); \ + __macro(cublasSgemmEx); \ __macro(cublasSgeam_v2); \ __macro(cublasDgeam_v2); \ __macro(cublasCreate_v2); \ @@ -83,6 +85,7 @@ extern void *cublas_dso_handle; __macro(cublasDgemmStridedBatched); \ __macro(cublasCgemmStridedBatched); \ __macro(cublasZgemmStridedBatched); \ + __macro(cublasHgemmStridedBatched); \ __macro(cublasSgetrfBatched); \ __macro(cublasSgetriBatched); \ __macro(cublasDgetrfBatched); \ diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 094f9224f7..28ef3e04b1 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -178,7 +178,7 @@ void EnableProfiler(ProfilerState state) { } #ifdef PADDLE_WITH_CUDA if (g_state == ProfilerState::kCUDA) { - // Generate some dummy evenets first to reduce the startup overhead. + // Generate some dummy events first to reduce the startup overhead. for (int i = 0; i < 5; i++) { ForEachDevice([](int d) { DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d)); diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto index 06db7ed638..71b5a9b12e 100644 --- a/paddle/fluid/platform/profiler.proto +++ b/paddle/fluid/platform/profiler.proto @@ -15,7 +15,7 @@ limitations under the License. */ syntax = "proto2"; package paddle.platform.proto; -message MemCopy { optional uint64 bytes = 3; } +message MemCopy { optional uint64 bytes = 1; } message Event { optional string name = 1; diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc index 4a86d8ec62..fc77e0f321 100644 --- a/paddle/fluid/platform/profiler_test.cc +++ b/paddle/fluid/platform/profiler_test.cc @@ -75,6 +75,7 @@ TEST(RecordEvent, RecordEvent) { * ... * PopEvent(evt_name, dev_ctx); */ + LOG(INFO) << "Usage 1: PushEvent & PopEvent"; for (int loop = 0; loop < 3; ++loop) { for (int i = 1; i < 5; ++i) { std::string name = "op_" + std::to_string(i); @@ -93,6 +94,7 @@ TEST(RecordEvent, RecordEvent) { * ... * } */ + LOG(INFO) << "Usage 2: RecordEvent"; for (int i = 1; i < 5; ++i) { std::string name = "evs_op_" + std::to_string(i); RecordEvent record_event(name, dev_ctx); @@ -100,6 +102,34 @@ TEST(RecordEvent, RecordEvent) { while (counter != i * 1000) counter++; } + /* Usage 3 + * { + * RecordEvent record_event(name1, dev_ctx); + * ... + * code to be analyzed + * ... + * { + * RecordEvent nested_record_event(name2, dev_ctx); + * ... + * code to be analyzed + * ... + * } + * } + */ + LOG(INFO) << "Usage 3: nested RecordEvent"; + for (int i = 1; i < 5; ++i) { + std::string name = "ano_evs_op_" + std::to_string(i); + RecordEvent record_event(name, dev_ctx); + int counter = 1; + while (counter != i * 100) counter++; + { + std::string nested_name = "nested_ano_evs_op_" + std::to_string(i); + RecordEvent nested_record_event(nested_name, dev_ctx); + int nested_counter = 1; + while (nested_counter != i * 100) nested_counter++; + } + } + // Bad Usage: PushEvent("event_without_pop", dev_ctx); PopEvent("event_without_push", dev_ctx); diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 06319fc638..6be2bd8fad 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -213,7 +213,7 @@ function gen_fluid_inference_lib() { if [ ${WITH_C_API:-OFF} == "OFF" ] ; then cat < && cd .. && rm -rf nccl - CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"] diff --git a/tools/manylinux1/build_scripts/install_nccl2.sh b/tools/manylinux1/build_scripts/install_nccl2.sh new file mode 100644 index 0000000000..7efc1fe865 --- /dev/null +++ b/tools/manylinux1/build_scripts/install_nccl2.sh @@ -0,0 +1,18 @@ +#!/bin/bash +DEB="nccl-repo-ubuntu1604-2.1.4-ga-cuda8.0_1-1_amd64.deb" +DIR="/nccl2" +mkdir -p $DIR +# we cached the nccl2 deb package in BOS, so we can download it with wget +# install nccl2: http://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html#down +wget -O $DIR/$DEB \ + "http://nccl2-deb.gz.bcebos.com/nccl-repo-ubuntu1604-2.1.4-ga-cuda8.0_1-1_amd64.deb?responseContentDisposition=attachment" + +cd $DIR && ar x $DEB && tar xf data.tar.xz +DEBS=$(find ./var/ -name "*.deb") +for sub_deb in $DEBS; do + echo $sub_deb + ar x $sub_deb && tar xf data.tar.xz +done +mv -f usr/include/nccl.h /usr/local/include/ +mv -f usr/lib/libnccl* /usr/local/lib/ +rm -rf $DIR diff --git a/tools/timeline.py b/tools/timeline.py index d1d1dae2bd..ee83a1baec 100644 --- a/tools/timeline.py +++ b/tools/timeline.py @@ -159,7 +159,7 @@ if args.timeline_path: with open(profile_path, 'r') as f: profile_s = f.read() profile_pb = profiler_pb2.Profile() - text_format.Merge(profile_s, profile_pb) + profile_pb.ParseFromString(profile_s) tl = Timeline(profile_pb) with open(timeline_path, 'w') as f: