diff --git a/CMakeLists.txt b/CMakeLists.txt index df00e977eb..6aa2e1715b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -127,6 +127,9 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING "A path setting fluid shared and static libraries") +set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING + "A path setting fluid inference shared and static libraries") + if (WITH_C_API AND WITH_PYTHON) message(WARNING "It is suggest not embedded a python interpreter in Paddle " "when using C-API. It will give an unpredictable behavior when using a " diff --git a/README.md b/README.md index 46fdef5e37..8ee67f6642 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) -[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.0/getstarted/index_en.html) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) @@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle. 
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. -### Latest PaddlePaddle Release: [Fluid 0.15.0](https://github.com/PaddlePaddle/Paddle/tree/v0.15.0) +### Latest PaddlePaddle Release: [Fluid 1.0.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.0.0) ### Install Latest Stable Release: ``` # Linux CPU @@ -27,9 +27,9 @@ pip install paddlepaddle # Linux GPU cuda9cudnn7 pip install paddlepaddle-gpu # Linux GPU cuda8cudnn7 -pip install paddlepaddle-gpu==0.15.0.post87 +pip install paddlepaddle-gpu==1.0.1.post87 # Linux GPU cuda8cudnn5 -pip install paddlepaddle-gpu==0.15.0.post85 +pip install paddlepaddle-gpu==1.0.1.post85 # For installation on other platform, refer to http://paddlepaddle.org/ ``` @@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==0.15.0.post85 ## Installation -It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/install/install_doc.html) on our website. +It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) on our website. ## Documentation -We provide [English](http://paddlepaddle.org/documentation/docs/en/0.15.0/getstarted/index_en.html) and -[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/index.html) documentation. +We provide [English](http://paddlepaddle.org/documentation/docs/en/1.0.0/getstarted/index_en.html) and +[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) documentation. - [Deep Learning 101](https://github.com/PaddlePaddle/book) You might want to start from this online interactive book that can run in a Jupyter Notebook. 
-- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/user_guides/howto/training/cluster_howto.html) +- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.0/user_guides/howto/training/cluster_howto.html) You can run distributed training jobs on MPI clusters. -- [Python API](http://paddlepaddle.org/documentation/api/zh/0.15.0/fluid.html) +- [Python API](http://paddlepaddle.org/documentation/api/zh/1.0/fluid.html) Our new API enables much shorter programs. -- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/advanced_usage/development/contribute_to_paddle.html) +- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.0/advanced_usage/development/contribute_to_paddle.html) We appreciate your contributions! diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh old mode 100644 new mode 100755 diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 5bf82b4ddf..62227c6784 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -261,6 +261,13 @@ function(cc_library TARGET_NAME) add_dependencies(${TARGET_NAME} mklml) target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") endif() + # remove link to python, see notes at: + # https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually + if("${cc_library_DEPS};" MATCHES "python;") + list(REMOVE_ITEM cc_library_DEPS python) + add_dependencies(${TARGET_NAME} python) + target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup") + endif() target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) endif() @@ -311,6 +318,8 @@ function(cc_test TARGET_NAME) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) + 
# No unit test should exceed 10 minutes. + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) endif() endfunction(cc_test) @@ -629,6 +638,8 @@ function(py_test TARGET_NAME) PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + # No unit test should exceed 10 minutes. + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) endif() endfunction() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 077072f6ea..67cca09b64 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -18,7 +18,7 @@ function(copy TARGET) set(oneValueArgs "") set(multiValueArgs SRCS DSTS DEPS) cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE) + set(fluid_lib_dist_dep ${TARGET} ${fluid_lib_dist_dep} PARENT_SCOPE) list(LENGTH copy_lib_SRCS copy_lib_SRCS_len) list(LENGTH copy_lib_DSTS copy_lib_DSTS_len) @@ -150,16 +150,16 @@ if (WITH_ANAKIN AND WITH_MKL) SRCS ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api ${ANAKIN_INSTALL_DIR} # anakin release - DSTS ${dst_dir}/inference/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin) + DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin) list(APPEND inference_deps anakin_inference_lib) endif() set(module "inference") copy(inference_lib DEPS ${inference_deps} SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* - ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci + ${src_dir}/${module}/api/paddle_inference_api.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} + DSTS ${dst_dir}/${module} 
${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ) set(module "platform") @@ -185,20 +185,41 @@ copy(cmake_cache SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt DSTS ${FLUID_INSTALL_DIR}) -add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) +# This command generates a complete fluid library for both training and inference +add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) + +# The following commands generate an inference-only fluid library +# third_party, version.txt and CMakeCache.txt are placed at the same relative paths as in ${FLUID_INSTALL_DIR} +copy(third_party DEPS fluid_lib_dist + SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt + DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR} +) + +# The inference-only library only needs libpaddle_fluid.so/a and paddle_inference_api.h +copy(inference_api_lib DEPS fluid_lib_dist + SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.* + ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_inference_api.h + DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include +) + +add_custom_target(inference_lib_dist DEPENDS third_party inference_api_lib) # paddle fluid version -execute_process( - COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} - OUTPUT_VARIABLE PADDLE_GIT_COMMIT) -set(version_file ${FLUID_INSTALL_DIR}/version.txt) -file(WRITE ${version_file} - "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" - "WITH_MKL: ${WITH_MKL}\n" - "WITH_GPU: ${WITH_GPU}\n") -if(WITH_GPU) - file(APPEND ${version_file} - "CUDA version: ${CUDA_VERSION}\n" - "CUDNN version: v${CUDNN_MAJOR_VERSION}\n") -endif() +function(version version_file) + execute_process( + COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_GIT_COMMIT) + file(WRITE ${version_file} + "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" + "WITH_MKL: ${WITH_MKL}\n" + "WITH_MKLDNN: 
${WITH_MKLDNN}\n" + "WITH_GPU: ${WITH_GPU}\n") + if(WITH_GPU) + file(APPEND ${version_file} + "CUDA version: ${CUDA_VERSION}\n" + "CUDNN version: v${CUDNN_MAJOR_VERSION}\n") + endif() +endfunction() +version(${FLUID_INSTALL_DIR}/version.txt) +version(${FLUID_INFERENCE_INSTALL_DIR}/version.txt) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index c6dd919a93..19ef23cdfa 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -61,12 +61,12 @@ paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None)) +paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)) paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)) paddle.fluid.layers.sequence_pool 
ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn'], varargs=None, keywords=None, defaults=(None, None, False)) -paddle.fluid.layers.softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None)) +paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)) +paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None)) paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False)) @@ -75,7 +75,8 @@ paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'outp paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 
'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)) paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.sequence_unpad ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)) paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) @@ -84,6 +85,7 @@ paddle.fluid.layers.reduce_min ArgSpec(args=['input', 'dim', 'keep_dim', 'name'] paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.sequence_slice ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None)) paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)) paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 
'name'], varargs=None, keywords=None, defaults=(None,)) @@ -95,8 +97,8 @@ paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_ti paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) -paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None)) +paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) @@ -114,6 +116,7 @@ paddle.fluid.layers.pad ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], var paddle.fluid.layers.pad_constant_like ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, 
None)) paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None)) paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)) +paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)) paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)) paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR')) paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)) @@ -127,6 +130,7 @@ paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.margin_rank_loss ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None)) paddle.fluid.layers.elu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None)) paddle.fluid.layers.relu6 ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None)) paddle.fluid.layers.pow ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None)) @@ -170,6 +174,7 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], 
varargs=None, keywords=None paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 519a00fb07..48b36df649 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -12,6 +12,5 @@ endif(NOT WIN32) if(WITH_INFERENCE) # NOTE: please add subdirectory inference at last. add_subdirectory(inference) + add_subdirectory(train) endif() - -add_subdirectory(train) diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 9fbefabc84..d09b94a3fd 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -64,7 +64,8 @@ class OpHandleBase { virtual bool IsMultiDeviceTransfer() { return false; } const platform::DeviceContext *DeviceContext(platform::Place place) { - return dev_ctxes_[place]; + auto it = dev_ctxes_.find(place); + return it != dev_ctxes_.end() ? 
it->second : nullptr; } void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) { diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index d8c2bc40b9..a1f458c660 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -49,6 +49,8 @@ struct VarHandleBase { void AddOutput(OpHandleBase* out, ir::Node* node) { if (pending_ops_.find(out) == pending_ops_.end()) { + PADDLE_ENFORCE(out != nullptr, "The output of %s should not be nullptr", + this->Node()->Name()); pending_ops_.insert(out); node_->outputs.push_back(node); } diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 70ec6e90a4..b212666637 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -46,6 +46,41 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { VLOG(5) << "destroy ExecutorPrepareContext"; } +template +static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, + GarbageCollector* gc, + RefCntMap* ref_cnts) { + std::unordered_set erase_tensors; + + auto handler = [&](const VariableNameMap& name_map) { + for (auto& name_pair : name_map) { + for (auto& name : name_pair.second) { + auto it = ref_cnts->find(name); + if (it == ref_cnts->end()) continue; + if ((it->second)-- == 1) { + auto* var = scope.FindVar(name); + if (var != nullptr) { + VLOG(10) << "Erase tensor \'" << name << "\'"; + if (var->IsType()) { + erase_tensors.insert(var->GetMutable()); + } else if (var->IsType()) { + erase_tensors.insert( + var->GetMutable()->mutable_value()); + } + } + } + } + } + }; + + handler(op->Inputs()); + handler(op->Outputs()); + + if (!erase_tensors.empty()) { + gc->Add(erase_tensors); + } +} + Executor::Executor(const platform::Place& place) : place_(place) {} void Executor::Close() { @@ -66,7 +101,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { } else if (var_type == 
proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { @@ -331,9 +366,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } int64_t max_memory_size = GetEagerDeletionThreshold(); - std::unique_ptr> gc; - if (max_memory_size >= 0) { + // WhileOp would set keep_kids to true + // WhileGradOp would need the scopes created in WhileOp + // Perhaps, we should not perform eager deletion in WhileOp + // The scopes and variables created by WhileOp would be deleted + // in WhileGradOp. + if (max_memory_size >= 0 && !keep_kids) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { @@ -352,45 +391,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, op->Run(*local_scope, place_); if (gc != nullptr) { - std::vector erase_vars; - for (auto& input : op->Inputs()) { - for (auto& input_name : input.second) { - auto it = ctx->cur_ref_cnts_.find(input_name); - if (it == ctx->cur_ref_cnts_.end()) continue; - if (it->second == 1) { // should delete it - erase_vars.emplace_back(input_name); - ctx->cur_ref_cnts_.erase(input_name); - } else { - --(it->second); - } - } - } - - for (auto& output : op->Outputs()) { - for (auto& output_name : output.second) { - auto it = ctx->cur_ref_cnts_.find(output_name); - if (it == ctx->cur_ref_cnts_.end()) continue; - if (it->second == 1) { - erase_vars.emplace_back(output_name); - ctx->cur_ref_cnts_.erase(output_name); - } else { - --(it->second); - } - } - } - - if (!erase_vars.empty()) { - std::vector erase_tensors; - for (auto& name : erase_vars) { - auto* var = local_scope->FindVar(name); - if (var == nullptr) continue; - if (var->IsType()) { - auto* tensor = var->GetMutable(); - erase_tensors.push_back(tensor); - } - } - if 
(!erase_tensors.empty()) gc->Add(erase_tensors); - } + DeleteUnusedTensors(*local_scope, op.get(), gc.get(), + &(ctx->cur_ref_cnts_)); } if (FLAGS_benchmark) { diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index f0cc1338a8..36b36d49c2 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -32,38 +32,32 @@ template std::unordered_map GetNonPersistableReferenceCount( const ProgramDesc& prog, size_t block_id) { auto& block = prog.Block(block_id); - std::unordered_set ignored_vars; std::unordered_map ref_cnts; - for (auto var_desc : block.AllVars()) { - auto type = var_desc->Proto()->type().type(); - if (type != proto::VarType::LOD_TENSOR || var_desc->Persistable()) { - ignored_vars.insert(var_desc->Name()); // ignore persistable vars - } - } - - for (auto op_desc : block.AllOps()) { - for (auto& input : op_desc->Inputs()) { - for (auto& input_name : input.second) { - if (!ignored_vars.count(input_name)) { - if (ref_cnts.count(input_name)) - ++ref_cnts[input_name]; - else - ref_cnts[input_name] = 1; + auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) { + for (auto& name_pair : name_map) { + for (auto& name : name_pair.second) { + auto* var_desc = block.FindVar(name); + if (var_desc == nullptr || var_desc->Persistable()) continue; + auto type = var_desc->Proto()->type().type(); + if (type != proto::VarType::LOD_TENSOR && + type != proto::VarType::SELECTED_ROWS) { + continue; } - } - } - for (auto& output : op_desc->Outputs()) { - for (auto output_name : output.second) { - if (!ignored_vars.count(output_name)) { - if (ref_cnts.count(output_name)) - ++ref_cnts[output_name]; - else - ref_cnts[output_name] = 1; + auto it = ref_cnts.find(name); + if (it != ref_cnts.end()) { + ++it->second; + } else { + ref_cnts[name] = 1; } } } + }; + + for (auto op_desc : block.AllOps()) { + update_ref_cnts(op_desc, op_desc->Inputs()); + update_ref_cnts(op_desc, op_desc->Outputs()); } return 
ref_cnts; } diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 8e1f93c5eb..3e9353f5cf 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -27,8 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, // be created. VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; Variable* g_feed_value = scope->Var(var_name); - auto& feed_inputs = - *(g_feed_value->GetMutable>()); + auto& feed_inputs = *(g_feed_value->GetMutable()); if (index >= feed_inputs.size()) { feed_inputs.resize(index + 1); } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0076a8bece..a145b2fafe 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -10,7 +10,7 @@ function(pass_library TARGET DEST) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass ${op_library_DEPS}) + cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS}) # add more DEST here, such as train, dist and collect USE_PASS into a file automatically. 
if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference") message(STATUS "add pass ${TARGET} ${DEST}") @@ -25,19 +25,25 @@ cc_library(graph_helper SRCS graph_helper.cc DEPS graph) cc_library(pass SRCS pass.cc DEPS graph node graph_helper) cc_library(graph_traits SRCS graph_traits.cc DEPS graph) cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits) +cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass) pass_library(graph_to_program_pass base) pass_library(graph_viz_pass base) pass_library(fc_fuse_pass inference) -if (WITH_MKLDNN) - pass_library(conv_relu_mkldnn_fuse_pass inference) -endif () pass_library(attention_lstm_fuse_pass inference) pass_library(infer_clean_graph_pass inference) pass_library(fc_lstm_fuse_pass inference) pass_library(embedding_fc_lstm_fuse_pass inference) pass_library(fc_gru_fuse_pass inference) pass_library(seq_concat_fc_fuse_pass inference) +pass_library(conv_bn_fuse_pass inference) +pass_library(seqconv_eltadd_relu_fuse_pass inference) +if(WITH_MKLDNN) + pass_library(mkldnn_placement_pass base) + pass_library(conv_bias_mkldnn_fuse_pass inference) + pass_library(conv_relu_mkldnn_fuse_pass inference) + pass_library(conv_elementwise_add_mkldnn_fuse_pass inference) +endif() cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector ) @@ -53,4 +59,5 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) if (WITH_MKLDNN) cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) + cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass) endif () diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index 
1c75cb5a82..6090f1fe76 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -262,7 +262,7 @@ std::unique_ptr AttentionLSTMFusePass::ApplyImpl( std::unordered_set specified_vars({"data_lod_attention", "cell_init", "hidden_init", "data", "week", "minute"}); - int count = 0; + size_t count = 0; for (auto* node : graph->Nodes()) { if (node->IsVar() && specified_vars.count(node->Name())) { ++count; diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc new file mode 100644 index 0000000000..449cc78be1 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc @@ -0,0 +1,137 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +template +LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b, + BinaryOperation f) { + PADDLE_ENFORCE_EQ(vec_a.dims(), vec_b.dims()); + LoDTensor vec_y; + vec_y.Resize(vec_a.dims()); + const float* a = vec_a.data(); + const float* b = vec_b.data(); + float* y = vec_y.mutable_data(platform::CPUPlace()); + for (int i = 0; i < vec_a.numel(); i++) { + y[i] = f(a[i], b[i]); + } + return vec_y; +} + +std::unique_ptr ConvBiasFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init(name_scope_, graph.get()); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), name_scope_); + conv_bias_pattern(conv_input); + int found_conv_bias_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvBias fuse"; + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, + conv_bias_pattern); // Filter + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_bias_pattern); // tmp + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_bias_pattern); // CONV op + // bias + GET_IR_NODE_FROM_SUBGRAPH(eltwise_bias, eltwise_bias, conv_bias_pattern); + // output + GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_bias_pattern); + // elementwise_add op + GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_bias_pattern); + + PADDLE_ENFORCE(subgraph.count(conv_input)); + + // check if fuse can be done and if MKL-DNN should be used + FuseOptions fuse_option = 
FindFuseOption(*conv, *eltwise); + if (fuse_option == DO_NOT_FUSE || fuse_option == FUSE_NATIVE) { + VLOG(3) << "do not perform conv+bias fuse"; + return; + } + + auto* eltwise_bias_tensor = + scope->FindVar(eltwise_bias->Name())->GetMutable(); + + auto input_names = conv->Op()->InputNames(); + bool has_bias = std::find(input_names.begin(), input_names.end(), "Bias") != + input_names.end(); + if (has_bias && conv->Op()->Input("Bias").size() > 0) { + auto conv_bias_names = conv->Op()->Input("Bias"); + // add eltwise bias to existing conv bias + PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1); + auto* conv_bias_var = scope->FindVar(conv_bias_names[0]); + auto* conv_bias_tensor = conv_bias_var->GetMutable(); + PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(), eltwise_bias_tensor->dims()); + *conv_bias_tensor = tensor_apply_eltwise( + *conv_bias_tensor, *eltwise_bias_tensor, std::plus()); + + conv->Op()->SetOutput("Output", + std::vector({eltwise_out->Name()})); + + GraphSafeRemoveNodes(graph.get(), {eltwise, conv_out}); + + IR_NODE_LINK_TO(conv, eltwise_out); + } else { + // take eltwise bias as conv bias + OpDesc desc; + + desc.SetInput( + "Input", std::vector({subgraph.at(conv_input)->Name()})); + desc.SetInput("Filter", std::vector({conv_weight->Name()})); + desc.SetInput("Bias", std::vector({eltwise_bias->Name()})); + desc.SetOutput("Output", std::vector({eltwise_out->Name()})); + desc.SetType("conv2d"); + + for (auto& attr : conv->Op()->GetAttrMap()) { + desc.SetAttr(attr.first, attr.second); + } + auto conv_bias_node = g->CreateOpNode(&desc); + + IR_NODE_LINK_TO(subgraph.at(conv_input), conv_bias_node); + IR_NODE_LINK_TO(conv_weight, conv_bias_node); + IR_NODE_LINK_TO(eltwise_bias, conv_bias_node); + IR_NODE_LINK_TO(conv_bias_node, eltwise_out); + + GraphSafeRemoveNodes(graph.get(), {conv, eltwise, conv_out}); + } + + found_conv_bias_count++; + }; + gpd(graph.get(), handler); + AddStatis(found_conv_bias_count); + return graph; +} +} // namespace ir +} // namespace 
framework +} // namespace paddle +REGISTER_PASS(conv_bias_mkldnn_fuse_pass, + paddle::framework::ir::ConvBiasFusePass); diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h new file mode 100644 index 0000000000..5775b83b88 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h @@ -0,0 +1,36 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" +namespace paddle { +namespace framework { +namespace ir { +/* +* Fuse the Conv and Elementwise_add to a ConvBiasOp. +*/ +class ConvBiasFusePass : public FusePassBase { + public: + virtual ~ConvBiasFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + const std::string name_scope_{"conv_bias_mkldnn_fuse"}; +}; +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc new file mode 100644 index 0000000000..846a14e365 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -0,0 +1,298 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_bn_fuse_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_CONV_BN_NODES(pattern_name) \ + /* OPERATORS */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(batch_norm, batch_norm, pattern_name); \ + /* CONV inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \ + /* CONV outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \ + /* BN inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(bn_scale, bn_scale, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(bn_bias, bn_bias, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(bn_mean, bn_mean, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(bn_variance, bn_variance, pattern_name); \ + /* BN outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(bn_out, bn_out, pattern_name); /* Out */ \ + GET_IR_NODE_FROM_SUBGRAPH(bn_mean_out, bn_mean_out, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(bn_variance_out, bn_variance_out, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance, pattern_name) + +void recompute_bias_and_weights(const Scope* scope, + ir::Node* conv_weight, // + 
const ir::Node& bn_scale, // + const LoDTensor& bn_bias_tensor, // + const ir::Node& bn_mean, // + const ir::Node& bn_variance, // + LoDTensor* eltwise_y_in_tensor, // + float epsilon) { + using EigenVectorArrayMap = + Eigen::Map>; + using ConstEigenVectorArrayMap = + Eigen::Map>; + using EigenMatrixArrayMap = Eigen::Map< + Eigen::Array>; + + // Re-compute bias of conv2d from BN + PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), bn_bias_tensor.dims()); + + auto* scale_tensor = scope->FindVar(bn_scale.Name())->GetMutable(); + auto* variance_tensor = + scope->FindVar(bn_variance.Name())->GetMutable(); + auto* mean_tensor = scope->FindVar(bn_mean.Name())->GetMutable(); + + ConstEigenVectorArrayMap scale_array(scale_tensor->data(), + scale_tensor->numel(), 1); + EigenVectorArrayMap variance_array( + variance_tensor->mutable_data(platform::CPUPlace()), + variance_tensor->numel(), 1); + ConstEigenVectorArrayMap mean_array(mean_tensor->data(), + mean_tensor->numel(), 1); + ConstEigenVectorArrayMap bn_bias_array(bn_bias_tensor.data(), + bn_bias_tensor.numel(), 1); + + // variance will not be used anymore, so make it std_array and then tmp_array + variance_array += epsilon; + variance_array = variance_array.sqrt(); + variance_array = scale_array / variance_array; + + EigenVectorArrayMap eltwise_y_in_array( + eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 1); + + eltwise_y_in_array = + ((eltwise_y_in_array - mean_array) * variance_array) + bn_bias_array; + + // Re-compute weight of conv2d from BN + auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); + auto weights_shape = weights->dims(); + auto weights_shape_2d = flatten_to_2d(weights_shape, 1); + + EigenMatrixArrayMap weights_array_2d( + weights->mutable_data(platform::CPUPlace()), weights_shape_2d[0], + weights_shape_2d[1]); + + weights_array_2d.colwise() *= variance_array; +} + +std::unique_ptr ConvBNFusePass::ApplyImpl( + std::unique_ptr graph) const { + 
PADDLE_ENFORCE(graph.get()); + FusePassBase::Init(name_scope_, graph.get()); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvBN conv_bn_pattern(gpd.mutable_pattern(), name_scope_); + conv_bn_pattern(conv_input, false /*with_eltwise_add*/); + + int found_conv_bn_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvBN fuse"; + + // conv, batch_norm, + // conv_weight, conv_out, + // bn_scale, bn_bias, bn_mean, bn_variance, + // bn_out, bn_mean_out, bn_variance_out, bn_saved_mean, + // bn_saved_variance + GET_CONV_BN_NODES(conv_bn_pattern); + + // check if fuse can be done and if MKL-DNN should be used + FuseOptions fuse_option = FindFuseOption(*conv, *batch_norm); + if (fuse_option == DO_NOT_FUSE) { + VLOG(3) << "do not perform conv+bn fuse"; + return; + } + + // Create eltwise_y (conv bias) variable + VarDesc eltwise_y_in_desc( + patterns::PDNodeName(name_scope_, "eltwise_y_in")); + eltwise_y_in_desc.SetPersistable(true); + auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc); + auto* eltwise_y_in_tensor = + scope->Var(eltwise_y_in_node->Name())->GetMutable(); + + // Get batch norm bias + auto* bn_bias_tensor = + scope->FindVar(bn_bias->Name())->GetMutable(); + + // Initialize eltwise_y + eltwise_y_in_tensor->Resize(bn_bias_tensor->dims()); + std::fill_n(eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 0.0f); + + // update weights and biases + float epsilon = boost::get(batch_norm->Op()->GetAttr("epsilon")); + recompute_bias_and_weights(scope, conv_weight, *bn_scale, *bn_bias_tensor, + *bn_mean, *bn_variance, eltwise_y_in_tensor, + epsilon); + + // with MKL-DNN fuse conv+bn into conv with bias + // without MKL-DNN fuse conv+bn into 
conv+elementwise_add + if (fuse_option == FUSE_MKLDNN) { + auto input_names = conv->Op()->InputNames(); + bool has_bias = std::find(input_names.begin(), input_names.end(), + "Bias") != input_names.end(); + if (has_bias && conv->Op()->Input("Bias").size() > 0) { + // reuse existing conv bias node + auto conv_bias_names = conv->Op()->Input("Bias"); + PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1); + auto* conv_bias_var = scope->FindVar(conv_bias_names[0]); + auto* conv_bias_tensor = conv_bias_var->GetMutable(); + PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(), + eltwise_y_in_tensor->dims()); + + auto eigen_conv_bias = EigenVector::From(*conv_bias_tensor); + eigen_conv_bias += EigenVector::From(*eltwise_y_in_tensor); + } else { + // add new conv_bias node + conv->Op()->SetInput( + "Bias", std::vector({eltwise_y_in_node->Name()})); + IR_NODE_LINK_TO(eltwise_y_in_node, conv); + } + conv->Op()->SetOutput("Output", + std::vector({bn_out->Name()})); + + GraphSafeRemoveNodes( + graph.get(), + {conv_out, bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, + bn_mean_out, bn_variance_out, bn_saved_mean, bn_saved_variance}); + + IR_NODE_LINK_TO(conv, bn_out); + found_conv_bn_count++; + } else { // fuse_option == FUSE_NATIVE + // create an elementwise add node. + OpDesc desc; + desc.SetInput("X", std::vector({conv_out->Name()})); + desc.SetInput("Y", std::vector({eltwise_y_in_node->Name()})); + desc.SetOutput("Out", std::vector({bn_out->Name()})); + desc.SetType("elementwise_add"); + desc.SetAttr("axis", 1); + auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. 
+ + GraphSafeRemoveNodes( + graph.get(), + {bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out, + bn_variance_out, bn_saved_mean, bn_saved_variance}); + + IR_NODE_LINK_TO(conv_out, eltwise_op); + IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); + IR_NODE_LINK_TO(eltwise_op, bn_out); + found_conv_bn_count++; + } + }; + + gpd(graph.get(), handler); + + AddStatis(found_conv_bn_count); + return graph; +} + +std::unique_ptr ConvEltwiseAddBNFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init(name_scope_, graph.get()); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvBN conv_bn_pattern(gpd.mutable_pattern(), name_scope_); + conv_bn_pattern(conv_input, true /*with_eltwise_add*/); + + int found_conv_bn_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvBN fuse"; + + // conv, batch_norm, + // conv_weight, conv_out, + // bn_scale, bn_bias, bn_mean, bn_variance, + // bn_out, bn_mean_out, bn_variance_out, bn_saved_mean,bn_saved_variance + GET_CONV_BN_NODES(conv_bn_pattern); + // OPERATORS + GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_bn_pattern); + // BIAS inputs + GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_bn_pattern); + // BIAS outputs + GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_bn_pattern); + + // Get eltwise_y (conv bias) variable + auto* eltwise_y_in_tensor = + scope->FindVar(eltwise_y_in->Name())->GetMutable(); + + // Get batch norm bias + auto* bn_bias_tensor = + scope->FindVar(bn_bias->Name())->GetMutable(); + + // update weights and biases + float epsilon = boost::get(batch_norm->Op()->GetAttr("epsilon")); + recompute_bias_and_weights(scope, conv_weight, *bn_scale, *bn_bias_tensor, + 
*bn_mean, *bn_variance, eltwise_y_in_tensor, + epsilon); + + // Update the elementwise_add node + eltwise->Op()->SetAttr("axis", 1); + eltwise->Op()->SetOutput("Out", std::vector({bn_out->Name()})); + + GraphSafeRemoveNodes( + graph.get(), + {bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out, + bn_variance_out, bn_saved_mean, bn_saved_variance, eltwise_out}); + + IR_NODE_LINK_TO(eltwise, bn_out); + + found_conv_bn_count++; + }; + + gpd(graph.get(), handler); + + AddStatis(found_conv_bn_count); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_bn_fuse_pass, paddle::framework::ir::ConvBNFusePass); +REGISTER_PASS(conv_eltwiseadd_bn_fuse_pass, + paddle::framework::ir::ConvEltwiseAddBNFusePass); diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h new file mode 100644 index 0000000000..2c9eb574fe --- /dev/null +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the Conv and BatchNorm to a ConvBNMKLDNNOp. 
+ */ +class ConvBNFusePass : public FusePassBase { + public: + virtual ~ConvBNFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + const std::string name_scope_{"conv_bn_fuse"}; +}; + +class ConvEltwiseAddBNFusePass : public FusePassBase { + public: + virtual ~ConvEltwiseAddBNFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + const std::string name_scope_{"conv_eltwiseadd_bn_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc new file mode 100644 index 0000000000..8d0035ae98 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h" +#include +#include + +#include "paddle/fluid/framework/ir/graph_traits.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace { + +// The function keeps the graph consistent by replacing +// a node 'from' in the set of inputs nodes +// of the visited node by a node 'to'. 
+void CorrectGraphEdges(Graph* graph, Node* from, Node* to) { + for (auto& node : GraphTraits::DFS(*graph)) { + auto from_in_inputs = + std::find(std::begin(node.inputs), std::end(node.inputs), from); + + if (from_in_inputs != std::end(node.inputs)) { + IR_NODE_LINK_TO(to, (&node)); + + auto inputs = node.Op()->Inputs(); + + using input_type = VariableNameMap::value_type; + + std::for_each(std::begin(inputs), std::end(inputs), + [from, to, &node](const input_type& i) -> void { + auto param_names = i.second; + auto pi = std::find(std::begin(param_names), + std::end(param_names), from->Name()); + + if (pi != std::end(param_names)) { + node.Op()->SetInput(i.first, {to->Name()}); + } + }); + } + } +} +} // namespace +using graph_ptr = std::unique_ptr; + +graph_ptr ConvElementwiseAddMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { + FusePassBase::Init(name_scope_, graph.get()); + + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + + patterns::Conv conv_pattern{pattern, name_scope_}; + auto conv_output = conv_pattern(); + + patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope_}; + elementwise_add_pattern(conv_output); + + conv_output->AsIntermediate(); + + auto conv_op_has_bias = [](const Node& conv_op) -> std::pair { + auto bias_input_names = conv_op.Op()->Inputs(); + auto bias_it = bias_input_names.find("Bias"); + + if (bias_it != std::end(bias_input_names)) { + bool has_bias = !bias_it->second.empty(); + + if (has_bias) { + auto conv_bias_names = bias_it->second; + auto conv_bias_names_it = + std::find_if(std::begin(conv_op.inputs), std::end(conv_op.inputs), + [&conv_bias_names](Node* n) -> bool { + return n->Name() == conv_bias_names[0]; + }); + return std::make_pair(has_bias, *conv_bias_names_it); + } + } + + return std::make_pair(false, nullptr); + }; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + 
GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, + elementwise_add_pattern); + + if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; + + OpDesc op_desc; + op_desc.SetType("conv2d"); + + op_desc.SetInput("Input", {conv_input->Name()}); + op_desc.SetInput("Filter", {conv_filter->Name()}); + op_desc.SetInput("ResidualData", {elementwise_add_x->Name()}); + op_desc.SetOutput("Output", {conv_output->Name()}); + + bool has_bias; + Node* conv_bias; + + std::tie(has_bias, conv_bias) = conv_op_has_bias(*conv_op); + + if (has_bias) { + op_desc.SetInput("Bias", {conv_bias->Name()}); + } + + for (const auto& attr : conv_op->Op()->GetAttrMap()) { + op_desc.SetAttr(attr.first, attr.second); + } + + op_desc.SetAttr("fuse_residual_connection", true); + + auto fused_conv_op = g->CreateOpNode(&op_desc); + + IR_NODE_LINK_TO(conv_input, fused_conv_op); + IR_NODE_LINK_TO(conv_filter, fused_conv_op); + IR_NODE_LINK_TO(elementwise_add_x, fused_conv_op); + IR_NODE_LINK_TO(fused_conv_op, conv_output); + + if (has_bias) { + IR_NODE_LINK_TO(conv_bias, fused_conv_op); + } + + CorrectGraphEdges(g, elementwise_add_out, conv_output); + GraphSafeRemoveNodes(g, {elementwise_add_out, conv_op, elementwise_add_op}); + }; + + gpd(graph.get(), handler); + + return graph; +} +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass, + paddle::framework::ir::ConvElementwiseAddMKLDNNFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h 
b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h new file mode 100644 index 0000000000..f4a899f1ad --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h @@ -0,0 +1,38 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ConvElementwiseAddMKLDNNFusePass : public FusePassBase { + public: + virtual ~ConvElementwiseAddMKLDNNFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + + const std::string name_scope_{"residual_connections_fuse_pass"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc new file mode 100644 index 0000000000..348a3dfc5d --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc @@ -0,0 +1,247 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/graph_traits.h" + +namespace paddle { +namespace framework { +namespace ir { + +namespace { +constexpr int nodes_removed = 3; +constexpr int nodes_added = 1; + +void SetOp(ProgramDesc* prog, const std::string& type, + const std::vector>& inputs, + const std::pair& output) { + auto op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetAttr("use_mkldnn", true); + + for (const auto& input : inputs) { + op->SetInput(input.first, {input.second}); + } + + op->SetOutput(output.first, {output.second}); +} + +struct IsReachable { + using func = std::function; + + auto operator()(const std::unique_ptr& graph) -> func { + auto find_node = [](const std::unique_ptr& graph, + const std::string& name) -> Node* { + for (auto& node : GraphTraits::DFS(*graph)) { + if (name == node.Name()) { + return &node; + } + } + + return nullptr; + }; + + return [&](std::string from, const std::string to) -> bool { + if (from == to) return true; + + std::map visited; + + for (auto& node : GraphTraits::DFS(*graph)) { + visited[node.Name()] = false; + } + + visited[from] = true; + + std::list queue; + queue.push_back(from); + + while (!queue.empty()) { + auto cur = find_node(graph, queue.front()); + queue.pop_front(); + + if (cur == nullptr) return false; + + for (auto n : cur->outputs) { + if (n->Name() == to) return true; + + if (!visited[n->Name()]) { + visited[n->Name()] = true; + queue.push_back(n->Name()); + } + } + } + 
return false; + }; + } +}; + +void AssertOpsCount(const std::unique_ptr& graph) { + int conv_count = 0; + int elementwise_add_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "conv2d") { + ++conv_count; + } + if (node->IsOp() && node->Op()->Type() == "elementwise_add") { + ++elementwise_add_count; + } + } + EXPECT_EQ(conv_count, 1); + EXPECT_EQ(elementwise_add_count, 0); +} + +ProgramDesc BuildProgramDesc(const std::vector& transient_vars, + const std::vector& persistent_vars) { + ProgramDesc prog; + + auto add_var_to_prog = [&prog](const std::string& var_name) -> VarDesc* { + auto var = prog.MutableBlock(0)->Var(var_name); + var->SetType(proto::VarType::LOD_TENSOR); + + return var; + }; + + for (const auto& v : transient_vars) { + add_var_to_prog(v); + } + + for (const auto& v : persistent_vars) { + auto var = add_var_to_prog(v); + var->SetPersistable(true); + } + + return prog; +} +} // namespace + +TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionWithElementwiseAddRelu) { + auto prog = + BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"}); + + SetOp(&prog, "conv2d", + {{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}}, + {"Output", "b"}); + SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"}); + SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"}); + + std::unique_ptr graph(new ir::Graph(prog)); + + IsReachable is_reachable; + EXPECT_TRUE(is_reachable(graph)("a", "relu")); + + auto pass = + PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass"); + int original_nodes_num = graph->Nodes().size(); + graph = pass->Apply(std::move(graph)); + int current_nodes_num = graph->Nodes().size(); + + EXPECT_TRUE(is_reachable(graph)("a", "relu")); + + EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added, + current_nodes_num); + + AssertOpsCount(graph); +} + +TEST(ConvElementwiseAddMKLDNNFusePass, + ConvolutionWithElementwiseAddReluNoBias) { + auto prog = 
BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"}); + SetOp(&prog, "conv2d", {{"Input", "a"}, {"Filter", "weights"}}, + {"Output", "b"}); + SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"}); + SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"}); + + std::unique_ptr graph(new ir::Graph(prog)); + + IsReachable is_reachable; + + EXPECT_TRUE(is_reachable(graph)("a", "relu")); + + auto pass = + PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass"); + int original_nodes_num = graph->Nodes().size(); + graph = pass->Apply(std::move(graph)); + int current_nodes_num = graph->Nodes().size(); + + EXPECT_TRUE(is_reachable(graph)("a", "relu")); + + EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added, + current_nodes_num); + + AssertOpsCount(graph); +} + +TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionElementwiseAdd) { + auto prog = BuildProgramDesc({"a", "b", "c", "d"}, {"bias", "weights"}); + SetOp(&prog, "conv2d", + {{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}}, + {"Output", "b"}); + SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"}); + + std::unique_ptr graph(new ir::Graph(prog)); + + IsReachable is_reachable; + EXPECT_TRUE(is_reachable(graph)("a", "d")); + + auto pass = + PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass"); + int original_nodes_num = graph->Nodes().size(); + graph = pass->Apply(std::move(graph)); + int current_nodes_num = graph->Nodes().size(); + + EXPECT_FALSE(is_reachable(graph)("a", "d")); + + EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added, + current_nodes_num); + AssertOpsCount(graph); +} + +TEST(ConvElementwiseAddMKLDNNFusePass, SigmoidConvolutionAddElementwiseRelu) { + auto prog = + BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"}); + SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"}); + SetOp(&prog, "conv2d", + {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}}, + {"Output", "c"}); + SetOp(&prog, 
"elementwise_add", {{"X", "c"}, {"Y", "d"}}, {"Out", "e"}); + SetOp(&prog, "relu", {{"X", "e"}}, {"Out", "f"}); + + std::unique_ptr graph(new ir::Graph(prog)); + + IsReachable is_reachable; + + EXPECT_TRUE(is_reachable(graph)("a", "f")); + + auto pass = + PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass"); + int original_nodes_num = graph->Nodes().size(); + graph = pass->Apply(std::move(graph)); + int current_nodes_num = graph->Nodes().size(); + + EXPECT_TRUE(is_reachable(graph)("a", "f")); + + EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added, + current_nodes_num); + AssertOpsCount(graph); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(conv_elementwise_add_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc index d7df6389cf..e359a3832e 100644 --- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc @@ -46,6 +46,12 @@ std::unique_ptr ConvReLUFusePass::ApplyImpl( GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern); // Out GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern); // ReLU op + FuseOptions fuse_option = FindFuseOption(*conv, *relu); + if (fuse_option == DO_NOT_FUSE) { + VLOG(3) << "do not perform conv+relu fuse"; + return; + } + // Transform Conv node into ConvReLU node. 
OpDesc* desc = conv->Op(); desc->SetOutput("Output", std::vector({relu_out->Name()})); diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc index 9dd780ec89..8f4bab25ed 100644 --- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc @@ -20,17 +20,19 @@ namespace paddle { namespace framework { namespace ir { -void SetOp(ProgramDesc* prog, const std::string& type, +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, const std::vector& inputs, - const std::vector& outputs) { + const std::vector& outputs, bool use_mkldnn = false) { auto* op = prog->MutableBlock(0)->AppendOp(); op->SetType(type); if (type == "conv2d") { - op->SetAttr("use_mkldnn", true); + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetAttr("name", name); op->SetInput("Input", {inputs[0]}); op->SetInput("Filter", {inputs[1]}); op->SetInput("Bias", {inputs[2]}); } else if (type == "relu") { + op->SetAttr("use_mkldnn", use_mkldnn); op->SetInput("X", inputs); } op->SetOutput("Out", outputs); @@ -43,7 +45,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, ProgramDesc BuildProgramDesc() { ProgramDesc prog; for (auto& v : - std::vector({"a", "b", "c", "weights", "bias", "f", "g"})) { + std::vector({"a", "b", "c", "weights", "bias", "f", "g", + "h", "weights2", "bias2", "k", "l"})) { auto* var = prog.MutableBlock(0)->Var(v); var->SetType(proto::VarType::SELECTED_ROWS); if (v == "weights" || v == "bias") { @@ -51,14 +54,24 @@ ProgramDesc BuildProgramDesc() { } } - SetOp(&prog, "OP0", std::vector({"a"}), + SetOp(&prog, "OP0", "op0", std::vector({"a"}), std::vector({"b"})); - SetOp(&prog, "OP1", std::vector({"b"}), + SetOp(&prog, "OP1", "op1", std::vector({"b"}), std::vector({"c"})); - SetOp(&prog, "conv2d", std::vector({"c", "weights", "bias"}), - std::vector({"f"})); - SetOp(&prog, "relu", 
std::vector({"f"}), - std::vector({"g"})); + // conv+relu, both with MKL-DNN + SetOp(&prog, "conv2d", "conv1", + std::vector({"c", "weights", "bias"}), + std::vector({"f"}), true); + SetOp(&prog, "relu", "relu1", std::vector({"f"}), + std::vector({"g"}), true); + SetOp(&prog, "OP3", "op3", std::vector({"g"}), + std::vector({"h"})); + // conv+relu, only one with MKL-DNN + SetOp(&prog, "conv2d", "conv2", + std::vector({"h", "weights2", "bias2"}), + std::vector({"k"}), true); + SetOp(&prog, "relu", "relu2", std::vector({"k"}), + std::vector({"l"})); return prog; } @@ -88,10 +101,16 @@ TEST(ConvReLUFusePass, basic) { auto* op = node->Op(); ASSERT_TRUE(op->HasAttr("use_mkldnn")); EXPECT_TRUE(boost::get(op->GetAttr("use_mkldnn"))); - ASSERT_TRUE(op->HasAttr("fuse_relu")); - bool fuse_relu = boost::get(op->GetAttr("fuse_relu")); - if (fuse_relu) { - ++conv_relu_count; + // check if only "conv1" convolution is fused + auto op_name = boost::get(op->GetAttr("name")); + if (op_name == "conv1") { + ASSERT_TRUE(op->HasAttr("fuse_relu")); + bool fuse_relu = boost::get(op->GetAttr("fuse_relu")); + if (fuse_relu) { + ++conv_relu_count; + } + } else if (op_name == "conv2") { + ASSERT_FALSE(op->HasAttr("fuse_relu")); } } } diff --git a/paddle/fluid/framework/ir/fuse_pass_base.cc b/paddle/fluid/framework/ir/fuse_pass_base.cc new file mode 100644 index 0000000000..d70010089e --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_pass_base.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +void FusePassBase::Init(const std::string& repr, Graph* graph) const { + repr_ = repr; + graph_ = graph; +} + +Scope* FusePassBase::param_scope() const { + PADDLE_ENFORCE(graph_->Has(kParamScopeAttr)); + return graph_->Get(kParamScopeAttr); +} + +void FusePassBase::AddStatis(int count_of_fused) const { + PADDLE_ENFORCE(graph_); + PADDLE_ENFORCE(!repr_.empty()); + if (!graph_->Has(kFuseStatisAttr)) { + graph_->Set(kFuseStatisAttr, new std::unordered_map); + } + auto& info = + graph_->Get>(kFuseStatisAttr); + info[repr_] = count_of_fused; +} + +FuseOptions FusePassBase::FindFuseOption(const Node& node1, + const Node& node2) const { +#ifdef PADDLE_WITH_MKLDNN + bool node1_mkldnn = node1.Op()->HasAttr("use_mkldnn") && + boost::get(node1.Op()->GetAttr("use_mkldnn")); + bool node2_mkldnn = node2.Op()->HasAttr("use_mkldnn") && + boost::get(node2.Op()->GetAttr("use_mkldnn")); + if (node1_mkldnn && node2_mkldnn) + return FUSE_MKLDNN; + else if (!node1_mkldnn && !node2_mkldnn) + return FUSE_NATIVE; + else + return DO_NOT_FUSE; +#else + return FUSE_NATIVE; +#endif +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fuse_pass_base.h b/paddle/fluid/framework/ir/fuse_pass_base.h index 877bbeb502..c53b2a6186 100644 --- a/paddle/fluid/framework/ir/fuse_pass_base.h +++ b/paddle/fluid/framework/ir/fuse_pass_base.h @@ -25,32 +25,24 @@ namespace ir { static const char kParamScopeAttr[] = "__param_scope__"; static const char kFuseStatisAttr[] = "__fuse_statis__"; +enum FuseOptions { + DO_NOT_FUSE, // fusing will not be done + FUSE_NATIVE, // fusing will be done without MKL-DNN + FUSE_MKLDNN // fusing will be done with MKL-DNN +}; + class FusePassBase : public Pass { public: - void Init(const 
std::string& repr, Graph* graph) const { - repr_ = repr; - graph_ = graph; - } - - Scope* param_scope() const { - PADDLE_ENFORCE(graph_->Has(kParamScopeAttr)); - return graph_->Get(kParamScopeAttr); - } - - void AddStatis(int count_of_fused) const { - PADDLE_ENFORCE(graph_); - PADDLE_ENFORCE(!repr_.empty()); - if (!graph_->Has(kFuseStatisAttr)) { - graph_->Set(kFuseStatisAttr, new std::unordered_map); - } - auto& info = - graph_->Get>(kFuseStatisAttr); - info[repr_] = count_of_fused; - } + void Init(const std::string& repr, Graph* graph) const; + Scope* param_scope() const; + void AddStatis(int count_of_fused) const; virtual ~FusePassBase() {} protected: + virtual FuseOptions FindFuseOption(const Node& node1, + const Node& node2) const; + mutable Graph* graph_; mutable std::string repr_; }; diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc index cea9028093..260a73ae76 100644 --- a/paddle/fluid/framework/ir/graph_helper_test.cc +++ b/paddle/fluid/framework/ir/graph_helper_test.cc @@ -200,15 +200,15 @@ TEST(GraphHelperTest, GraphNum) { Graph g(prog); BuildZeroGraph(&g); - ASSERT_EQ(GraphNum(g), 0); + ASSERT_EQ(GraphNum(g), 0UL); Graph g2(prog); BuildOneGraph(&g2); - ASSERT_EQ(GraphNum(g2), 1); + ASSERT_EQ(GraphNum(g2), 1UL); Graph g3(prog); BuildTwoGraphs(&g3); - ASSERT_EQ(GraphNum(g3), 2); + ASSERT_EQ(GraphNum(g3), 2UL); } } // namespace ir diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 46c6a52c09..29b604afbf 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -259,6 +259,8 @@ GraphPatternDetector::DetectPatterns() { return result; } +// TODO(Superjomn) enhance the function as it marks unique unique as duplicates +// see https://github.com/PaddlePaddle/Paddle/issues/13550 void GraphPatternDetector::UniquePatterns( std::vector *subgraphs) { if 
(subgraphs->empty()) return; @@ -626,6 +628,112 @@ bool VarLinksFromOp(Node *node, const std::string &op_type) { return false; } +PDNode *patterns::ConvBN::operator()(paddle::framework::ir::PDNode *conv_input, + bool with_eltwise_add) { + // Create Operators + conv_input->assert_is_op_input("conv2d", "Input"); + auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + + PDNode *eltwise_op = nullptr; + if (with_eltwise_add) { + eltwise_op = + pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add"); + } + auto *batch_norm_op = + pattern->NewNode(batch_norm_repr())->assert_is_op("batch_norm"); + // Create variables + // Conv Filter + auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Filter"); + + auto *conv_out_var = pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("conv2d"); + + PDNode *eltwise_y_in_var = nullptr; + PDNode *eltwise_out_var = nullptr; + if (with_eltwise_add) { + // Conv output as Bias input + conv_out_var->assert_is_op_input("elementwise_add", "X"); + // Bias + eltwise_y_in_var = pattern->NewNode(eltwise_y_in_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + eltwise_out_var = pattern->NewNode(eltwise_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("elementwise_add"); + } else { + // Conv output as BN input + conv_out_var->assert_is_op_input("batch_norm", "X"); + } + + // BN Scale + auto *bn_scale_var = pattern->NewNode(bn_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("batch_norm", "Scale"); + // BN Bias + auto *bn_bias_var = pattern->NewNode(bn_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("batch_norm", "Bias"); + // BN Mean + auto *bn_mean_var = pattern->NewNode(bn_mean_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("batch_norm", "Mean"); + // BN Variance + 
auto *bn_variance_var = pattern->NewNode(bn_variance_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("batch_norm", "Variance"); + + // BN output + auto *bn_out_var = pattern->NewNode(bn_out_repr()) + ->AsOutput() + ->assert_is_op_output("batch_norm"); + + auto *bn_mean_out_var = pattern->NewNode(bn_mean_out_repr()) + ->AsOutput() + ->assert_is_op_output("batch_norm", "MeanOut"); + + auto *bn_variance_out_var = + pattern->NewNode(bn_variance_out_repr()) + ->AsOutput() + ->assert_is_op_output("batch_norm", "VarianceOut"); + + auto *bn_saved_mean_var = + pattern->NewNode(bn_saved_mean_repr()) + ->AsOutput() + ->assert_is_op_output("batch_norm", "SavedMean"); + + auto *bn_saved_variance_var = + pattern->NewNode(bn_saved_variance_repr()) + ->AsOutput() + ->assert_is_op_output("batch_norm", "SavedVariance"); + + conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); + + if (with_eltwise_add) { + eltwise_op->LinksFrom({conv_out_var, eltwise_y_in_var}) + .LinksTo({eltwise_out_var}); + batch_norm_op + ->LinksFrom({eltwise_out_var, bn_scale_var, bn_bias_var, bn_mean_var, + bn_variance_var}) + .LinksTo({bn_out_var, bn_mean_out_var, bn_variance_out_var, + bn_saved_mean_var, bn_saved_variance_var}); + } else { + batch_norm_op + ->LinksFrom({conv_out_var, bn_scale_var, bn_bias_var, bn_mean_var, + bn_variance_var}) + .LinksTo({bn_out_var, bn_mean_out_var, bn_variance_out_var, + bn_saved_mean_var, bn_saved_variance_var}); + } + return bn_out_var; +} + PDNode *patterns::ConvReLU::operator()( paddle::framework::ir::PDNode *conv_input) { // Create Operators @@ -653,6 +761,51 @@ PDNode *patterns::ConvReLU::operator()( return relu_out_var; } +PDNode *patterns::SeqConvEltAddRelu::operator()( + paddle::framework::ir::PDNode *seqconv_input) { + // Create Operators + seqconv_input->assert_is_op_input("sequence_conv", "X"); + auto *seqconv_op = pattern->NewNode(seqconv_repr()) + ->assert_is_op("sequence_conv") + 
->assert_op_attr("paddingTrainable", false) + ->assert_op_attr("contextStride", 1); + + auto *eltadd_op = + pattern->NewNode(eltadd_repr())->assert_is_op("elementwise_add"); + auto *relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu"); + // Create variables + // Filter + auto *seqconv_weight_var = + pattern->NewNode(seqconv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("sequence_conv", "Filter"); + // Bias + auto *eltadd_bias_var = pattern->NewNode(eltadd_bias_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add"); + // intermediate variable, will be removed in the IR after fuse. + auto *seqconv_out_var = pattern->NewNode(seqconv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("sequence_conv") + ->assert_is_op_input("elementwise_add"); + auto *eltadd_out_var = pattern->NewNode(eltadd_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("elementwise_add") + ->assert_is_only_input_of_op("relu"); + // output + auto *relu_out_var = pattern->NewNode(relu_out_repr()) + ->AsOutput() + ->assert_is_op_output("relu"); + + seqconv_op->LinksFrom({seqconv_input, seqconv_weight_var}) + .LinksTo({seqconv_out_var}); + eltadd_op->LinksFrom({seqconv_out_var, eltadd_bias_var}) + .LinksTo({eltadd_out_var}); + relu_op->LinksFrom({eltadd_out_var}).LinksTo({relu_out_var}); + return relu_out_var; +} + PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x, bool with_bias) { // Create shared nodes. 
@@ -858,6 +1011,79 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()( return ele_add_grad; } +PDNode *patterns::ConvBias::operator()( + paddle::framework::ir::PDNode *conv_input) { + // Create Operators + conv_input->assert_is_op_input("conv2d", "Input"); + auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + auto *eltiwse_op = + pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add"); + // Create variables + // Filter + auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Filter"); + // intermediate variable, will be removed in the IR after fuse. + auto *conv_out_var = pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("conv2d") + ->assert_is_op_input("elementwise_add"); + // Bias stored in elementwise_add + auto *eltwise_bias_var = pattern->NewNode(eltwise_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("elementwise_add", "Y"); + // output + auto *eltwise_out_var = pattern->NewNode(eltwise_out_repr()) + ->AsOutput() + ->assert_is_op_output("elementwise_add"); + conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); + eltiwse_op->LinksFrom({conv_out_var, eltwise_bias_var}) + .LinksTo({eltwise_out_var}); + return eltwise_out_var; +} + +PDNode *patterns::Conv::operator()() { + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + + auto input_var = pattern->NewNode(conv_input_repr()) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + + auto filter_var = pattern->NewNode(conv_filter_repr()) + ->AsInput() + ->assert_is_op_input("conv2d", "Filter"); + + auto output_var = pattern->NewNode(conv_output_repr()) + ->AsOutput() + ->assert_is_op_output("conv2d", "Output"); + + conv_op->LinksFrom({input_var, filter_var}); + conv_op->LinksTo({output_var}); + + return output_var; +} + +PDNode *patterns::ElementwiseAdd::operator()(PDNode 
*x_var) { + auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) + ->assert_is_op("elementwise_add"); + + x_var->assert_is_op_input("elementwise_add", "X"); + + auto y_var = pattern->NewNode(elementwise_add_x_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + auto out_var = pattern->NewNode(elementwise_add_out_repr()) + ->AsOutput() + ->assert_is_op_output("elementwise_add", "Out"); + + elementwise_add_op->LinksFrom({x_var, y_var}); + elementwise_add_op->LinksTo({out_var}); + + return out_var; +} } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 508113bf4f..9e462ac671 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -128,6 +128,15 @@ struct PDNode { const std::unordered_set& op_types, const std::string& argument, int nth); + template + PDNode* assert_op_attr(const std::string& attr_name, const T& attr) { + asserts_.emplace_back([=](Node* x) { + return x && x->IsOp() && x->Op()->HasAttr(attr_name) && + boost::get(x->Op()->GetAttr(attr_name)) == attr; + }); + return this; + } + private: PDNode(PDPattern* pattern, const std::string& name = "", Type type = Type::kVar) @@ -375,6 +384,44 @@ struct PatternBase { size_t id_; }; +// Conv with batch norm +// op: conv + (elementwise_add +) batch_norm +// named nodes: +// conv_weight, conv_out, conv, +// bn_x, bn_scale, bn_bias, bn_mean, bn_variance, +// bn_batch_norm, bn_y, bn_mean_out, bn_variance_out, +// bn_saved_mean, bn_saved_variance +struct ConvBN : public PatternBase { + ConvBN(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_bn") {} + + PDNode* operator()(PDNode* conv_input, bool with_eltwise_add); + + // declare operator node's name + PATTERN_DECL_NODE(conv); + PATTERN_DECL_NODE(batch_norm); + PATTERN_DECL_NODE(eltwise); // 
ELEMENTWISE_ADD + // CONV inputs + PATTERN_DECL_NODE(conv_weight); // Filter + // CONV outputs + PATTERN_DECL_NODE(conv_out); // tmp + // ELTWISE inputs + PATTERN_DECL_NODE(eltwise_y_in); + // ELTWISE outputs + PATTERN_DECL_NODE(eltwise_out); // tmp + // BN inputs + PATTERN_DECL_NODE(bn_scale); + PATTERN_DECL_NODE(bn_bias); + PATTERN_DECL_NODE(bn_mean); + PATTERN_DECL_NODE(bn_variance); + // BN outputs + PATTERN_DECL_NODE(bn_out); // Out + PATTERN_DECL_NODE(bn_mean_out); + PATTERN_DECL_NODE(bn_variance_out); + PATTERN_DECL_NODE(bn_saved_mean); + PATTERN_DECL_NODE(bn_saved_variance); +}; + // CONV with ReLU // op: conv + relu // named nodes: @@ -396,6 +443,31 @@ struct ConvReLU : public PatternBase { PATTERN_DECL_NODE(relu_out); }; +// SEQCONV with Elementwise_Add ReLU +// op: seqconv + elementwise_add + relu +// named nodes: +// seqconv_input, seqconv_weight, +// seqconv_out, seqconv, +// elementwise_add_bias, elementwise_add_out, elementwise_add +// relu_out, relu +struct SeqConvEltAddRelu : public PatternBase { + SeqConvEltAddRelu(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "seqconv_eltadd_relu") {} + + PDNode* operator()(PDNode* seqconv_input); + + // declare operator node's name + PATTERN_DECL_NODE(seqconv); + PATTERN_DECL_NODE(eltadd); + PATTERN_DECL_NODE(relu); + // declare variable node's name + PATTERN_DECL_NODE(seqconv_weight); + PATTERN_DECL_NODE(seqconv_out); + PATTERN_DECL_NODE(eltadd_bias); + PATTERN_DECL_NODE(eltadd_out); + PATTERN_DECL_NODE(relu_out); +}; + // FC with bias // op: mul + elementwise_add // named nodes: @@ -540,6 +612,65 @@ struct ElewiseAddActInplaceGrad : public PatternBase { PATTERN_DECL_NODE(d_ele_y); PATTERN_DECL_NODE(ele_y); }; + +// Conv with Elementwise_add as bias +// op: conv + elementwise_add +// named nodes: +// conv_input, conv_weight, +// conv_out, conv, +// eltwise_bias, eltwise_out, +// elementwise_add +struct ConvBias : public PatternBase { + ConvBias(PDPattern* pattern, const 
std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_bias") {} + PDNode* operator()(PDNode* conv_input); + // declare operator node's name + PATTERN_DECL_NODE(conv); + PATTERN_DECL_NODE(eltwise); + // declare variable node's name + PATTERN_DECL_NODE(conv_weight); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(eltwise_bias); + PATTERN_DECL_NODE(eltwise_out); +}; + +// Convolution op +// Forward pass for convolution. +// conv_input, conv_bias and conv_filter are inputs. +// conv_output is a result of the operator. +// residual_data is data used by skip connection. +// If residual connection fusion is on, the formula is: +// conv_output = conv_op(conv_filter, conv_input, conv_bias) +// + conv_residual_data +// If the fusion is off, conv_residual_data is not added. +struct Conv : public PatternBase { + Conv(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "convolution") {} + + PDNode* operator()(); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_input); + PATTERN_DECL_NODE(conv_filter); + PATTERN_DECL_NODE(conv_residual_data); + PATTERN_DECL_NODE(conv_output); +}; + +// ElementwiseAdd used in residual connections. +// y_var is used and convolution output. +// The operator is removed, when residual +// connection fusion is on. +struct ElementwiseAdd : public PatternBase { + ElementwiseAdd(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elementwise_add") {} + + PDNode* operator()(PDNode* x_var); + + PATTERN_DECL_NODE(elementwise_add_op); + PATTERN_DECL_NODE(elementwise_add_x); + PATTERN_DECL_NODE(elementwise_add_y); + PATTERN_DECL_NODE(elementwise_add_out); +}; } // namespace patterns // Link two ir::Nodes from each other. 
diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc index cadda49c39..7ed2f96eb2 100644 --- a/paddle/fluid/framework/ir/graph_test.cc +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -124,7 +124,7 @@ TEST(GraphTest, Basic) { ASSERT_EQ(n->outputs.size(), 0UL); } } - ASSERT_EQ(nodes.size(), 5); + ASSERT_EQ(nodes.size(), 5UL); } TEST(GraphTest, WriteAfterRead) { diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc new file mode 100644 index 0000000000..65be69b7f5 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/mkldnn_placement_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr MKLDNNPlacementPass::ApplyImpl( + std::unique_ptr graph) const { + VLOG(3) << "Aplies MKL-DNN placement strategy."; + for (const Node* n : graph->Nodes()) { + if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) { + n->Op()->SetAttr("use_mkldnn", true); + } + } + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(mkldnn_placement_pass, + paddle::framework::ir::MKLDNNPlacementPass); diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.h b/paddle/fluid/framework/ir/mkldnn_placement_pass.h new file mode 100644 index 0000000000..3d4dc9e2b6 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class MKLDNNPlacementPass : public Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc new file mode 100644 index 0000000000..0a1f65d274 --- /dev/null +++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h" +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace ir { + +int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "X")) + ->assert_is_op_input("sequence_conv") + ->assert_var_not_persistable(); + patterns::SeqConvEltAddRelu fuse_pattern(pattern, name_scope); + fuse_pattern(x); + + // Create New OpDesc + auto fuse_creator = [&](Node* seqconv, Node* input, Node* seqconv_weight, + Node* eltadd_bias, Node* relu_out) { + OpDesc op_desc; + op_desc.SetType("fusion_seqconv_eltadd_relu"); + op_desc.SetInput("X", {input->Name()}); + op_desc.SetInput("Filter", {seqconv_weight->Name()}); + op_desc.SetInput("Bias", {eltadd_bias->Name()}); + op_desc.SetAttr("contextLength", seqconv->Op()->GetAttr("contextLength")); + op_desc.SetAttr("contextStart", seqconv->Op()->GetAttr("contextStart")); + op_desc.SetAttr("contextStride", seqconv->Op()->GetAttr("contextStride")); + PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + auto* scope = graph->Get(kParamScopeAttr); + const std::string ColMat = patterns::UniqueKey("SeqConvColMat"); + op_desc.SetOutput("ColMat", {ColMat}); + op_desc.SetOutput("Out", {relu_out->Name()}); + scope->Var(ColMat)->GetMutable(); + + auto* op = graph->CreateOpNode(&op_desc); + IR_NODE_LINK_TO(input, op); + IR_NODE_LINK_TO(seqconv_weight, op); + IR_NODE_LINK_TO(eltadd_bias, op); + IR_NODE_LINK_TO(op, relu_out); + return op; + }; + + int fusion_count{0}; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle SeqConv EltAdd Relu fuse"; + GET_IR_NODE_FROM_SUBGRAPH(seqconv, seqconv, fuse_pattern); + GET_IR_NODE_FROM_SUBGRAPH(seqconv_weight, seqconv_weight, fuse_pattern); + GET_IR_NODE_FROM_SUBGRAPH(seqconv_out, seqconv_out, 
fuse_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd, eltadd, fuse_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_bias, eltadd_bias, fuse_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_out, eltadd_out, fuse_pattern); + GET_IR_NODE_FROM_SUBGRAPH(relu, relu, fuse_pattern); + GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, fuse_pattern); + + fuse_creator(seqconv, subgraph.at(x), seqconv_weight, eltadd_bias, + relu_out); + std::unordered_set marked_nodes( + {seqconv, seqconv_out, eltadd, eltadd_out, relu}); + GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + }; + + gpd(graph, handler); + + return fusion_count; +} + +std::unique_ptr SeqConvEltAddReluFusePass::ApplyImpl( + std::unique_ptr graph) const { + FusePassBase::Init(name_scope_, graph.get()); + + int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope()); + AddStatis(fusion_count); + + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(seqconv_eltadd_relu_fuse_pass, + paddle::framework::ir::SeqConvEltAddReluFusePass); diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h new file mode 100644 index 0000000000..dac9de7193 --- /dev/null +++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h @@ -0,0 +1,38 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class SeqConvEltAddReluFusePass : public FusePassBase { + public: + virtual ~SeqConvEltAddReluFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + + const std::string name_scope_{"seqconv_eltadd_relu_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index ba10687d65..2840d503f1 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -37,7 +37,7 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index b29ac44699..c293cf92b4 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -85,10 +85,6 @@ class CompileTimeInferShapeContext : public InferShapeContext { VLOG(3) << "input " << in << " is not LodTensor"; return; } - PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarType::LOD_TENSOR, - "The %d-th output of Output(%s) must be LoDTensor.", j, - out); - out_var->SetLoDLevel(in_var->GetLoDLevel()); } @@ -519,20 +515,14 @@ void OpDesc::InferShape(const BlockDesc &block) const { } void OpDesc::InferVarType(BlockDesc *block) const { + // There are a few places that var type can be set. + // When VarDesc is created, default set to LOD_TENSOR. 
+ // When output variable is created, default is defaut set to LOD_TENSOR. + // We limit here to be the only place that operator defines its customized + // var type inference. Hence, we don't do any "default" setting here. auto &info = OpInfoMap::Instance().Get(this->Type()); if (info.infer_var_type_) { info.infer_var_type_(*this, block); - } else { - // all output type is LoDTensor by default - VLOG(10) << this->Type() - << " has not registered InferVarType. Set output variables to " - "LOD_TENSOR"; - for (auto &out_pair : this->outputs_) { - for (auto &out_var_name : out_pair.second) { - block->FindRecursiveOrCreateVar(out_var_name) - .SetType(proto::VarType::LOD_TENSOR); - } - } } } diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index b4205aba83..440e0509be 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -100,16 +100,6 @@ class OpDesc { std::vector InputNames() const { return MapKeys(inputs_); } std::vector OutputNames() const { return MapKeys(outputs_); } - void SetInputMap(const VariableNameMap &input) { - this->inputs_ = input; - this->need_update_ = true; - } - - void SetOutputMap(const VariableNameMap &output) { - this->outputs_ = output; - this->need_update_ = true; - } - const VariableNameMap &Inputs() const { return inputs_; } const VariableNameMap &Outputs() const { return outputs_; } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 9f93006532..14fcde2fe3 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -149,9 +149,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { platform::SetDeviceId(dev_id); #endif } - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); - RunImpl(scope, place); + + // The profile has a process-wide mutex, results in serious performance issue + // in 
concurrency scenario. Here use an `if` to fix this issue. + // Please do not remove the `if`; ask @Superjomn if there are any concerns. + if (platform::IsProfileEnabled()) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); + RunImpl(scope, place); + } else { + RunImpl(scope, place); + } VLOG(3) << place << " " << DebugStringEx(&scope); } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f06bad6c78..093108cb54 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -299,6 +299,12 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( } ParallelExecutor::~ParallelExecutor() { + const auto dev_ctxs = + platform::DeviceContextPool::Instance().GetAllDeviceContexts(); + for (auto &dev_ctx : dev_ctxs) { + dev_ctx->Wait(); + } + if (member_->own_local_scope_) { for (size_t i = 1; i < member_->local_scopes_.size(); ++i) { Scope *local_scope = member_->local_scopes_[i]; @@ -307,6 +313,10 @@ ParallelExecutor::~ParallelExecutor() { } } } + + // member_ must be destructed before gcs_ since the destructor of + // ReferenceCountOpHandle uses raw pointers of gcs_ inside. 
+ member_.reset(); } } // namespace framework diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index fd386a5987..ef09b98b2a 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -75,7 +75,7 @@ class ParallelExecutor { private: void BCastParamsToDevices(const std::unordered_set &vars) const; - ParallelExecutorPrivate *member_; + std::unique_ptr member_; #ifdef PADDLE_WITH_CUDA // ref_cnts_ is only initialized when ParallelExecutor constructs, and then diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index 589905828f..4b9667113b 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -126,7 +126,7 @@ const std::vector ProgramDesc::GetFeedTargetNames() { std::vector feed_target_names; for (auto *op : global_block.AllOps()) { if (op->Type() == kFeedOpType) { - int col = boost::get(op->GetAttr("col")); + size_t col = boost::get(op->GetAttr("col")); if (col >= feed_target_names.size()) { feed_target_names.resize(col + 1); } @@ -143,7 +143,7 @@ const std::vector ProgramDesc::GetFetchTargetNames() { std::vector fetch_target_names; for (auto *op : global_block.AllOps()) { if (op->Type() == kFetchOpType) { - int col = boost::get(op->GetAttr("col")); + size_t col = boost::get(op->GetAttr("col")); if (col >= fetch_target_names.size()) { fetch_target_names.resize(col + 1); } diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc index 7e689a37da..48bde2785e 100644 --- a/paddle/fluid/framework/program_desc_test.cc +++ b/paddle/fluid/framework/program_desc_test.cc @@ -103,7 +103,7 @@ TEST(ProgramDesc, copy_ctor) { ASSERT_EQ(1, op->GetBlockAttrId("sub_block")); found_sub_block = true; - ASSERT_EQ(2, op->GetBlocksAttrIds("sub_blocks").size()); + ASSERT_EQ(2UL, op->GetBlocksAttrIds("sub_blocks").size()); found_sub_blocks = true; } } diff --git 
a/paddle/fluid/framework/reader_test.cc b/paddle/fluid/framework/reader_test.cc index f0d07cb7c1..d812417a38 100644 --- a/paddle/fluid/framework/reader_test.cc +++ b/paddle/fluid/framework/reader_test.cc @@ -39,8 +39,8 @@ TEST(READER, decorate_chain) { { auto endpoints = root->GetEndPoints(); ASSERT_EQ(endpoints.size(), 2U); - ASSERT_NE(endpoints.count(end_point1.get()), 0); - ASSERT_NE(endpoints.count(end_point2.get()), 0); + ASSERT_NE(endpoints.count(end_point1.get()), 0UL); + ASSERT_NE(endpoints.count(end_point2.get()), 0UL); } { diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 1a727a2c8c..a4abd1b128 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -49,18 +49,18 @@ int64_t GetEagerDeletionThreshold() { Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); kids_.push_back(new Scope(this)); return *kids_.back(); } Variable* Scope::Var(const std::string& name) { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); return VarInternal(name); } Variable* Scope::Var(std::string* name) { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; @@ -69,29 +69,34 @@ Variable* Scope::Var(std::string* name) { } Variable* Scope::FindVar(const std::string& name) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); return FindVarInternal(name); } +Variable* Scope::FindLocalVar(const std::string& name) const { + std::lock_guard lock(mutex_); + return FindVarLocally(name); +} + const Scope* Scope::FindScope(const Variable* var) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); return FindScopeInternal(var); } void Scope::DropKids() { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const 
Scope* scope) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); std::vector known_vars; known_vars.reserve(this->vars_.size()); for (auto& p : vars_) { @@ -101,7 +106,7 @@ std::vector Scope::LocalVarNames() const { } void Scope::DeleteScope(Scope* scope) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); this->kids_.erase(it); @@ -114,7 +119,7 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); std::set var_set(var_names.begin(), var_names.end()); for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { @@ -127,12 +132,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index e42fff1d79..14f9f36812 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -63,6 +63,11 @@ class Scope { /// Caller doesn't own the returned Variable. Variable* FindVar(const std::string& name) const; + /// Find a variable in the current scope. + /// Return nullptr if cannot find. 
+ /// Caller doesn't own the returned Variable. + Variable* FindLocalVar(const std::string& name) const; + const Scope* parent() const { return parent_; } /// Find the scope or an ancestor scope that contains the given variable. diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc index 928e1ad8b9..9c427a4ae4 100644 --- a/paddle/fluid/framework/selected_rows_test.cc +++ b/paddle/fluid/framework/selected_rows_test.cc @@ -91,7 +91,7 @@ TEST(SelectedRows, SparseTable) { ASSERT_TRUE(table.HasKey(10)); ASSERT_TRUE(table.HasKey(8)); ASSERT_TRUE(table.HasKey(6)); - ASSERT_EQ(table.rows().size(), 3); + ASSERT_EQ(table.rows().size(), 3UL); framework::Tensor ids; ids.Resize(framework::make_ddim({4})); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 1d7a2eb5b3..69bcbc0e58 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -36,6 +36,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } memory::Copy(boost::get(dst_place), dst_ptr, boost::get(src_place), src_ptr, size); } @@ -71,6 +76,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto stream = reinterpret_cast(ctx).stream(); if (platform::is_same_place(src_place, dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else { @@ -114,6 +124,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto dst_ptr = dst->mutable_data(dst_place, src.type()); auto size = src.numel() * 
SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data from " << src_place << " to " + << dst_place; + return; + } memory::Copy(boost::get(dst_place), dst_ptr, boost::get(src_place), src_ptr, size); } @@ -130,6 +145,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr); } else if (platform::is_gpu_place(src_place) && platform::is_gpu_place(dst_place)) { + if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) { + VLOG(3) << "Skip copy the same data from " << src_place << " to " + << dst_place; + return; + } auto src_gpu_place = boost::get(src_place); auto dst_gpu_place = boost::get(dst_place); memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index a1e5b967a8..793ccfc79f 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -41,6 +41,11 @@ TEST(TensorCopy, Tensor) { EXPECT_EQ(src_ptr[i], dst_ptr[i]); } + TensorCopy(dst_tensor, *cpu_place, &dst_tensor); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); Tensor slice_tensor = src_tensor.Slice(1, 2); @@ -82,6 +87,15 @@ TEST(TensorCopy, Tensor) { EXPECT_EQ(src_ptr[i], dst_ptr[i]); } + // Copy the same tensor + TensorCopy(gpu_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + gpu_ctx.Wait(); + const int* dst_ptr_tmp = dst_tensor.data(); + EXPECT_NE(src_ptr, dst_ptr_tmp); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]); + } + Tensor slice_tensor = src_tensor.Slice(1, 2); // CPU Slice Tensor to GPU Tensor diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h index 
e33849ef50..9d3fb81119 100644 --- a/paddle/fluid/framework/var_desc.h +++ b/paddle/fluid/framework/var_desc.h @@ -59,6 +59,7 @@ class VarDesc { public: explicit VarDesc(const std::string &name) { desc_.set_name(name); + // TODO(paddle-dev): Why default to lodtensor. desc_.mutable_type()->set_type(proto::VarType::LOD_TENSOR); } diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 067e0c2b83..873e1b20a5 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -38,8 +38,12 @@ class Variable { template T* GetMutable() { - if (!IsType()) { + if (!holder_) { holder_.reset(new PlaceholderImpl(new T())); + } else { + PADDLE_ENFORCE(IsType(), + "Variable must be type %s, the holding type is %s", + typeid(T).name(), holder_->Type().name()); } return static_cast(holder_->Ptr()); } diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index c5c1d215f4..003dcfd3df 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -33,9 +33,10 @@ TEST(Variable, GetMutable) { const Tensor& tt = v->Get(); EXPECT_EQ(1234, tt.content_); - std::string* s = v->GetMutable(); - *s = "hello"; - - const std::string& ss = v->Get(); - EXPECT_EQ("hello", ss); + try { + v->GetMutable(); + } catch (std::exception& e) { + return; + } + EXPECT_TRUE(false); } diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index ec1bc7825d..9794a193bc 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -19,9 +19,19 @@ cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) add_subdirectory(api) +set(STATIC_INFERENCE_APIS paddle_fluid_api paddle_inference_api analysis_predictor) +set(SHARED_INFERENCE_SRCS + io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc + ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc + 
${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc) +if (WITH_GPU AND TENSORRT_FOUND) + set(STATIC_INFERENCE_APIS ${STATIC_INFERENCE_APIS} paddle_inference_tensorrt_subgraph_engine) + set(SHARED_INFERENCE_SRCS ${SHARED_INFERENCE_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/api/api_tensorrt_subgraph_engine.cc) +endif() + # Create static library -cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api - analysis_predictor zero_copy_tensor) +cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor) + if(NOT APPLE) # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym") @@ -29,10 +39,7 @@ if(NOT APPLE) endif() # Create shared library -cc_library(paddle_fluid_shared SHARED - SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc - ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc - ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc +cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index 8a8aeb5e09..2e79d495d5 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -70,7 +70,7 @@ class DfgPassManagerImpl final : public DfgPassManager { auto trt_teller = [&](const Node* node) { std::unordered_set teller_set( {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", - "depthwise_conv2d", "batch_norm", "concat", "tanh", + "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", "elementwise_add", "dropout"}); if (!node->IsFunction()) return false; @@ -101,7 +101,13 @@ Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); } void 
Analyzer::Run(Argument* argument) { std::vector passes; - for (auto& pass : all_ir_passes_) { +#ifdef PADDLE_WITH_MKLDNN + if (use_mkldnn_) { + VLOG(3) << "Adding MKL-DNN placement pass"; + passes.push_back("mkldnn_placement_pass"); + } +#endif + for (auto& pass : ir_passes_) { if (!disabled_ir_passes_.count(pass)) { passes.push_back(pass); passes.push_back("graph_viz_pass"); // add graphviz for debug. @@ -117,11 +123,26 @@ void Analyzer::Run(Argument* argument) { } } +Analyzer& Analyzer::IncludeAllIrPasses() { + ir_passes_ = all_ir_passes_; + return *this; +} + Analyzer& Analyzer::DisableIrPasses(const std::vector& passes) { disabled_ir_passes_.insert(passes.begin(), passes.end()); return *this; } +Analyzer& Analyzer::IncludeIrPasses(const std::vector& passes) { + ir_passes_ = passes; + return *this; +} + +Analyzer& Analyzer::SetUseMkldnn(bool use_mkldnn) { + use_mkldnn_ = use_mkldnn; + return *this; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 0aa9367bf5..c51a4fdb2f 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -54,6 +54,9 @@ class Analyzer : public OrderedRegistry { void Run(Argument* argument); Analyzer& DisableIrPasses(const std::vector& passes); + Analyzer& IncludeIrPasses(const std::vector& passes); + Analyzer& IncludeAllIrPasses(); + Analyzer& SetUseMkldnn(bool use_mkldnn); DISABLE_COPY_AND_ASSIGN(Analyzer); @@ -64,21 +67,29 @@ class Analyzer : public OrderedRegistry { // larger fusion. const std::vector all_ir_passes_{{ // Manual update the passes here. 
- "infer_clean_graph_pass", // - "attention_lstm_fuse_pass", // - "embedding_fc_lstm_fuse_pass", // - "fc_lstm_fuse_pass", // - "mul_lstm_fuse_pass", // - "fc_gru_fuse_pass", // - "mul_gru_fuse_pass", // - "seq_concat_fc_fuse_pass", // - "fc_fuse_pass", // + "infer_clean_graph_pass", // + "attention_lstm_fuse_pass", // + "seqconv_eltadd_relu_fuse_pass", // + "embedding_fc_lstm_fuse_pass", // + "fc_lstm_fuse_pass", // + "mul_lstm_fuse_pass", // + "fc_gru_fuse_pass", // + "mul_gru_fuse_pass", // + "seq_concat_fc_fuse_pass", // + "fc_fuse_pass", // + "conv_bn_fuse_pass", // + "conv_eltwiseadd_bn_fuse_pass", // #ifdef PADDLE_WITH_MKLDNN - "conv_relu_mkldnn_fuse_pass", // + "conv_bias_mkldnn_fuse_pass", // + "conv_relu_mkldnn_fuse_pass", // + "conv_elementwise_add_mkldnn_fuse_pass", // #endif }}; std::unordered_set disabled_ir_passes_; + // Ir passes to run + std::vector ir_passes_; + bool use_mkldnn_; }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index f90910ac0d..5430e5c1ef 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -51,9 +51,7 @@ void TestWord2vecPrediction(const std::string& model_path) { config.model_dir = model_path; config.use_gpu = false; config.device = 0; - auto predictor = - ::paddle::CreatePaddlePredictor( - config); + auto predictor = ::paddle::CreatePaddlePredictor(config); // One single batch diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3bc6af5241..eec6657671 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -25,9 +25,11 @@ #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/singleton.h" +#include 
"paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(profile); +DECLARE_int32(paddle_num_threads); namespace paddle { @@ -47,6 +49,9 @@ bool AnalysisPredictor::Init( } #endif + // no matter with or without MKLDNN + paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); + if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim " @@ -72,10 +77,6 @@ bool AnalysisPredictor::Init( inference_program_ = program; } - if (config_._use_mkldnn) { - executor_->EnableMKLDNN(*inference_program_); - } - executor_->Prepare(scope_.get(), *inference_program_, 0, config_.use_feed_fetch_ops); @@ -220,10 +221,24 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.origin_program_desc.reset( new ProgramDesc(*inference_program_->Proto())); - PADDLE_ENFORCE( - config_.ir_mode == contrib::AnalysisConfig::IrPassMode::kExclude, - "Only kExclude is supported yet."); - Analyzer().DisableIrPasses(config_.ir_passes).Run(&argument_); + + switch (config_.ir_mode) { + case contrib::AnalysisConfig::IrPassMode::kExclude: + Analyzer() + .IncludeAllIrPasses() + .SetUseMkldnn(config_._use_mkldnn) + .DisableIrPasses(config_.ir_passes) + .Run(&argument_); + break; + case contrib::AnalysisConfig::IrPassMode::kInclude: + Analyzer() + .SetUseMkldnn(config_._use_mkldnn) + .IncludeIrPasses(config_.ir_passes) + .Run(&argument_); + break; + default: + LOG(ERROR) << "Only kExclude and kInclude modes are supoorted yet."; + } CHECK(argument_.transformed_program_desc); VLOG(5) << "to prepare executor"; @@ -335,6 +350,19 @@ bool AnalysisPredictor::LoadProgramDesc() { } return true; } + +AnalysisPredictor::~AnalysisPredictor() { +#if !defined(_WIN32) + if (FLAGS_profile) { + platform::DisableProfiler(platform::EventSortingKey::kTotal, + "./profile.log"); + } +#endif + if (sub_scope_) { + scope_->DeleteScope(sub_scope_); + } +} + std::unique_ptr 
AnalysisPredictor::Clone() { auto *x = new AnalysisPredictor(config_); x->Init(scope_, inference_program_); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 0d01d7ac2b..5a9f4d3695 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -72,6 +72,7 @@ class AnalysisPredictor : public PaddlePredictor { template void GetFetchOne(const framework::LoDTensor &fetchs, PaddleTensor *output_data); + ~AnalysisPredictor(); private: contrib::AnalysisConfig config_; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 1d25f55b31..13c25da1b5 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -27,9 +27,7 @@ TEST(AnalysisPredictor, ZeroCopy) { config.model_dir = FLAGS_dirname + "/word2vec.inference.model"; config.use_feed_fetch_ops = false; - auto predictor = - CreatePaddlePredictor( - config); + auto predictor = CreatePaddlePredictor(config); auto w0 = predictor->GetInputTensor("firstw"); auto w1 = predictor->GetInputTensor("secondw"); diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 6682e0a81b..7cda9c5d8a 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -23,9 +23,11 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/profiler.h" DEFINE_bool(profile, false, "Turn on profiler for fluid"); +DECLARE_int32(paddle_num_threads); namespace paddle { namespace { @@ -72,6 +74,9 @@ bool NativePaddlePredictor::Init( } #endif + // no matter with or without MKLDNN + paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); + if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); } else { diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index bed7c87131..b7b8ee6ea0 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -205,7 +205,7 @@ void MainThreadsWord2Vec(bool use_gpu) { float* ref_data = refs[tid].data(); EXPECT_EQ(refs[tid].numel(), static_cast(len / sizeof(float))); for (int i = 0; i < refs[tid].numel(); ++i) { - EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF); + EXPECT_NEAR(ref_data[i], data[i], 2e-3); } }); } diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index 5ee6a5a931..7ac468ee4d 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -185,3 +185,4 @@ USE_TRT_CONVERTER(softmax); USE_TRT_CONVERTER(batch_norm); USE_TRT_CONVERTER(concat); USE_TRT_CONVERTER(dropout); +USE_TRT_CONVERTER(pad); diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc index fc6310e90b..702158ea3b 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc @@ -41,11 +41,8 @@ void 
CompareTensorRTWithFluid(bool enable_tensorrt) { config1.device = 0; config1.max_batch_size = 10; - auto predictor0 = - CreatePaddlePredictor(config0); - auto predictor1 = - CreatePaddlePredictor(config1); + auto predictor0 = CreatePaddlePredictor(config0); + auto predictor1 = CreatePaddlePredictor(config1); for (int batch_id = 0; batch_id < 1; batch_id++) { //# 2. Prepare input. diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index d4e6bb3e4a..03f0f726eb 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -3,6 +3,7 @@ project(cpp_inference_demo CXX C) option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON) option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) +option(USE_TENSORRT "Compile demo with TensorRT." OFF) macro(safe_set_static_flag) foreach(flag_var @@ -60,6 +61,13 @@ endif(NOT WIN32) include_directories("${PADDLE_LIB}/third_party/boost") include_directories("${PADDLE_LIB}/third_party/eigen3") +if (NOT WIN32) + if (USE_TENSORRT AND WITH_GPU) + include_directories("${TENSORRT_INCLUDE_DIR}") + link_directories("${TENSORRT_LIB_DIR}") + endif() +endif(NOT WIN32) + if (NOT WIN32) link_directories("${PADDLE_LIB}/third_party/install/snappy/lib") link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib") @@ -69,7 +77,7 @@ endif(NOT WIN32) link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") link_directories("${PADDLE_LIB}/third_party/install/glog/lib") link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") -link_directories("${PADDLE_LIB}/paddle/fluid/inference") +link_directories("${PADDLE_LIB}/paddle/lib") add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) @@ -89,10 +97,10 @@ endif() # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a 
if(WITH_STATIC_LIB) set(DEPS - ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) + ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) else() set(DEPS - ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) + ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) endif() if (NOT WIN32) @@ -112,6 +120,10 @@ endif(NOT WIN32) if(WITH_GPU) if(NOT WIN32) + if (USE_TENSORRT) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) else() set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} ) diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 44335a872f..67994aad70 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -3,11 +3,15 @@ PADDLE_ROOT=$1 TURN_ON_MKL=$2 # use MKL or Openblas TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode DATA_DIR=$4 # dataset +TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, defalut to /usr/local/TensorRT/include +TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib +inference_install_dir=${PADDLE_ROOT}/build/fluid_inference_install_dir + cd `dirname $0` current_dir=`pwd` if [ $2 == ON ]; then # You can export yourself if move the install path - MKL_LIB=${PADDLE_ROOT}/build/fluid_install_dir/third_party/install/mklml/lib + MKL_LIB=${inference_install_dir}/third_party/install/mklml/lib export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${MKL_LIB} fi if [ $3 == ON ]; then @@ -16,6 +20,11 @@ else use_gpu_list='false' fi +USE_TENSORRT=OFF +if [ [-d"$TENSORRT_INCLUDE_DIR"] -a [-d"$TENSORRT_LIB_DIR"] ]; then + USE_TENSORRT=ON +fi + PREFIX=inference-vis-demos%2F URL_ROOT=http://paddlemodels.cdn.bcebos.com/${PREFIX} @@ 
-47,7 +56,7 @@ cd build for WITH_STATIC_LIB in ON OFF; do # -----simple_on_word2vec----- rm -rf * - cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \ + cmake .. -DPADDLE_LIB=${inference_install_dir} \ -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=simple_on_word2vec \ -DWITH_GPU=$TEST_GPU_CPU \ @@ -67,7 +76,7 @@ for WITH_STATIC_LIB in ON OFF; do fi # ---------vis_demo--------- rm -rf * - cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \ + cmake .. -DPADDLE_LIB=${inference_install_dir} \ -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=vis_demo \ -DWITH_GPU=$TEST_GPU_CPU \ @@ -86,5 +95,23 @@ for WITH_STATIC_LIB in ON OFF; do fi done done + + # --------tensorrt mobilenet------ + if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then + rm -rf * + cmake .. -DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=trt_mobilenet_demo \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DUSE_TENSORRT=$USE_TENSORRT \ + -DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \ + -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR + make -j + ./trt_mobilenet_demo \ + --modeldir=$DATA_DIR/mobilenet/model \ + --data=$DATA_DIR/mobilenet/data.txt \ + --refer=$DATA_DIR/mobilenet/result.txt + fi done set +x diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc index 8058d7e881..5446fd4d42 100644 --- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc +++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc @@ -23,7 +23,7 @@ limitations under the License. 
*/ #include #include //NOLINT -#include "paddle/fluid/inference/paddle_inference_api.h" +#include "paddle/include/paddle_inference_api.h" DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_bool(use_gpu, false, "Whether use gpu."); @@ -42,8 +42,7 @@ void Main(bool use_gpu) { config.use_gpu = use_gpu; config.fraction_of_gpu_memory = 0.15; config.device = 0; - auto predictor = - CreatePaddlePredictor(config); + auto predictor = CreatePaddlePredictor(config); for (int batch_id = 0; batch_id < 3; batch_id++) { //# 2. Prepare input. @@ -85,8 +84,7 @@ void MainThreads(int num_threads, bool use_gpu) { config.use_gpu = use_gpu; config.fraction_of_gpu_memory = 0.15; config.device = 0; - auto main_predictor = - CreatePaddlePredictor(config); + auto main_predictor = CreatePaddlePredictor(config); std::vector threads; for (int tid = 0; tid < num_threads; ++tid) { diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc new file mode 100644 index 0000000000..4a8404f21c --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains demo of mobilenet for tensorrt. + */ + +#include +#include // use glog instead of CHECK to avoid importing other paddle header files. 
+#include "utils.h" // NOLINT + +DECLARE_double(fraction_of_gpu_memory_to_use); +DEFINE_string(modeldir, "", "Directory of the inference model."); +DEFINE_string(refer, "", "path to reference result for comparison."); +DEFINE_string( + data, "", + "path of data; each line is a record, format is " + "'\t predictor; + paddle::contrib::MixedRTConfig config; + config.param_file = FLAGS_modeldir + "/__params__"; + config.prog_file = FLAGS_modeldir + "/__model__"; + config.use_gpu = true; + config.device = 0; + config.max_batch_size = 1; + config.fraction_of_gpu_memory = 0.1; // set by yourself + predictor = CreatePaddlePredictor(config); + + VLOG(3) << "begin to process data"; + // Just a single batch of data. + std::string line; + std::ifstream file(FLAGS_data); + std::getline(file, line); + auto record = ProcessALine(line); + file.close(); + + // Inference. + PaddleTensor input; + input.shape = record.shape; + input.data = + PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); + input.dtype = PaddleDType::FLOAT32; + + VLOG(3) << "run executor"; + std::vector output; + predictor->Run({input}, &output, 1); + + VLOG(3) << "output.size " << output.size(); + auto& tensor = output.front(); + VLOG(3) << "output: " << SummaryTensor(tensor); + + // compare with reference result + CheckOutput(FLAGS_refer, tensor); +} + +} // namespace demo +} // namespace paddle + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + paddle::demo::Main(); + return 0; +} diff --git a/paddle/fluid/inference/api/demo_ci/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h index cb89906711..d70c6aea79 100644 --- a/paddle/fluid/inference/api/demo_ci/utils.h +++ b/paddle/fluid/inference/api/demo_ci/utils.h @@ -14,13 +14,20 @@ #pragma once #include +#include +#include #include #include -#include "paddle/fluid/inference/paddle_inference_api.h" +#include "paddle/include/paddle_inference_api.h" namespace paddle { namespace demo { +struct Record { + 
std::vector data; + std::vector shape; +}; + static void split(const std::string& str, char sep, std::vector* pieces) { pieces->clear(); @@ -39,6 +46,58 @@ static void split(const std::string& str, char sep, } } +Record ProcessALine(const std::string& line) { + VLOG(3) << "process a line"; + std::vector columns; + split(line, '\t', &columns); + CHECK_EQ(columns.size(), 2UL) + << "data format error, should be \t"; + + Record record; + std::vector data_strs; + split(columns[0], ' ', &data_strs); + for (auto& d : data_strs) { + record.data.push_back(std::stof(d)); + } + + std::vector shape_strs; + split(columns[1], ' ', &shape_strs); + for (auto& s : shape_strs) { + record.shape.push_back(std::stoi(s)); + } + VLOG(3) << "data size " << record.data.size(); + VLOG(3) << "data shape size " << record.shape.size(); + return record; +} + +void CheckOutput(const std::string& referfile, const PaddleTensor& output) { + std::string line; + std::ifstream file(referfile); + std::getline(file, line); + auto refer = ProcessALine(line); + file.close(); + + size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); + VLOG(3) << "predictor output numel " << numel; + VLOG(3) << "reference output numel " << refer.data.size(); + CHECK_EQ(numel, refer.data.size()); + switch (output.dtype) { + case PaddleDType::INT64: { + for (size_t i = 0; i < numel; ++i) { + CHECK_EQ(static_cast(output.data.data())[i], refer.data[i]); + } + break; + } + case PaddleDType::FLOAT32: + for (size_t i = 0; i < numel; ++i) { + CHECK_LT( + fabs(static_cast(output.data.data())[i] - refer.data[i]), + 1e-5); + } + break; + } +} + /* * Get a summary of a PaddleTensor content. */ diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index fb59cea457..8d546e3e9c 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -18,11 +18,7 @@ limitations under the License. 
*/ #include #include // use glog instead of CHECK to avoid importing other paddle header files. -#include -#include - -// #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/inference/demo_ci/utils.h" +#include "utils.h" // NOLINT #ifdef PADDLE_WITH_CUDA DECLARE_double(fraction_of_gpu_memory_to_use); @@ -38,70 +34,13 @@ DEFINE_bool(use_gpu, false, "Whether use gpu."); namespace paddle { namespace demo { -struct Record { - std::vector data; - std::vector shape; -}; - -void split(const std::string& str, char sep, std::vector* pieces); - -Record ProcessALine(const std::string& line) { - VLOG(3) << "process a line"; - std::vector columns; - split(line, '\t', &columns); - CHECK_EQ(columns.size(), 2UL) - << "data format error, should be \t"; - - Record record; - std::vector data_strs; - split(columns[0], ' ', &data_strs); - for (auto& d : data_strs) { - record.data.push_back(std::stof(d)); - } - - std::vector shape_strs; - split(columns[1], ' ', &shape_strs); - for (auto& s : shape_strs) { - record.shape.push_back(std::stoi(s)); - } - VLOG(3) << "data size " << record.data.size(); - VLOG(3) << "data shape size " << record.shape.size(); - return record; -} - -void CheckOutput(const std::string& referfile, const PaddleTensor& output) { - std::string line; - std::ifstream file(referfile); - std::getline(file, line); - auto refer = ProcessALine(line); - file.close(); - - size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); - VLOG(3) << "predictor output numel " << numel; - VLOG(3) << "reference output numel " << refer.data.size(); - CHECK_EQ(numel, refer.data.size()); - switch (output.dtype) { - case PaddleDType::INT64: { - for (size_t i = 0; i < numel; ++i) { - CHECK_EQ(static_cast(output.data.data())[i], refer.data[i]); - } - break; - } - case PaddleDType::FLOAT32: - for (size_t i = 0; i < numel; ++i) { - CHECK_LT( - fabs(static_cast(output.data.data())[i] - refer.data[i]), - 1e-5); - } - break; - } -} - +using contrib::AnalysisConfig; /* - * 
Use the native fluid engine to inference the demo. + * Use the native and analysis fluid engine to inference the demo. */ void Main(bool use_gpu) { - NativeConfig config; + std::unique_ptr predictor, analysis_predictor; + AnalysisConfig config; config.param_file = FLAGS_modeldir + "/__params__"; config.prog_file = FLAGS_modeldir + "/__model__"; config.use_gpu = use_gpu; @@ -111,8 +50,8 @@ void Main(bool use_gpu) { } VLOG(3) << "init predictor"; - auto predictor = - CreatePaddlePredictor(config); + predictor = CreatePaddlePredictor(config); + analysis_predictor = CreatePaddlePredictor(config); VLOG(3) << "begin to process data"; // Just a single batch of data. @@ -130,8 +69,8 @@ void Main(bool use_gpu) { input.dtype = PaddleDType::FLOAT32; VLOG(3) << "run executor"; - std::vector output; - predictor->Run({input}, &output); + std::vector output, analysis_output; + predictor->Run({input}, &output, 1); VLOG(3) << "output.size " << output.size(); auto& tensor = output.front(); @@ -139,6 +78,10 @@ void Main(bool use_gpu) { // compare with reference result CheckOutput(FLAGS_refer, tensor); + + // the analysis_output has some diff with native_output, + // TODO(luotao): add CheckOutput for analysis_output later. + analysis_predictor->Run({input}, &analysis_output, 1); } } // namespace demo @@ -146,9 +89,10 @@ void Main(bool use_gpu) { int main(int argc, char** argv) { google::ParseCommandLineFlags(&argc, &argv, true); - paddle::demo::Main(false /* use_gpu*/); if (FLAGS_use_gpu) { paddle::demo::Main(true /*use_gpu*/); + } else { + paddle::demo::Main(false /*use_gpu*/); } return 0; } diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index d2876dc27c..07ee6e72d1 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -259,10 +259,17 @@ struct AnalysisConfig : public NativeConfig { kExclude // Specify the disabled passes in `ir_passes`. 
}; + void SetIncludeMode() { + ir_mode = IrPassMode::kInclude; + // this pass has to be run at the beginning of all fuse passes + ir_passes = {"infer_clean_graph_pass"}; + } + // Determine whether to perform graph optimization. bool enable_ir_optim = true; // Manually determine the IR passes to run. IrPassMode ir_mode{IrPassMode::kExclude}; + // passes to be excluded/included std::vector ir_passes{"embedding_fc_lstm_fuse_pass"}; // NOT stable yet. diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index fac1babf6e..0a35e10f69 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,7 +1,7 @@ # Add TRT tests nv_library(tensorrt_converter SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc -batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc +batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc pad_op.cc DEPS tensorrt_engine operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS @@ -26,6 +26,8 @@ nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL) nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL) - nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine dropout_op SERIAL) + +nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine pad_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc new file mode 100644 index 0000000000..218030a591 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * PadOp. + */ +class PadOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert a fluid transpose op to tensorrt tranpose layer"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + + const std::vector paddings = + boost::get>(op_desc.GetAttr("paddings")); + const float pad_value = boost::get(op_desc.GetAttr("pad_value")); + + nvinfer1::Dims input_shape = input->getDimensions(); + int nbDims = input_shape.nbDims; + int pad_size = static_cast(paddings.size()); + PADDLE_ENFORCE_GE(nbDims, 2); + PADDLE_ENFORCE_EQ((nbDims + 1) * 2, pad_size); + PADDLE_ENFORCE(pad_value == 0.0, "The pad layer of TRT only support zero."); + + nvinfer1::DimsHW pre_pad(paddings[pad_size - 4], paddings[pad_size - 2]); + nvinfer1::DimsHW post_pad(paddings[pad_size - 3], paddings[pad_size - 1]); + + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, + *const_cast(input), + pre_pad, post_pad); + + PADDLE_ENFORCE(layer != nullptr); + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + layer->setName(("scale (Output: " + output_name + ")").c_str()); + 
layer->getOutput(0)->setName(output_name.c_str()); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. + engine_->DeclareOutput(output_name); + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(pad, PadOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc new file mode 100644 index 0000000000..ba35d7ddbb --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(PadConverter, main) { + framework::Scope scope; + std::unordered_set parameters; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("pad-X", nvinfer1::Dims3(3, 2, 2)); + validator.DeclOutputVar("pad-Out", nvinfer1::Dims3(3, 3, 5)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("pad"); + desc.SetInput("X", {"pad-X"}); + desc.SetOutput("Out", {"pad-Out"}); + + std::vector paddings = {0, 0, 0, 0, 0, 1, 1, 2}; + float pad_value = 0.0; + desc.SetAttr("paddings", paddings); + desc.SetAttr("pad_value", pad_value); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(2); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(pad); diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc index 290fb007d8..c2151eea08 100644 --- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -52,9 +52,10 @@ void SetInput(std::vector> *inputs) { } // Easy for profiling independently. 
-TEST(Analyzer_resnet50, profile) { +void profile(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); + cfg._use_mkldnn = use_mkldnn; std::vector outputs; std::vector> input_slots_all; @@ -69,6 +70,11 @@ TEST(Analyzer_resnet50, profile) { } } +TEST(Analyzer_resnet50, profile) { profile(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_resnet50, profile_mkldnn) { profile(true /* use_mkldnn */); } +#endif + // Check the fuse status TEST(Analyzer_resnet50, fuse_statis) { AnalysisConfig cfg; @@ -82,15 +88,21 @@ TEST(Analyzer_resnet50, fuse_statis) { } // Compare result of NativeConfig and AnalysisConfig -TEST(Analyzer_resnet50, compare) { +void compare(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); + cfg._use_mkldnn = use_mkldnn; std::vector> input_slots_all; SetInput(&input_slots_all); CompareNativeAndAnalysis(cfg, input_slots_all); } +TEST(Analyzer_resnet50, compare) { compare(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_resnet50, compare_mkldnn) { compare(true /* use_mkldnn */); } +#endif + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index c76d72ccd9..6399476680 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -308,18 +308,13 @@ TEST(Analyzer_rnn1, ZeroCopy) { PaddlePlace place; int output_size{0}; - auto predictor = - CreatePaddlePredictor( - config); + auto predictor = CreatePaddlePredictor(config); config.use_feed_fetch_ops = true; - auto native_predictor = - CreatePaddlePredictor(config); + auto native_predictor = CreatePaddlePredictor(config); config.use_feed_fetch_ops = true; // the analysis predictor needs feed/fetch. 
- auto analysis_predictor = - CreatePaddlePredictor( - config); + auto analysis_predictor = CreatePaddlePredictor(config); #define NEW_TENSOR(name__) \ auto name__##_tensor = predictor->GetInputTensor(#name__); diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc index ba04d030b9..e0eb919bd8 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc @@ -18,12 +18,12 @@ namespace paddle { namespace inference { using namespace framework; // NOLINT +static std::vector result_data; struct DataRecord { std::vector>> link_step_data_all; std::vector lod; std::vector> rnn_link_data; - std::vector result_data; size_t num_samples; // total number of samples size_t batch_iter{0}; size_t batch_size{1}; @@ -57,6 +57,7 @@ struct DataRecord { std::ifstream file(path); std::string line; int num_lines = 0; + result_data.clear(); while (std::getline(file, line)) { num_lines++; std::vector data; @@ -135,13 +136,12 @@ TEST(Analyzer_rnn2, profile) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { // the first inference result - DataRecord data(FLAGS_infer_data, FLAGS_batch_size); PADDLE_ENFORCE_GT(outputs.size(), 0); size_t size = GetSize(outputs[0]); PADDLE_ENFORCE_GT(size, 0); float *result = static_cast(outputs[0].data.data()); for (size_t i = 0; i < size; i++) { - EXPECT_NEAR(result[i], data.result_data[i], 1e-3); + EXPECT_NEAR(result[i], result_data[i], 1e-3); } } } diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index cb4671c437..f590ef2796 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -183,7 +183,13 @@ TEST(Analyzer_seq_conv1, fuse_statis) { SetConfig(&cfg); int num_ops; auto predictor = CreatePaddlePredictor(cfg); - 
GetFuseStatis(predictor.get(), &num_ops); + + auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + ASSERT_TRUE(fuse_statis.count("seqconv_eltadd_relu_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 2); + EXPECT_EQ(fuse_statis.at("seqconv_eltadd_relu_fuse"), 6); + EXPECT_EQ(num_ops, 32); } // Compare result of NativeConfig and AnalysisConfig diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 305b8bfe15..8933296490 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -59,9 +59,6 @@ void SetConfig(AnalysisConfig *cfg) { cfg->specify_input_name = true; // TODO(TJ): fix fusion gru cfg->ir_passes.push_back("fc_gru_fuse_pass"); -#ifdef PADDLE_WITH_MKLDNN - cfg->_use_mkldnn = true; -#endif } void SetInput(std::vector> *inputs) { @@ -84,9 +81,10 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. 
// ocr, mobilenet and se_resnext50 -TEST(Analyzer_vis, profile) { +void profile(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); + cfg._use_mkldnn = use_mkldnn; std::vector outputs; std::vector> input_slots_all; @@ -108,6 +106,12 @@ TEST(Analyzer_vis, profile) { } } +TEST(Analyzer_vis, profile) { profile(); } + +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_vis, profile_mkldnn) { profile(true /* use_mkldnn */); } +#endif + // Check the fuse status TEST(Analyzer_vis, fuse_statis) { AnalysisConfig cfg; @@ -118,15 +122,21 @@ TEST(Analyzer_vis, fuse_statis) { } // Compare result of NativeConfig and AnalysisConfig -TEST(Analyzer_vis, compare) { +void compare(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); + cfg._use_mkldnn = use_mkldnn; std::vector> input_slots_all; SetInput(&input_slots_all); CompareNativeAndAnalysis(cfg, input_slots_all); } +TEST(Analyzer_vis, compare) { compare(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_vis, compare_mkldnn) { compare(true /* use_mkldnn */); } +#endif + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 8603d09cbd..5589b58b06 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -50,7 +50,7 @@ void CompareResult(const std::vector &outputs, auto &ref_out = ref_outputs[i]; size_t size = VecReduceToInt(out.shape); size_t ref_size = VecReduceToInt(ref_out.shape); - EXPECT_GT(size, 0); + EXPECT_GT(size, 0UL); EXPECT_EQ(size, ref_size); EXPECT_EQ(out.dtype, ref_out.dtype); switch (out.dtype) { @@ -77,11 +77,9 @@ void CompareResult(const std::vector &outputs, std::unique_ptr CreateTestPredictor( const AnalysisConfig &config, bool use_analysis = true) { if (use_analysis) { - return CreatePaddlePredictor(config); + return CreatePaddlePredictor(config); } else { - return CreatePaddlePredictor( - config); + return 
CreatePaddlePredictor(config); } } @@ -165,7 +163,8 @@ void TestPrediction(const AnalysisConfig &config, const std::vector> &inputs, std::vector *outputs, int num_threads, bool use_analysis = FLAGS_use_analysis) { - LOG(INFO) << "use_analysis: " << use_analysis; + LOG(INFO) << "use_analysis: " << use_analysis + << ", use_mkldnn: " << config._use_mkldnn; if (num_threads == 1) { TestOneThreadPrediction(config, inputs, outputs, use_analysis); } else { @@ -177,6 +176,7 @@ void TestPrediction(const AnalysisConfig &config, void CompareNativeAndAnalysis( const AnalysisConfig &config, const std::vector> &inputs) { + LOG(INFO) << "use_mkldnn: " << config._use_mkldnn; std::vector native_outputs, analysis_outputs; TestOneThreadPrediction(config, inputs, &native_outputs, false); TestOneThreadPrediction(config, inputs, &analysis_outputs, true); diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index bf320a0cbc..91111f2af5 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -51,11 +51,8 @@ void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) { config1.model_dir = model_dirname; config1.max_batch_size = batch_size; - auto predictor0 = - CreatePaddlePredictor(config0); - auto predictor1 = - CreatePaddlePredictor(config1); + auto predictor0 = CreatePaddlePredictor(config0); + auto predictor1 = CreatePaddlePredictor(config1); // Prepare inputs int height = 224; int width = 224; diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 2ef13b72ed..78ef6f207e 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -86,7 +86,7 @@ function(op_library TARGET) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. 
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op" "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op" - "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") + "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() @@ -230,7 +230,7 @@ if(WITH_DISTRIBUTE) op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) endforeach() - + #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op # listen_and_serv_op sum_op executor SERIAL) @@ -268,6 +268,7 @@ if (WITH_GPU AND TENSORRT_FOUND) else() set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) endif() +op_library(clip_by_norm_op DEPS selected_rows_functor selected_rows) op_library(sum_op DEPS selected_rows_functor) op_library(sgd_op DEPS selected_rows_functor) op_library(print_op DEPS lod_tensor) @@ -283,10 +284,10 @@ op_library(max_sequence_len_op DEPS lod_rank_table) op_library(sequence_conv_op DEPS context_project) op_library(sequence_pool_op DEPS sequence_pooling) if (NOT WIN32) -op_library(lstm_op DEPS sequence2batch lstm_compute) -op_library(hierarchical_sigmoid_op DEPS matrix_bit_code) -op_library(lstmp_op DEPS sequence2batch lstm_compute) -op_library(gru_op DEPS sequence2batch gru_compute) + op_library(lstm_op DEPS sequence2batch lstm_compute) + op_library(hierarchical_sigmoid_op DEPS matrix_bit_code) + op_library(lstmp_op DEPS sequence2batch lstm_compute) + op_library(gru_op DEPS sequence2batch gru_compute) endif(NOT WIN32) op_library(recurrent_op DEPS executor) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) @@ -299,11 +300,12 @@ 
op_library(flatten_op DEPS reshape_op) op_library(sequence_pad_op DEPS sequence_padding) op_library(unstack_op DEPS stack_op) op_library(fake_quantize_op DEPS memory) -op_library(fusion_lstm_op DEPS cpu_lstm_compute) +op_library(fusion_lstm_op DEPS jit_kernel) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(layer_norm_op DEPS cub) op_library(reduce_mean_op DEPS cub) + op_library(affine_channel_op DEPS cub) else() op_library(conv_op DEPS vol2col im2col) endif() @@ -314,7 +316,7 @@ op_library(save_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor) op_library(save_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor) -op_library(concat_op DEPS concat) +op_library(concat_op DEPS concat_and_split) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) @@ -346,6 +348,6 @@ cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) if(NOT WIN32) -nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) + nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) endif() nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) diff --git a/paddle/fluid/operators/adadelta_op.cc b/paddle/fluid/operators/adadelta_op.cc index d1970515f5..89a7a49e0f 100644 --- a/paddle/fluid/operators/adadelta_op.cc +++ b/paddle/fluid/operators/adadelta_op.cc @@ -18,6 +18,7 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; + class AdadeltaOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -31,6 +32,16 @@ class AdadeltaOp : public framework::OperatorWithKernel { "Input(AvgSquaredGrad) of AdadeltaOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"), "Input(AvgSquaredUpdate) of AdadeltaOp 
should not be null."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Param").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Grad").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front()); PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(ParamOut) of AdadeltaOp should not be null."); @@ -56,6 +67,7 @@ class AdadeltaOp : public framework::OperatorWithKernel { ctx->SetOutputDim("AvgSquaredGradOut", param_dim); ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim); } + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto input_data_type = diff --git a/paddle/fluid/operators/adadelta_op.h b/paddle/fluid/operators/adadelta_op.h index 822458daf6..6c616aa03d 100644 --- a/paddle/fluid/operators/adadelta_op.h +++ b/paddle/fluid/operators/adadelta_op.h @@ -23,6 +23,17 @@ template class AdadeltaOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + const auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE(grad_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Grad").front(), grad_var->Type().name()); + auto param_out_tensor = ctx.Output("ParamOut"); auto avg_squared_grad_out_tensor = ctx.Output("AvgSquaredGradOut"); diff --git a/paddle/fluid/operators/adagrad_op.h b/paddle/fluid/operators/adagrad_op.h index df520fcc89..0a16ce00f7 100644 --- 
a/paddle/fluid/operators/adagrad_op.h +++ b/paddle/fluid/operators/adagrad_op.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -21,25 +22,31 @@ namespace operators { template struct SparseAdagradFunctor { - void operator()(const DeviceContext& context, - const framework::SelectedRows& grad, - const framework::Tensor& learning_rate, T epsilon, - framework::Tensor* moment, framework::Tensor* param); + void operator()(const DeviceContext &context, + const framework::SelectedRows &grad, + const framework::Tensor &learning_rate, T epsilon, + framework::Tensor *moment, framework::Tensor *param); }; template class AdagradOpKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* param_out_tensor = ctx.Output("ParamOut"); - auto* moment_out_tensor = ctx.Output("MomentOut"); + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + + auto *param_out_tensor = ctx.Output("ParamOut"); + auto *moment_out_tensor = ctx.Output("MomentOut"); param_out_tensor->mutable_data(ctx.GetPlace()); moment_out_tensor->mutable_data(ctx.GetPlace()); T epsilon = static_cast(ctx.Attr("epsilon")); - auto* grad_var = ctx.InputVar("Grad"); + auto *grad_var = ctx.InputVar("Grad"); if (grad_var->IsType()) { auto param = framework::EigenVector::Flatten( *ctx.Input("Param")); @@ -47,16 +54,16 @@ class AdagradOpKernel : public framework::OpKernel { *ctx.Input("Grad")); auto moment = framework::EigenVector::Flatten( *ctx.Input("Moment")); - auto* learning_rate = ctx.Input("LearningRate"); + auto *learning_rate = 
ctx.Input("LearningRate"); auto param_out = framework::EigenVector::Flatten(*param_out_tensor); auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto* place = ctx.template device_context().eigen_device(); + auto *place = ctx.template device_context().eigen_device(); moment_out.device(*place) = moment + grad * grad; Eigen::DSizes m_dsize(moment_out_tensor->numel()); if (platform::is_cpu_place(ctx.GetPlace())) { - auto* lr = learning_rate->data(); + auto *lr = learning_rate->data(); param_out.device(*place) = param - lr[0] * grad / (moment_out.sqrt() + epsilon); } else { @@ -66,10 +73,10 @@ class AdagradOpKernel : public framework::OpKernel { lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); } } else if (grad_var->IsType()) { - auto* param_tensor = ctx.Input("Param"); + auto *param_tensor = ctx.Input("Param"); PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor); - auto* moment_tensor = ctx.Input("Moment"); + auto *moment_tensor = ctx.Input("Moment"); PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor); SparseAdagradFunctor functor; diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h index 4cb1f3a80e..3455d1ee54 100644 --- a/paddle/fluid/operators/adam_op.h +++ b/paddle/fluid/operators/adam_op.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/math/algorithm.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/for_range.h" @@ -199,23 +200,9 @@ struct SparseAdamFunctor { row_numel_(row_numel), row_count_(row_count) {} - inline HOSTDEVICE int64_t BinarySearchInRows(int64_t row) const { - int64_t beg = 0, end = row_count_ - 1; - while (beg <= end) { - auto mid = ((beg + end) >> 1); - if (rows_[mid] == row) - return mid; - else if (rows_[mid] < row) - beg = mid + 1; - else - end = mid - 1; - } - return -1; - } - inline HOSTDEVICE void operator()(size_t i) const { - int64_t row = i / row_numel_; - auto row_idx = BinarySearchInRows(row); + auto row_idx = + math::BinarySearch(rows_, row_count_, i / row_numel_); T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0; // The following code is the same as dense @@ -244,6 +231,12 @@ template class AdamOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + using paddle::framework::LoDTensor; using paddle::operators::detail::Ref; diff --git a/paddle/fluid/operators/adamax_op.cc b/paddle/fluid/operators/adamax_op.cc index 32062574bc..d4aa4d338a 100644 --- a/paddle/fluid/operators/adamax_op.cc +++ b/paddle/fluid/operators/adamax_op.cc @@ -35,6 +35,16 @@ class AdamaxOp : public framework::OperatorWithKernel { "Input(LearningRate) of AdamaxOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), "Input(Beta1Pow) of AdamaxOp should not be null."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Param").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be 
LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Grad").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front()); PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(ParamOut) of AdamaxOp should not be null."); diff --git a/paddle/fluid/operators/adamax_op.h b/paddle/fluid/operators/adamax_op.h index de644676fd..7137fbd965 100644 --- a/paddle/fluid/operators/adamax_op.h +++ b/paddle/fluid/operators/adamax_op.h @@ -23,6 +23,17 @@ template class AdamaxOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + const auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE(grad_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Grad").front(), grad_var->Type().name()); + auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); auto inf_norm_out_tensor = ctx.Output("InfNormOut"); diff --git a/paddle/fluid/operators/affine_channel_op.cc b/paddle/fluid/operators/affine_channel_op.cc new file mode 100644 index 0000000000..8944a74967 --- /dev/null +++ b/paddle/fluid/operators/affine_channel_op.cc @@ -0,0 +1,255 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class AffineChannelOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor) Feature map input can be a 4D tensor with order NCHW " + "or NHWC. It also can be a 2D tensor and C is the second " + "dimension."); + AddInput("Scale", + "(Tensor) 1D input of shape (C), the c-th element " + "is the scale factor of the affine transformation " + "for the c-th channel of the input."); + AddInput("Bias", + "(Tensor) 1D input of shape (C), the c-th element " + "is the bias of the affine transformation for the " + "c-th channel of the input."); + AddAttr( + "data_layout", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + AddOutput("Out", "(Tensor) A tensor of the same shape and order with X."); + AddComment(R"DOC( + +Applies a separate affine transformation to each channel of the input. Useful +for replacing spatial batch norm with its equivalent fixed transformation. +The input also can be 2D tensor and applies a affine transformation in second +dimension.
+ +$$Out = Scale*X + Bias$$ + +)DOC"); + } +}; + +class AffineChannelOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of AffineChannelOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Scale"), + "Input(Scale) of AffineChannelOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of AffineChannelOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of AffineChannelOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", "Out"); + } +}; + +class AffineChannelOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + if (ctx->HasOutput(framework::GradVarName("X"))) { + PADDLE_ENFORCE(ctx->HasInput("Scale"), + "Input(Scale) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); + } + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + // Scale@GRAD and Bias@GRAD must exist at the same time. 
+ PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), + "Output(Scale@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + ctx->SetOutputDim(framework::GradVarName("Scale"), + ctx->GetInputDim("Scale")); + ctx->SetOutputDim(framework::GradVarName("Bias"), + ctx->GetInputDim("Scale")); + } + } +}; + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +template +class AffineChannelKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + + auto* y = ctx.Output("Out"); + y->mutable_data(ctx.GetPlace()); + + const framework::DataLayout layout = + framework::StringToDataLayout(ctx.Attr("data_layout")); + + auto dims = x->dims(); + int N = dims[0]; + int C = layout == framework::DataLayout::kNCHW ? 
dims[1] + : dims[dims.size() - 1]; + int HxW = x->numel() / N / C; + + auto* scale_d = scale->data(); + auto* bias_d = bias->data(); + ConstEigenVectorArrayMap a_e(scale_d, C); + ConstEigenVectorArrayMap b_e(bias_d, C); + + auto* x_d = x->data(); + auto* y_d = y->data(); + if (layout == framework::DataLayout::kNCHW) { + int stride = C * HxW; + for (int i = 0; i < N; i++) { + ConstEigenArrayMap x_e(x_d, HxW, C); + EigenArrayMap y_e(y_d, HxW, C); + y_e = (x_e.rowwise() * a_e.transpose()).rowwise() + b_e.transpose(); + x_d += stride; + y_d += stride; + } + } else { + int num = N * HxW; + ConstEigenArrayMap x_e(x_d, C, num); + EigenArrayMap y_e(y_d, C, num); + y_e = (x_e.colwise() * a_e).colwise() + b_e; + } + } +}; + +template +class AffineChannelGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); + auto* dy = ctx.Input(framework::GradVarName("Out")); + + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dscale = + ctx.Output(framework::GradVarName("Scale")); + auto* dbias = ctx.Output(framework::GradVarName("Bias")); + + const framework::DataLayout layout = + framework::StringToDataLayout(ctx.Attr("data_layout")); + + auto dims = x->dims(); + int N = dims[0]; + int C = layout == framework::DataLayout::kNCHW ? dims[1] + : dims[dims.size() - 1]; + int HxW = x->numel() / N / C; + + auto* x_d = x->data(); + auto* dy_d = dy->data(); + auto* scale_d = scale->data(); + ConstEigenVectorArrayMap scale_e(scale_d, C); + + T* dx_d = dx ? dx->mutable_data(ctx.GetPlace()) : nullptr; + T* dscale_d = dscale ? dscale->mutable_data(ctx.GetPlace()) : nullptr; + T* dbias_d = dbias ? 
dbias->mutable_data(ctx.GetPlace()) : nullptr; + EigenVectorArrayMap dscale_e(dscale_d, C); + EigenVectorArrayMap dbias_e(dbias_d, C); + + if (layout == framework::DataLayout::kNCHW) { + // compute dx + int stride = C * HxW; + if (dx) { + for (int i = 0; i < N; i++) { + ConstEigenArrayMap dy_e(dy_d, HxW, C); + EigenArrayMap dx_e(dx_d, HxW, C); + dx_e = dy_e.rowwise() * scale_e.transpose(); + dy_d += stride; + dx_d += stride; + } + } + // compute dscale and dbias + if (dscale && dbias) { + dy_d = dy->data(); + for (int i = 0; i < N; i++) { + ConstEigenArrayMap x_e(x_d, HxW, C); + ConstEigenArrayMap dy_e(dy_d, HxW, C); + if (i == 0) { + dscale_e = (x_e * dy_e).colwise().sum(); + } else { + dscale_e += (x_e * dy_e).colwise().sum(); + } + if (i == 0) { + dbias_e = dy_e.colwise().sum(); + } else { + dbias_e += dy_e.colwise().sum(); + } + x_d += stride; + dy_d += stride; + } + } + } else { + int num = N * HxW; + ConstEigenArrayMap dy_e(dy_d, C, num); + // compute dx + if (dx) { + EigenArrayMap dx_e(dx_d, C, num); + dx_e = dy_e.colwise() * scale_e; + } + // compute dscale and dbias + if (dscale && dbias) { + ConstEigenArrayMap x_e(x_d, C, num); + dscale_e = (x_e * dy_e).rowwise().sum(); + dbias_e = dy_e.rowwise().sum(); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR(affine_channel, ops::AffineChannelOp, + ops::AffineChannelOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(affine_channel_grad, ops::AffineChannelOpGrad); + +REGISTER_OP_CPU_KERNEL(affine_channel, ops::AffineChannelKernel, + ops::AffineChannelKernel); +REGISTER_OP_CPU_KERNEL(affine_channel_grad, + ops::AffineChannelGradKernel, + ops::AffineChannelGradKernel); diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu new file mode 100644 index 0000000000..2bebdb345a --- /dev/null +++ 
b/paddle/fluid/operators/affine_channel_op.cu @@ -0,0 +1,187 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "cub/cub.cuh" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +template +__global__ void KeAffineChannelCUDA(const T* x, const T* scale, const T* bias, + const int C, const int HxW, const int num, + T* y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < num; i += stride) { + const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; + if (HasBias) { + y[i] = scale[c] * x[i] + bias[c]; + } else { + y[i] = scale[c] * x[i]; + } + } +} + +template +class AffineChannelCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + + auto* y = ctx.Output("Out"); + y->mutable_data(ctx.GetPlace()); + + const framework::DataLayout layout = + framework::StringToDataLayout(ctx.Attr("data_layout")); + auto& dev_ctx = ctx.template device_context(); + + auto dims = x->dims(); + const int num = x->numel(); + int N = dims[0]; + int C = layout == framework::DataLayout::kNCHW ?
dims[1] + : dims[dims.size() - 1]; + int HxW = num / N / C; + + const T* x_d = x->data(); + const T* scale_d = scale->data(); + const T* bias_d = bias->data(); + T* y_d = y->data(); + + int block = 1024; + int grid = (num + block - 1) / block; + if (layout == framework::DataLayout::kNCHW) { + KeAffineChannelCUDA<<>>( + x_d, scale_d, bias_d, C, HxW, num, y_d); + } else { + KeAffineChannelCUDA<<>>( + x_d, scale_d, bias_d, C, HxW, num, y_d); + } + } +}; + +template +__global__ void AffineChannelScaleBiasGradientCUDAKernel( + const T* dy, const T* x, const int N, const int C, const int HxW, T* dscale, + T* dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + T ds_sum = 0; + T db_sum = 0; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + ds_sum += dy[index] * x[index]; + db_sum += dy[index]; + } + ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); + db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + if (threadIdx.x == 0) { + dscale[i] = ds_sum; + dbias[i] = db_sum; + } + __syncthreads(); + } +} + +template +class AffineChannelGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* dy = ctx.Input(framework::GradVarName("Out")); + + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dscale = + ctx.Output(framework::GradVarName("Scale")); + auto* dbias = ctx.Output(framework::GradVarName("Bias")); + + const framework::DataLayout layout = + framework::StringToDataLayout(ctx.Attr("data_layout")); + auto& dev_ctx = ctx.template device_context(); + + auto dims = x->dims(); + const int num = x->numel(); + int N = dims[0]; + int C = layout == framework::DataLayout::kNCHW ? dims[1] + : dims[dims.size() - 1]; + int HxW = num / N / C; + + const T* x_d = x->data(); + const T* dy_d = dy->data(); + const T* s_d = scale->data(); + + T* dx_d = dx ? dx->mutable_data(ctx.GetPlace()) : nullptr; + T* ds_d = dscale ? dscale->mutable_data(ctx.GetPlace()) : nullptr; + T* db_d = dbias ? 
dbias->mutable_data(ctx.GetPlace()) : nullptr; + + const int block = 1024; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + int grid1 = (num + block - 1) / block; + int grid2 = std::min(C, max_blocks); + if (layout == framework::DataLayout::kNCHW) { + if (dx) { + KeAffineChannelCUDA<<>>( + dy_d, s_d, nullptr, C, HxW, num, dx_d); + } + if (dscale && dbias) { + AffineChannelScaleBiasGradientCUDAKernel< + T, block, framework::DataLayout::kNCHW><<>>( + dy_d, x_d, N, C, HxW, ds_d, db_d); + } + } else { + if (dx) { + KeAffineChannelCUDA<<>>( + dy_d, s_d, nullptr, C, HxW, num, dx_d); + } + if (dscale && dbias) { + AffineChannelScaleBiasGradientCUDAKernel< + T, block, framework::DataLayout::kNHWC><<>>( + dy_d, x_d, N, C, HxW, ds_d, db_d); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CUDA = paddle::platform::CUDADeviceContext; + +REGISTER_OP_CUDA_KERNEL(affine_channel, + ops::AffineChannelCUDAKernel, + ops::AffineChannelCUDAKernel); +REGISTER_OP_CUDA_KERNEL(affine_channel_grad, + ops::AffineChannelGradCUDAKernel, + ops::AffineChannelGradCUDAKernel); diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index b8b8b2290a..6257e04b01 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include +#include #include #include "paddle/fluid/framework/lod_rank_table.h" diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 5af0eb0b2a..855c4d7067 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -16,12 +16,15 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/transform.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +using SelectedRows = framework::SelectedRows; template using EigenVector = framework::EigenVector; @@ -31,9 +34,40 @@ class ClipByNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto max_norm = context.Attr("max_norm"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - output->mutable_data(context.GetPlace()); + auto in_var = context.InputVar("X"); + + Tensor* output = nullptr; + const Tensor* input = nullptr; + if (in_var->IsType()) { + input = context.Input("X"); + + output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + } else if (in_var->IsType()) { + auto* x = context.Input("X"); + + // merge ids in selected rows first + math::scatter::MergeAdd merge_func; + SelectedRows* merged_input = + const_cast(context.scope()) + .Var() + ->GetMutable(); + merge_func(context.template device_context(), *x, + merged_input); + input = &(merged_input->value()); + + SelectedRows* output_selected_rows = context.Output("Out"); + output_selected_rows->set_rows(merged_input->rows()); + output_selected_rows->set_height(merged_input->height()); + output = output_selected_rows->mutable_value(); + output->Resize(merged_input->value().dims()); + 
output->mutable_data(context.GetPlace()); + } else { + PADDLE_THROW("Unexpected branch, input variable type is %s", + in_var->Type().name()); + } + + PADDLE_ENFORCE_NOT_NULL(input); auto x = EigenVector::Flatten(*input); auto out = EigenVector::Flatten(*output); diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h index b2c6495c44..bd474be0fa 100644 --- a/paddle/fluid/operators/concat_op.h +++ b/paddle/fluid/operators/concat_op.h @@ -17,7 +17,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/concat.h" +#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" namespace paddle { @@ -89,29 +89,17 @@ class ConcatGradKernel : public framework::OpKernel { outputs.push_back(nullptr); } } + auto& dev_ctx = ctx.template device_context(); // Sometimes direct copies will be faster, this maybe need deeply analysis. if (axis == 0 && outs.size() < 10) { - size_t input_offset = 0; - const auto in_stride = framework::stride_numel(out_grad->dims()); - - for (size_t i = 0; i < outs.size(); ++i) { - auto out_stride = framework::stride_numel(ins[i]->dims()); - auto* out = outputs[i]; - if (out != nullptr) { - StridedNumelCopyWithAxis( - ctx.device_context(), axis, out->data(), out_stride, - out_grad->data() + input_offset, in_stride, out_stride[axis]); - } - input_offset += out_stride[axis]; - } + std::vector ref_shape; + ref_shape.insert(ref_shape.begin(), ins.begin(), ins.end()); + StridedMemcpyWithAxis0(dev_ctx, *out_grad, ref_shape, &outputs); } else { - auto& dev_ctx = ctx.template device_context(); - paddle::operators::math::ConcatGradFunctor - concat_grad_functor; - concat_grad_functor(dev_ctx, *out_grad, - ctx.MultiInput("X"), - static_cast(axis), &outputs); + math::SplitFunctor split_functor; + split_functor(dev_ctx, *out_grad, ctx.MultiInput("X"), + static_cast(axis), &outputs); } } }; diff --git 
a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index eae6596828..521f423fb0 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -300,10 +300,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); bool fuse_relu = ctx.Attr("fuse_relu"); - bool fuse_eltwise = ctx.Attr("fuse_eltwise"); + bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); int groups = ctx.Attr("groups"); - // TODO: add support for dilation + // TODO(tpatejko): add support for dilation PADDLE_ENFORCE( dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, "dilation in convolution is not implemented yet"); @@ -369,11 +369,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { bias_tz, platform::MKLDNNGetDataType(), memory::format::x); conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine, - fuse_relu, fuse_eltwise); + fuse_relu, fuse_residual_conn); } else { conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, - mkldnn_engine, fuse_relu, fuse_eltwise); + mkldnn_engine, fuse_relu, fuse_residual_conn); } // Save conv_pd/src_memory/weights_memory for backward pass dev_ctx.SetBlob(key_conv_pd, conv_pd); @@ -386,8 +386,26 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto user_weights_memory_p = handler.AcquireWeightsMemory( user_weights_md, to_void_cast(filter_data)); - T* output_data = - output->mutable_data(ctx.GetPlace(), handler.GetDstMemorySize()); + T* output_data = nullptr; + + if (fuse_residual_conn) { + auto residual_param = ctx.Input("ResidualData"); + auto residual_param_data = residual_param->data(); + + PADDLE_ENFORCE( + residual_param_data != nullptr, + "Provide data if you want MKLDNN conv+elementwise_add fusion"); + PADDLE_ENFORCE_EQ(output->dims(), 
residual_param->dims(), + "Output and elementwise parameter need to have the " + "same dimension sizes"); + + output->ShareDataWith(*residual_param); + output_data = output->mutable_data(ctx.GetPlace()); + } else { + output_data = + output->mutable_data(ctx.GetPlace(), handler.GetDstMemorySize()); + } + // create reorder primitive if the input format is not the preferred one auto src_memory_p = handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); @@ -424,14 +442,15 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { private: mkldnn::primitive_attr CreatePostOps(bool fuse_relu, - bool fuse_eltwise) const { + bool fuse_residual_conn) const { mkldnn::primitive_attr conv_attr; mkldnn::post_ops post_operations; // Fusion with Elementwise layer relies on adding a sum post-operation with - // the scale parameter. It is assumed that when fuse_eltwise is true, the - // Output tensor contains the data coming from residual connection. The - // result of this post_op is: Output = scale * Output + Conv_Out. - if (fuse_eltwise) { + // the scale parameter. It is assumed that when fuse_residual_connection is + // true, the output tensor contains the data coming from residual + // connection. The result of this post_op is: + // Output = scale * Output + Conv_Out. + if (fuse_residual_conn) { post_operations.append_sum(1.0f); } // Fusion with ReLU layer is executed through the PostOps feature. 
Create a @@ -452,7 +471,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const memory::desc& dst, const std::vector& strides, const std::vector& paddings, const mkldnn::engine& engine, const bool fuse_relu, - const bool fuse_eltwise) const { + const bool fuse_residual_conn) const { memory::dims stride_dims = {strides[0], strides[1]}; memory::dims padding_dims = {paddings[0], paddings[1]}; @@ -461,7 +480,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { dst, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise); + mkldnn::primitive_attr conv_attr = + CreatePostOps(fuse_relu, fuse_residual_conn); auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( conv_desc, conv_attr, engine); @@ -476,7 +496,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const std::vector& strides, const std::vector& paddings, const mkldnn::engine& engine, const bool fuse_relu, - const bool fuse_eltwise) const { + const bool fuse_residual_conn) const { memory::dims stride_dims = {strides[0], strides[1]}; memory::dims padding_dims = {paddings[0], paddings[1]}; @@ -485,7 +505,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { bias, dst, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise); + mkldnn::primitive_attr conv_attr = + CreatePostOps(fuse_relu, fuse_residual_conn); auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( conv_desc, conv_attr, engine); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 8f84bf71a7..8f2561fcc3 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -132,6 +132,11 @@ void Conv2DOpMaker::Make() { "(Tensor) The output tensor of convolution operator. 
" "The format of output tensor is also NCHW.") .Reuse("Input"); + AddInput("ResidualData", + "(Tensor) Tensor with residual data " + "to which convolution output will be added." + "Used with fuse_residual_connection fusion.") + .AsDispensable(); AddAttr>("strides", "(vector default:{1, 1}), the " "strides(h_stride, w_stride) of " @@ -164,10 +169,10 @@ void Conv2DOpMaker::Make() { .SetDefault(false); AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); - AddAttr("fuse_eltwise", + AddAttr("fuse_residual_connection", "(bool, default false) Only used in mkldnn kernel. Used " - "whenever convolution output is connected via skip connection " - "to a previous layer.") + "whenever convolution output is as an input to residual " + "connection.") .SetDefault(false); AddAttr( "data_format", diff --git a/paddle/fluid/operators/decayed_adagrad_op.cc b/paddle/fluid/operators/decayed_adagrad_op.cc index c0f2b49a04..d73ae9e272 100644 --- a/paddle/fluid/operators/decayed_adagrad_op.cc +++ b/paddle/fluid/operators/decayed_adagrad_op.cc @@ -32,6 +32,16 @@ class DecayedAdagradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE( ctx->HasInput("LearningRate"), "Input(LearningRate) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Param").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Grad").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front()); PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(ParamOut) of DecayedAdagradOp should not be null."); diff --git a/paddle/fluid/operators/decayed_adagrad_op.h b/paddle/fluid/operators/decayed_adagrad_op.h index a46af078e0..5df43d33ef 100644 
--- a/paddle/fluid/operators/decayed_adagrad_op.h +++ b/paddle/fluid/operators/decayed_adagrad_op.h @@ -23,6 +23,17 @@ template class DecayedAdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + const auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE(grad_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Grad").front(), grad_var->Type().name()); + auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index aa8ed502fc..d5eec148f9 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -20,7 +20,7 @@ detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu) detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op.cu) detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) -detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc) +detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) detection_library(anchor_generator_op SRCS anchor_generator_op.cc anchor_generator_op.cu) diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index d7a53f1bef..339e63a2be 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/math/concat.h" +#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 818d58ea9e..a69d9c9a52 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/math_function.h" @@ -25,21 +27,17 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -struct AppendProposalsFunctor { - LoDTensor *out_; - int64_t offset_; - Tensor *to_add_; +static const double kBBoxClipDefault = std::log(1000.0 / 16.0); - AppendProposalsFunctor(LoDTensor *out, int64_t offset, Tensor *to_add) - : out_(out), offset_(offset), to_add_(to_add) {} - - template - void apply() const { - auto *out_data = out_->data(); - auto *to_add_data = to_add_->data(); - memcpy(out_data + offset_, to_add_data, to_add_->numel() * sizeof(T)); - } -}; +static void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) { + auto *out_data = dst->data(); + auto *to_add_data = src.data(); + size_t size_of_t = framework::SizeOfType(src.type()); + offset *= size_of_t; + std::memcpy( + reinterpret_cast(reinterpret_cast(out_data) + offset), + to_add_data, 
src.numel() * size_of_t); +} class GenerateProposalsOp : public framework::OperatorWithKernel { public: @@ -75,8 +73,9 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { }; template -void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors, - Tensor *bbox_deltas, Tensor *variances, Tensor *proposals) { +static inline void BoxCoder(const platform::DeviceContext &ctx, + Tensor *all_anchors, Tensor *bbox_deltas, + Tensor *variances, Tensor *proposals) { T *proposals_data = proposals->mutable_data(ctx.GetPlace()); int64_t row = all_anchors->dims()[0]; @@ -108,11 +107,11 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors, anchor_center_y; bbox_width = std::exp(std::min(variances_data[i * len + 2] * bbox_deltas_data[i * len + 2], - std::log(1000.0 / 16.0))) * + kBBoxClipDefault)) * anchor_width; bbox_height = std::exp(std::min(variances_data[i * len + 3] * bbox_deltas_data[i * len + 3], - std::log(1000.0 / 16.0))) * + kBBoxClipDefault)) * anchor_height; } else { bbox_center_x = @@ -120,10 +119,10 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors, bbox_center_y = bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], - std::log(1000.0 / 16.0))) * + kBBoxClipDefault)) * anchor_width; bbox_height = std::exp(std::min(bbox_deltas_data[i * len + 3], - std::log(1000.0 / 16.0))) * + kBBoxClipDefault)) * anchor_height; } @@ -136,30 +135,32 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors, } template -void ClipTiledBoxes(const platform::DeviceContext &ctx, const Tensor &im_info, - Tensor *boxes) { +static inline void ClipTiledBoxes(const platform::DeviceContext &ctx, + const Tensor &im_info, Tensor *boxes) { T *boxes_data = boxes->mutable_data(ctx.GetPlace()); const T *im_info_data = im_info.data(); + T zero(0); for (int64_t i = 0; i < boxes->numel(); ++i) { if (i % 4 == 0) { boxes_data[i] = - 
std::max(std::min(boxes_data[i], im_info_data[1] - 1), 0.0f); + std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero); } else if (i % 4 == 1) { boxes_data[i] = - std::max(std::min(boxes_data[i], im_info_data[0] - 1), 0.0f); + std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero); } else if (i % 4 == 2) { boxes_data[i] = - std::max(std::min(boxes_data[i], im_info_data[1] - 1), 0.0f); + std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero); } else { boxes_data[i] = - std::max(std::min(boxes_data[i], im_info_data[0] - 1), 0.0f); + std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero); } } } template -void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes, - float min_size, const Tensor &im_info, Tensor *keep) { +static inline void FilterBoxes(const platform::DeviceContext &ctx, + Tensor *boxes, float min_size, + const Tensor &im_info, Tensor *keep) { const T *im_info_data = im_info.data(); T *boxes_data = boxes->mutable_data(ctx.GetPlace()); T im_scale = im_info_data[2]; @@ -185,24 +186,24 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes, keep->Resize({keep_len}); } -bool SortScorePairDescend(const std::pair &pair1, - const std::pair &pair2) { - return pair1.first > pair2.first; -} - template -void GetMaxScoreIndex(const std::vector &scores, - std::vector> *sorted_indices) { +static inline std::vector> GetSortedScoreIndex( + const std::vector &scores) { + std::vector> sorted_indices; + sorted_indices.reserve(scores.size()); for (size_t i = 0; i < scores.size(); ++i) { - sorted_indices->push_back(std::make_pair(scores[i], i)); + sorted_indices.emplace_back(scores[i], i); } // Sort the score pair according to the scores in descending order - std::stable_sort(sorted_indices->begin(), sorted_indices->end(), - SortScorePairDescend); + std::stable_sort(sorted_indices.begin(), sorted_indices.end(), + [](const std::pair &a, const std::pair &b) { + return a.first < b.first; + }); + return sorted_indices; } template 
-T BBoxArea(const T *box, const bool normalized) { +static inline T BBoxArea(const T *box, bool normalized) { if (box[2] < box[0] || box[3] < box[1]) { // If coordinate values are is invalid // (e.g. xmax < xmin or ymax < ymin), return 0. @@ -220,7 +221,7 @@ T BBoxArea(const T *box, const bool normalized) { } template -T JaccardOverlap(const T *box1, const T *box2, const bool normalized) { +static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) { if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || box2[3] < box1[1]) { return static_cast(0.); @@ -229,8 +230,8 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) { const T inter_ymin = std::max(box1[1], box2[1]); const T inter_xmax = std::min(box1[2], box2[2]); const T inter_ymax = std::min(box1[3], box2[3]); - const T inter_w = std::max(0.0f, inter_xmax - inter_xmin + 1); - const T inter_h = std::max(0.0f, inter_ymax - inter_ymin + 1); + const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1); + const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1); const T inter_area = inter_w * inter_h; const T bbox1_area = BBoxArea(box1, normalized); const T bbox2_area = BBoxArea(box2, normalized); @@ -238,9 +239,21 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) { } } +template +static inline Tensor VectorToTensor(const std::vector &selected_indices, + int selected_num) { + Tensor keep_nms; + keep_nms.Resize({selected_num}); + auto *keep_data = keep_nms.mutable_data(platform::CPUPlace()); + for (int i = 0; i < selected_num; ++i) { + keep_data[i] = selected_indices[i]; + } + return keep_nms; +} + template -Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores, - const T nms_threshold, const float eta) { +static inline Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, + Tensor *scores, T nms_threshold, float eta) { PADDLE_ENFORCE_NOT_NULL(bbox); int64_t num_boxes = bbox->dims()[0]; // 4: [xmin ymin xmax 
ymax] @@ -248,20 +261,18 @@ Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores, std::vector scores_data(num_boxes); std::copy_n(scores->data(), num_boxes, scores_data.begin()); - std::vector> sorted_indices; - GetMaxScoreIndex(scores_data, &sorted_indices); + std::vector> sorted_indices = + GetSortedScoreIndex(scores_data); std::vector selected_indices; int selected_num = 0; T adaptive_threshold = nms_threshold; const T *bbox_data = bbox->data(); - bool flag; while (sorted_indices.size() != 0) { - int idx = sorted_indices.front().second; - flag = true; - for (size_t k = 0; k < selected_indices.size(); ++k) { + int idx = sorted_indices.back().second; + bool flag = true; + for (int kept_idx : selected_indices) { if (flag) { - const int kept_idx = selected_indices[k]; T overlap = JaccardOverlap(bbox_data + idx * box_size, bbox_data + kept_idx * box_size, false); flag = (overlap <= adaptive_threshold); @@ -271,32 +282,29 @@ Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores, } if (flag) { selected_indices.push_back(idx); - selected_num++; + ++selected_num; } - sorted_indices.erase(sorted_indices.begin()); + sorted_indices.erase(sorted_indices.end()); if (flag && eta < 1 && adaptive_threshold > 0.5) { adaptive_threshold *= eta; } } - Tensor keep_nms; - keep_nms.Resize({selected_num}); - int *keep_data = keep_nms.mutable_data(ctx.GetPlace()); - for (int i = 0; i < selected_num; ++i) { - keep_data[i] = selected_indices[i]; - } - - return keep_nms; + return VectorToTensor(selected_indices, selected_num); } -template +template class GenerateProposalsKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *scores = context.Input("Scores"); auto *bbox_deltas = context.Input("BboxDeltas"); auto *im_info = context.Input("ImInfo"); - auto *anchors = context.Input("Anchors"); - auto *variances = context.Input("Variances"); + auto anchors = 
detail::Ref(context.Input("Anchors"), + "Cannot find input Anchors(%s) in scope", + context.Inputs("Anchors")[0]); + auto variances = detail::Ref(context.Input("Variances"), + "Cannot find input Variances(%s) in scope", + context.Inputs("Variances")[0]); auto *rpn_rois = context.Output("RpnRois"); auto *rpn_roi_probs = context.Output("RpnRoiProbs"); @@ -307,15 +315,16 @@ class GenerateProposalsKernel : public framework::OpKernel { float min_size = context.Attr("min_size"); float eta = context.Attr("eta"); - auto &dev_ctx = context.template device_context(); + auto &dev_ctx = + context.template device_context(); - auto scores_dim = scores->dims(); + auto &scores_dim = scores->dims(); int64_t num = scores_dim[0]; int64_t c_score = scores_dim[1]; int64_t h_score = scores_dim[2]; int64_t w_score = scores_dim[3]; - auto bbox_dim = bbox_deltas->dims(); + auto &bbox_dim = bbox_deltas->dims(); int64_t c_bbox = bbox_dim[1]; int64_t h_bbox = bbox_dim[2]; int64_t w_bbox = bbox_dim[3]; @@ -330,17 +339,17 @@ class GenerateProposalsKernel : public framework::OpKernel { scores_swap.mutable_data({num, h_score, w_score, c_score}, dev_ctx.GetPlace()); - math::Transpose trans; + math::Transpose trans; std::vector axis = {0, 2, 3, 1}; trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); trans(dev_ctx, *scores, &scores_swap, axis); framework::LoD lod; - std::vector lod0(1, 0); - Tensor *anchor = const_cast(anchors); - anchor->Resize({anchors->numel() / 4, 4}); - Tensor *var = const_cast(variances); - var->Resize({var->numel() / 4, 4}); + lod.resize(1); + auto &lod0 = lod[0]; + lod0.push_back(0); + anchors.Resize({anchors.numel() / 4, 4}); + variances.Resize({variances.numel() / 4, 4}); int64_t num_proposals = 0; for (int64_t i = 0; i < num; ++i) { @@ -352,24 +361,17 @@ class GenerateProposalsKernel : public framework::OpKernel { scores_slice.Resize({h_score * w_score * c_score, 1}); std::pair tensor_pair = - ProposalForOneImage(dev_ctx, im_info_slice, *anchor, *var, + 
ProposalForOneImage(dev_ctx, im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta); - Tensor proposals = tensor_pair.first; - Tensor scores = tensor_pair.second; - - framework::VisitDataType( - framework::ToDataType(rpn_rois->type()), - AppendProposalsFunctor(rpn_rois, 4 * num_proposals, &proposals)); - framework::VisitDataType( - framework::ToDataType(rpn_roi_probs->type()), - AppendProposalsFunctor(rpn_roi_probs, num_proposals, &scores)); + Tensor &proposals = tensor_pair.first; + Tensor &scores = tensor_pair.second; + AppendProposals(rpn_rois, 4 * num_proposals, proposals); + AppendProposals(rpn_roi_probs, num_proposals, scores); num_proposals += proposals.dims()[0]; - lod0.emplace_back(num_proposals); + lod0.push_back(num_proposals); } - - lod.emplace_back(lod0); rpn_rois->set_lod(lod); rpn_roi_probs->set_lod(lod); rpn_rois->Resize({num_proposals, 4}); @@ -377,7 +379,7 @@ class GenerateProposalsKernel : public framework::OpKernel { } std::pair ProposalForOneImage( - const DeviceContext &ctx, const Tensor &im_info_slice, + const platform::CPUDeviceContext &ctx, const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances, const Tensor &bbox_deltas_slice, // [M, 4] const Tensor &scores_slice, // [N, 1] @@ -392,10 +394,9 @@ class GenerateProposalsKernel : public framework::OpKernel { for (int i = 0; i < scores_slice.numel(); ++i) { index[i] = i; } - std::function compare = - [scores_data](const int64_t &i, const int64_t &j) { - return scores_data[i] > scores_data[j]; - }; + auto compare = [scores_data](const int64_t &i, const int64_t &j) { + return scores_data[i] > scores_data[j]; + }; if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) { std::sort(index, index + scores_slice.numel(), compare); @@ -452,33 +453,45 @@ class GenerateProposalsKernel : public framework::OpKernel { class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker { public: void 
Make() override { - AddInput("Scores", "The scores of anchors should be foreground."); - AddInput("BboxDeltas", "bbox_deltas."); - AddInput("ImInfo", "Information for image reshape."); - AddInput("Anchors", "All anchors."); - AddInput("Variances", " variances"); - - AddOutput("RpnRois", "Anchors."); - AddOutput("RpnRoiProbs", "Anchors."); - AddAttr("pre_nms_topN", "pre_nms_topN"); - AddAttr("post_nms_topN", "post_nms_topN"); - AddAttr("nms_thresh", "nms_thres"); - AddAttr("min_size", "min size"); + AddInput("Scores", + "(Tensor) The scores from conv is in shape (N, A, H, W), " + "N is batch size, A is number of anchors, " + "H and W are height and width of the feature map"); + AddInput("BboxDeltas", + "(Tensor) Bounding box deltas from conv is in " + "shape (N, 4*A, H, W)."); + AddInput("ImInfo", + "(Tensor) Information for image reshape is in shape (N, 3), " + "in format (height, width, scale)"); + AddInput("Anchors", + "(Tensor) Bounding box anchors from anchor_generator_op " + "is in shape (A, H, W, 4)."); + AddInput("Variances", + "(Tensor) Bounding box variances with same shape as `Anchors`."); + + AddOutput("RpnRois", + "(LoDTensor), Output proposals with shape (rois_num, 4)."); + AddOutput("RpnRoiProbs", + "(LoDTensor) Scores of proposals with shape (rois_num, 1)."); + AddAttr("pre_nms_topN", + "Number of top scoring RPN proposals to keep before " + "applying NMS."); + AddAttr("post_nms_topN", + "Number of top scoring RPN proposals to keep after " + "applying NMS"); + AddAttr("nms_thresh", "NMS threshold used on RPN proposals."); + AddAttr("min_size", + "Proposal height and width both need to be greater " + "than this min_size."); AddAttr("eta", "The parameter for adaptive NMS."); AddComment(R"DOC( -Generate Proposals OP - -This operator proposes rois according to each box with their probability to be a foreground object and -the box can be calculated by anchors. Bbox_deltais and scores are the output of RPN. 
Final proposals -could be used to train detection net. - -Scores is the probability for each box to be an object. In format of (N, A, H, W) where N is batch size, A is number -of anchors, H and W are height and width of the feature map. -BboxDeltas is the differece between predicted box locatoin and anchor location. In format of (N, 4*A, H, W) +This operator Generate bounding box proposals for Faster RCNN. +The propoasls are generated for a list of images based on image +score 'Scores', bounding box regression result 'BboxDeltas' as +well as predefined bounding box shapes 'anchors'. Greedy +non-maximum suppression is applied to generate the final bounding +boxes. -For generating proposals, this operator transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) and - calculate box locations as proposals candidates. Then clip boxes to image and remove predicted boxes with small area. -Finally, apply nms to get final proposals as output. )DOC"); } }; @@ -490,6 +503,5 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(generate_proposals, ops::GenerateProposalsOp, ops::GenerateProposalsOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - generate_proposals, - ops::GenerateProposalsKernel); +REGISTER_OP_CPU_KERNEL(generate_proposals, ops::GenerateProposalsKernel, + ops::GenerateProposalsKernel); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 6146ff509d..91213b3c4d 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -16,10 +16,13 @@ limitations under the License. 
*/ #include #include #include "cub/cub.cuh" +#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/for_range.h" namespace paddle { namespace operators { @@ -36,36 +39,38 @@ namespace { int const kThreadsPerBlock = sizeof(uint64_t) * 8; -template -__global__ void RangeInitKernel(const T start, const T delta, const int size, - T *out) { - CUDA_1D_KERNEL_LOOP(i, size) { out[i] = start + i * delta; } -} +static const double kBBoxClipDefault = std::log(1000.0 / 16.0); + +struct RangeInitFunctor { + int start_; + int delta_; + int *out_; + __device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; } +}; template -void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value, - Tensor *value_out, Tensor *index_out) { - int num = value.numel(); +static void SortDescending(const platform::CUDADeviceContext &ctx, + const Tensor &value, Tensor *value_out, + Tensor *index_out) { + int num = static_cast(value.numel()); Tensor index_in_t; int *idx_in = index_in_t.mutable_data({num}, ctx.GetPlace()); - int block = 512; - auto stream = ctx.stream(); - RangeInitKernel<<>>(0, 1, num, idx_in); + platform::ForRange for_range(ctx, num); + for_range(RangeInitFunctor{0, 1, idx_in}); + int *idx_out = index_out->mutable_data({num}, ctx.GetPlace()); const T *keys_in = value.data(); T *keys_out = value_out->mutable_data({num}, ctx.GetPlace()); // Determine temporary device storage requirements - void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairsDescending( - d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, - num); + nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num); // Allocate temporary storage auto place = 
boost::get(ctx.GetPlace()); - d_temp_storage = memory::Alloc(place, temp_storage_bytes); + void *d_temp_storage = memory::Alloc(place, temp_storage_bytes); // Run sorting operation cub::DeviceRadixSort::SortPairsDescending( @@ -76,22 +81,27 @@ void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value, } template -__device__ __forceinline__ T Min(T x, T y) { - return x < y ? x : y; -} - -template -__device__ __forceinline__ T Max(T x, T y) { - return x > y ? x : y; -} - -template -__global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas, - const T *var, const int *index, - const T *im_info, const int num, - T *proposals) { - T kBBoxClipDefault = log(1000.0 / 16.0); - CUDA_1D_KERNEL_LOOP(i, num) { +struct BoxDecodeAndClipFunctor { + const T *anchor; + const T *deltas; + const T *var; + const int *index; + const T *im_info; + + T *proposals; + + BoxDecodeAndClipFunctor(const T *anchor, const T *deltas, const T *var, + const int *index, const T *im_info, T *proposals) + : anchor(anchor), + deltas(deltas), + var(var), + index(index), + im_info(im_info), + proposals(proposals) {} + + T bbox_clip_default{static_cast(kBBoxClipDefault)}; + + __device__ void operator()(size_t i) { int k = index[i] * 4; T axmin = anchor[k]; T aymin = anchor[k + 1]; @@ -108,17 +118,17 @@ __global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas, T dxmax = deltas[k + 2]; T dymax = deltas[k + 3]; - T d_cx = 0., d_cy = 0., d_w = 0., d_h = 0.; + T d_cx, d_cy, d_w, d_h; if (var) { d_cx = cx + dxmin * w * var[k]; d_cy = cy + dymin * h * var[k + 1]; - d_w = exp(Min(dxmax * var[k + 2], kBBoxClipDefault)) * w; - d_h = exp(Min(dymax * var[k + 3], kBBoxClipDefault)) * h; + d_w = exp(Min(dxmax * var[k + 2], bbox_clip_default)) * w; + d_h = exp(Min(dymax * var[k + 3], bbox_clip_default)) * h; } else { d_cx = cx + dxmin * w; d_cy = cy + dymin * h; - d_w = exp(Min(dxmax, kBBoxClipDefault)) * w; - d_h = exp(Min(dymax, kBBoxClipDefault)) * h; + d_w = 
exp(Min(dxmax, bbox_clip_default)) * w; + d_h = exp(Min(dymax, bbox_clip_default)) * h; } T oxmin = d_cx - d_w * 0.5; @@ -126,17 +136,21 @@ __global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas, T oxmax = d_cx + d_w * 0.5 - 1.; T oymax = d_cy + d_h * 0.5 - 1.; - proposals[i * 4] = Max(Min(oxmin, im_info[1] - 1.), 0.); - proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - 1.), 0.); - proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - 1.), 0.); - proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - 1.), 0.); + proposals[i * 4] = Max(Min(oxmin, im_info[1] - 1.), 0.); + proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - 1.), 0.); + proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - 1.), 0.); + proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - 1.), 0.); } -} + + __device__ __forceinline__ T Min(T a, T b) const { return a > b ? b : a; } + + __device__ __forceinline__ T Max(T a, T b) const { return a > b ? a : b; } +}; template -__global__ void FilterBBoxes(const T *bboxes, const T *im_info, - const T min_size, const int num, int *keep_num, - int *keep) { +static __global__ void FilterBBoxes(const T *bboxes, const T *im_info, + const T min_size, const int num, + int *keep_num, int *keep) { T im_h = im_info[0]; T im_w = im_info[1]; T im_scale = im_info[2]; @@ -181,7 +195,7 @@ __global__ void FilterBBoxes(const T *bboxes, const T *im_info, } } -__device__ inline float IoU(const float *a, const float *b) { +static __device__ inline float IoU(const float *a, const float *b) { float left = max(a[0], b[0]), right = min(a[2], b[2]); float top = max(a[1], b[1]), bottom = min(a[3], b[3]); float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); @@ -191,8 +205,9 @@ __device__ inline float IoU(const float *a, const float *b) { return inter_s / (s_a + s_b - inter_s); } -__global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh, - const float *dev_boxes, uint64_t *dev_mask) { +static __global__ void NMSKernel(const int n_boxes, + 
const float nms_overlap_thresh, + const float *dev_boxes, uint64_t *dev_mask) { const int row_start = blockIdx.y; const int col_start = blockIdx.x; @@ -234,9 +249,9 @@ __global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh, } template -void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, - const Tensor &sorted_indices, const T nms_threshold, - Tensor *keep_out) { +static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, + const Tensor &sorted_indices, const T nms_threshold, + Tensor *keep_out) { int boxes_num = proposals.dims()[0]; PADDLE_ENFORCE_EQ(boxes_num, sorted_indices.dims()[0]); @@ -247,13 +262,10 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, const T *boxes = proposals.data(); auto place = boost::get(ctx.GetPlace()); - int size_bytes = boxes_num * col_blocks * sizeof(uint64_t); - uint64_t *d_mask = - reinterpret_cast(memory::Alloc(place, size_bytes)); - NMSKernel<<>>(boxes_num, nms_threshold, boxes, d_mask); - uint64_t *h_mask = reinterpret_cast( - memory::Alloc(platform::CPUPlace(), size_bytes)); - memory::Copy(platform::CPUPlace(), h_mask, place, d_mask, size_bytes, 0); + framework::Vector mask(boxes_num * col_blocks); + NMSKernel<<>>( + boxes_num, nms_threshold, boxes, + mask.CUDAMutableData(boost::get(ctx.GetPlace()))); std::vector remv(col_blocks); memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); @@ -267,7 +279,7 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, if (!(remv[nblock] & (1ULL << inblock))) { ++num_to_keep; keep_vec.push_back(i); - uint64_t *p = &h_mask[0] + i * col_blocks; + uint64_t *p = &mask[0] + i * col_blocks; for (int j = nblock; j < col_blocks; j++) { remv[j] |= p[j]; } @@ -276,12 +288,10 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, int *keep = keep_out->mutable_data({num_to_keep}, ctx.GetPlace()); memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(), sizeof(int) * 
num_to_keep, 0); - memory::Free(place, d_mask); - memory::Free(platform::CPUPlace(), h_mask); } template -std::pair ProposalForOneImage( +static std::pair ProposalForOneImage( const platform::CUDADeviceContext &ctx, const Tensor &im_info, const Tensor &anchors, const Tensor &variances, const Tensor &bbox_deltas, // [M, 4] @@ -300,18 +310,20 @@ std::pair ProposalForOneImage( // 2. box decode and clipping Tensor proposals; proposals.mutable_data({pre_nms_num, 4}, ctx.GetPlace()); - int block = 512; - auto stream = ctx.stream(); - BoxDecodeAndClipKernel<<>>( - anchors.data(), bbox_deltas.data(), variances.data(), - index_sort.data(), im_info.data(), pre_nms_num, - proposals.data()); + + { + platform::ForRange for_range(ctx, pre_nms_num); + for_range(BoxDecodeAndClipFunctor{ + anchors.data(), bbox_deltas.data(), variances.data(), + index_sort.data(), im_info.data(), proposals.data()}); + } // 3. filter Tensor keep_index, keep_num_t; keep_index.mutable_data({pre_nms_num}, ctx.GetPlace()); keep_num_t.mutable_data({1}, ctx.GetPlace()); min_size = std::max(min_size, 1.0f); + auto stream = ctx.stream(); FilterBBoxes<<<1, 512, 0, stream>>>( proposals.data(), im_info.data(), min_size, pre_nms_num, keep_num_t.data(), keep_index.data()); @@ -355,8 +367,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { auto *scores = context.Input("Scores"); auto *bbox_deltas = context.Input("BboxDeltas"); auto *im_info = context.Input("ImInfo"); - auto *anchors = context.Input("Anchors"); - auto *variances = context.Input("Variances"); + auto anchors = detail::Ref(context.Input("Anchors"), + "Cannot find input Anchors(%s) in scope", + context.Inputs("Anchors")[0]); + auto variances = detail::Ref(context.Input("Variances"), + "Cannot find input Variances(%s) in scope", + context.Inputs("Variances")[0]); auto *rpn_rois = context.Output("RpnRois"); auto *rpn_roi_probs = context.Output("RpnRoiProbs"); @@ -392,10 +408,8 @@ class CUDAGenerateProposalsKernel : public 
framework::OpKernel { trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); trans(dev_ctx, *scores, &scores_swap, axis); - Tensor *anchor = const_cast(anchors); - anchor->Resize({anchors->numel() / 4, 4}); - Tensor *var = const_cast(variances); - var->Resize({var->numel() / 4, 4}); + anchors.Resize({anchors.numel() / 4, 4}); + variances.Resize({variances.numel() / 4, 4}); rpn_rois->mutable_data({bbox_deltas->numel() / 4, 4}, context.GetPlace()); @@ -417,12 +431,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { scores_slice.Resize({h_score * w_score * c_score, 1}); std::pair box_score_pair = - ProposalForOneImage(dev_ctx, im_info_slice, *anchor, *var, + ProposalForOneImage(dev_ctx, im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta); - Tensor proposals = box_score_pair.first; - Tensor scores = box_score_pair.second; + Tensor &proposals = box_score_pair.first; + Tensor &scores = box_score_pair.second; memory::Copy(place, rpn_rois_data + num_proposals * 4, place, proposals.data(), sizeof(T) * proposals.numel(), 0); diff --git a/paddle/fluid/operators/detection/gpc.cc b/paddle/fluid/operators/detection/gpc.cc new file mode 100644 index 0000000000..7c0823c048 --- /dev/null +++ b/paddle/fluid/operators/detection/gpc.cc @@ -0,0 +1,2201 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/** + * @file src/gpc.cpp + * @author huhan02(com@baidu.com) + * @date 2015/12/18 14:17:30 + * @brief + * + * @modified by sunyipeng + * @email sunyipeng@baidu.com + * @date 2018/6/12 + **/ + +#include "paddle/fluid/operators/detection/gpc.h" + +namespace gpc { + +typedef struct lmt_shape { /* Local minima table */ + double y; /* Y coordinate at local minimum */ + edge_node *first_bound; /* Pointer to bound list */ + struct lmt_shape *next; /* Pointer to next local minimum */ +} lmt_node; + +typedef struct sbt_t_shape { /* Scanbeam tree */ + double y; /* Scanbeam node y value */ + struct sbt_t_shape *less; /* Pointer to nodes with lower y */ + struct sbt_t_shape *more; /* Pointer to nodes with higher y */ +} sb_tree; + +typedef struct it_shape { /* Intersection table */ + edge_node *ie[2]; /* Intersecting edge (bundle) pair */ + gpc_vertex point; /* Point of intersection */ + struct it_shape *next; /* The next intersection table node */ +} it_node; + +typedef struct st_shape { /* Sorted edge table */ + edge_node *edge; /* Pointer to AET edge */ + double xb; /* Scanbeam bottom x coordinate */ + double xt; /* Scanbeam top x coordinate */ + double dx; /* Change in x for a unit y increase */ + struct st_shape *prev; /* Previous edge in sorted list */ +} st_node; + +typedef struct bbox_shape { /* Contour axis-aligned bounding box */ + double xmin; /* Minimum x coordinate */ + double ymin; /* Minimum y coordinate */ + double xmax; /* Maximum x coordinate */ + double ymax; /* Maximum y coordinate */ +} bbox; + +/* +=========================================================================== + Global Data +=========================================================================== +*/ + +/* Horizontal edge state transitions within scanbeam boundary */ +const h_state next_h_state[3][6] = { + /* ABOVE BELOW CROSS */ + /* L R L R L R */ + /* NH */ + {BH, TH, TH, BH, NH, NH}, + /* BH */ + {NH, NH, NH, NH, TH, TH}, + /* TH */ + {NH, NH, NH, NH, BH, BH}}; + +/* 
+=========================================================================== + Private Functions +=========================================================================== +*/ + +static void reset_it(it_node **it) { + it_node *itn; + + while (*it) { + itn = (*it)->next; + gpc_free(*it); + *it = itn; + } +} + +static void reset_lmt(lmt_node **lmt) { + lmt_node *lmtn; + + while (*lmt) { + lmtn = (*lmt)->next; + gpc_free(*lmt); + *lmt = lmtn; + } +} + +static void insert_bound(edge_node **b, edge_node *e) { + edge_node *existing_bound = NULL; + + if (!*b) { + /* Link node e to the tail of the list */ + *b = e; + } else { + /* Do primary sort on the x field */ + if (e[0].bot.x < (*b)[0].bot.x) { + /* Insert a new node mid-list */ + existing_bound = *b; + *b = e; + (*b)->next_bound = existing_bound; + } else { + if (e[0].bot.x == (*b)[0].bot.x) { + /* Do secondary sort on the dx field */ + if (e[0].dx < (*b)[0].dx) { + /* Insert a new node mid-list */ + existing_bound = *b; + *b = e; + (*b)->next_bound = existing_bound; + } else { + /* Head further down the list */ + insert_bound(&((*b)->next_bound), e); + } + } else { + /* Head further down the list */ + insert_bound(&((*b)->next_bound), e); + } + } + } +} + +static edge_node **bound_list(lmt_node **lmt, double y) { + lmt_node *existing_node; + + if (!*lmt) { + /* Add node onto the tail end of the LMT */ + gpc_malloc(*lmt, sizeof(lmt_node), + const_cast("LMT insertion")); + (*lmt)->y = y; + (*lmt)->first_bound = NULL; + (*lmt)->next = NULL; + return &((*lmt)->first_bound); + } else if (y < (*lmt)->y) { + /* Insert a new LMT node before the current node */ + existing_node = *lmt; + gpc_malloc(*lmt, sizeof(lmt_node), + const_cast("LMT insertion")); + (*lmt)->y = y; + (*lmt)->first_bound = NULL; + (*lmt)->next = existing_node; + return &((*lmt)->first_bound); + } else { + if (y > (*lmt)->y) { + /* Head further up the LMT */ + return bound_list(&((*lmt)->next), y); + } else { + /* Use this existing LMT node */ + return 
&((*lmt)->first_bound); + } + } +} + +static void add_to_sbtree(int *entries, sb_tree **sbtree, double y) { + if (!*sbtree) { + /* Add a new tree node here */ + gpc_malloc(*sbtree, sizeof(sb_tree), + const_cast("scanbeam tree insertion")); + (*sbtree)->y = y; + (*sbtree)->less = NULL; + (*sbtree)->more = NULL; + (*entries)++; + } else { + if ((*sbtree)->y > y) { + /* Head into the 'less' sub-tree */ + add_to_sbtree(entries, &((*sbtree)->less), y); + } else { + if ((*sbtree)->y < y) { + /* Head into the 'more' sub-tree */ + add_to_sbtree(entries, &((*sbtree)->more), y); + } + } + } +} + +static void build_sbt(int *entries, double *sbt, sb_tree *sbtree) { + if (sbtree->less) { + build_sbt(entries, sbt, sbtree->less); + } + sbt[*entries] = sbtree->y; + (*entries)++; + if (sbtree->more) { + build_sbt(entries, sbt, sbtree->more); + } +} + +static void free_sbtree(sb_tree **sbtree) { + if (*sbtree) { + free_sbtree(&((*sbtree)->less)); + free_sbtree(&((*sbtree)->more)); + gpc_free(*sbtree); + } +} + +static int count_optimal_vertices(gpc_vertex_list c) { + int result = 0; + int i = 0; + + /* Ignore non-contributing contours */ + if (c.num_vertices > 0) { + for (i = 0; i < c.num_vertices; i++) { + /* Ignore superfluous vertices embedded in horizontal edges */ + if (gpc_optimal(c.vertex, i, c.num_vertices)) { + result++; + } + } + } + return result; +} + +static edge_node *build_lmt(lmt_node **lmt, sb_tree **sbtree, int *sbt_entries, + gpc_polygon *p, int type, gpc_op op) { + int c = 0; + int i = 0; + int min = 0; + int max = 0; + int num_edges = 0; + int v = 0; + int num_vertices = 0; + int total_vertices = 0; + int e_index = 0; + edge_node *e = NULL; + edge_node *edge_table = NULL; + + for (c = 0; c < p->num_contours; c++) { + total_vertices += count_optimal_vertices(p->contour[c]); + } + + /* Create the entire input polygon edge table in one go */ + gpc_malloc(edge_table, total_vertices * sizeof(edge_node), + const_cast("edge table creation")); + + for (c = 0; c < 
p->num_contours; c++) { + if (p->contour[c].num_vertices < 0) { + /* Ignore the non-contributing contour and repair the vertex count */ + p->contour[c].num_vertices = -p->contour[c].num_vertices; + } else { + /* Perform contour optimisation */ + num_vertices = 0; + for (i = 0; i < p->contour[c].num_vertices; i++) { + if (gpc_optimal(p->contour[c].vertex, i, p->contour[c].num_vertices)) { + edge_table[num_vertices].vertex.x = p->contour[c].vertex[i].x; + edge_table[num_vertices].vertex.y = p->contour[c].vertex[i].y; + + /* Record vertex in the scanbeam table */ + add_to_sbtree(sbt_entries, sbtree, edge_table[num_vertices].vertex.y); + + num_vertices++; + } + } + + /* Do the contour forward pass */ + for (min = 0; min < num_vertices; min++) { + /* If a forward local minimum... */ + if (gpc_fwd_min(edge_table, min, num_vertices)) { + /* Search for the next local maximum... */ + num_edges = 1; + max = gpc_next_index(min, num_vertices); + while (gpc_not_fmax(edge_table, max, num_vertices)) { + num_edges++; + max = gpc_next_index(max, num_vertices); + } + + /* Build the next edge list */ + e = &edge_table[e_index]; + e_index += num_edges; + v = min; + e[0].bstate[BELOW] = UNBUNDLED; + e[0].bundle[BELOW][CLIP] = 0; + e[0].bundle[BELOW][SUBJ] = 0; + for (i = 0; i < num_edges; i++) { + e[i].xb = edge_table[v].vertex.x; + e[i].bot.x = edge_table[v].vertex.x; + e[i].bot.y = edge_table[v].vertex.y; + + v = gpc_next_index(v, num_vertices); + + e[i].top.x = edge_table[v].vertex.x; + e[i].top.y = edge_table[v].vertex.y; + e[i].dx = (edge_table[v].vertex.x - e[i].bot.x) / + (e[i].top.y - e[i].bot.y); + e[i].type = type; + e[i].outp[ABOVE] = NULL; + e[i].outp[BELOW] = NULL; + e[i].next = NULL; + e[i].prev = NULL; + e[i].succ = + ((num_edges > 1) && (i < (num_edges - 1))) ? &(e[i + 1]) : NULL; + e[i].pred = ((num_edges > 1) && (i > 0)) ? &(e[i - 1]) : NULL; + e[i].next_bound = NULL; + e[i].bside[CLIP] = (op == GPC_DIFF) ? 
RIGHT : LEFT; + e[i].bside[SUBJ] = LEFT; + } + insert_bound(bound_list(lmt, edge_table[min].vertex.y), e); + } + } + + /* Do the contour reverse pass */ + for (min = 0; min < num_vertices; min++) { + /* If a reverse local minimum... */ + if (gpc_rev_min(edge_table, min, num_vertices)) { + /* Search for the previous local maximum... */ + num_edges = 1; + max = gpc_prev_index(min, num_vertices); + while (gpc_not_rmax(edge_table, max, num_vertices)) { + num_edges++; + max = gpc_prev_index(max, num_vertices); + } + + /* Build the previous edge list */ + e = &edge_table[e_index]; + e_index += num_edges; + v = min; + e[0].bstate[BELOW] = UNBUNDLED; + e[0].bundle[BELOW][CLIP] = 0; + e[0].bundle[BELOW][SUBJ] = 0; + for (i = 0; i < num_edges; i++) { + e[i].xb = edge_table[v].vertex.x; + e[i].bot.x = edge_table[v].vertex.x; + e[i].bot.y = edge_table[v].vertex.y; + + v = gpc_prev_index(v, num_vertices); + + e[i].top.x = edge_table[v].vertex.x; + e[i].top.y = edge_table[v].vertex.y; + e[i].dx = (edge_table[v].vertex.x - e[i].bot.x) / + (e[i].top.y - e[i].bot.y); + e[i].type = type; + e[i].outp[ABOVE] = NULL; + e[i].outp[BELOW] = NULL; + e[i].next = NULL; + e[i].prev = NULL; + e[i].succ = + ((num_edges > 1) && (i < (num_edges - 1))) ? &(e[i + 1]) : NULL; + e[i].pred = ((num_edges > 1) && (i > 0)) ? &(e[i - 1]) : NULL; + e[i].next_bound = NULL; + e[i].bside[CLIP] = (op == GPC_DIFF) ? 
RIGHT : LEFT;
+            e[i].bside[SUBJ] = LEFT;
+          }
+          insert_bound(bound_list(lmt, edge_table[min].vertex.y), e);
+        }
+      }
+    }
+  }
+  return edge_table;
+}  // NOLINT
+
+/*
+ * Links edge into the doubly linked list that starts at *aet.
+ *
+ * The edge is placed in front of the first node that has a larger xb, or the
+ * same xb and a larger dx; if there is no such node it is appended at the
+ * tail.
+ *
+ * aet:  address of the link to start from; updated when edge becomes the
+ *       first node reachable through it.
+ * edge: node to insert; its prev and next fields are overwritten.
+ * prev: node that precedes *aet (NULL when *aet is the head of the list).
+ *
+ * The scan is a loop rather than a recursive call, so its stack use does not
+ * grow with the length of the list.
+ */
+static void add_edge_to_aet(edge_node **aet, edge_node *edge, edge_node *prev) {
+  edge_node **link = aet;
+  edge_node *before = prev;
+  edge_node *current = NULL;
+
+  while (*link) {
+    current = *link;
+
+    /* Primary sort on the xb field, secondary sort on the dx field */
+    if ((edge->xb < current->xb) ||
+        ((edge->xb == current->xb) && (edge->dx < current->dx))) {
+      /* Insert edge here (before the AET edge) */
+      edge->prev = before;
+      edge->next = current;
+      current->prev = edge;
+      *link = edge;
+      return;
+    }
+
+    /* Head further into the AET */
+    before = current;
+    link = &(current->next);
+  }
+
+  /* Append edge onto the tail end of the AET */
+  *link = edge;
+  edge->prev = before;
+  edge->next = NULL;
+}
+
+/*
+ * Adds a node holding (edge0, edge1, x, y) to the singly linked list at *it.
+ *
+ * The new node goes in front of the first existing node whose point.y is
+ * greater than y, or at the tail when no node qualifies.
+ *
+ * it:           address of the list head link; updated when the new node
+ *               becomes the first one.
+ * edge0, edge1: stored in ie[0] and ie[1] of the new node.
+ * x, y:         stored in point.x and point.y of the new node.
+ */
+static void add_intersection(it_node **it, edge_node *edge0, edge_node *edge1,
+                             double x, double y) {
+  it_node **slot = it;
+  it_node *following = NULL;
+
+  /* Head further down the list while the current node is not above y */
+  while (*slot && !((*slot)->point.y > y)) {
+    slot = &((*slot)->next);
+  }
+
+  /* Whatever occupied the slot (NULL at the tail) follows the new node */
+  following = *slot;
+  gpc_malloc(*slot, sizeof(it_node), const_cast<char *>("IT insertion"));
+  (*slot)->ie[0] = edge0;
+  (*slot)->ie[1] = edge1;
+  (*slot)->point.x = x;
+  (*slot)->point.y = y;
+  (*slot)->next = following;
+}
+
+static void add_st_edge(st_node **st, it_node **it, edge_node *edge,
+                        double dy) {
+  st_node *existing_node;
+  double den = 0.0;
+  double r = 0.0;
+  double x = 0.0;
+  double y = 0.0;
+
+  if (!*st) {
+    /* Append edge onto the tail end of the ST */
+    gpc_malloc(*st, sizeof(st_node),
+               const_cast<char *>("ST insertion"));
+    (*st)->edge = edge;
+    (*st)->xb = edge->xb;
+    (*st)->xt = edge->xt;
+    (*st)->dx = edge->dx;
+    (*st)->prev = NULL;
+  } else {
+    den = ((*st)->xt - (*st)->xb) - (edge->xt - edge->xb);
+
+    /* If new edge and ST edge don't cross */
+    if ((edge->xt >= (*st)->xt) || (edge->dx == (*st)->dx) ||
+        (fabs(den) <= DBL_EPSILON)) {
+      /* No intersection - insert edge here (before the ST edge) */
+      existing_node = *st;
+      gpc_malloc(*st, sizeof(st_node),
+                 const_cast<char *>("ST insertion"));
+      (*st)->edge = edge;
+      (*st)->xb = edge->xb;
+      (*st)->xt = edge->xt;
+      (*st)->dx = edge->dx;
+      (*st)->prev = existing_node;
+    } else {
+      /* Compute intersection between new edge and ST edge */
+      r = (edge->xb - (*st)->xb) / den;
+      x = (*st)->xb + r * ((*st)->xt - (*st)->xb);
+      y = r * dy;
+
+      /* Insert the edge pointers and the intersection point in the IT */
+      add_intersection(it, (*st)->edge, edge, x, y);
+
+      /* Head further into the ST */
+      add_st_edge(&((*st)->prev), it, edge, dy);
+    }
+  }
+}
+
+/*
+ * Rebuilds the intersection table *it for one scanbeam.
+ *
+ * it:  table to rebuild; it is cleared with reset_it() before anything is
+ *      added.
+ * aet: head of the edge list to scan (followed through the next field).
+ * dy:  forwarded unchanged to add_st_edge().
+ *
+ * Only edges that are a bundle head above the boundary, or that carry a CLIP
+ * or SUBJ bundle above it, are passed to add_st_edge(). The temporary sorted
+ * edge list built along the way is released before returning.
+ */
+static void build_intersection_table(it_node **it, edge_node *aet, double dy) {
+  st_node *sorted = NULL;
+  st_node *older = NULL;
+  edge_node *current = NULL;
+
+  /* Build intersection table for the current scanbeam */
+  reset_it(it);
+
+  /* Process each AET edge */
+  current = aet;
+  while (current) {
+    if ((current->bstate[ABOVE] == BUNDLE_HEAD) ||
+        current->bundle[ABOVE][CLIP] || current->bundle[ABOVE][SUBJ]) {
+      add_st_edge(&sorted, it, current, dy);
+    }
+    current = current->next;
+  }
+
+  /* Free the sorted edge table, following its prev links */
+  while (sorted) {
+    older = sorted->prev;
+    gpc_free(sorted);
+    sorted = older;
+  }
+}
+
+static int count_contours(polygon_node *polygon) {
+  int nc = 0;
+  int nv = 0;
+  vertex_node *v = NULL;
+  vertex_node *nextv = NULL;
+
+  for (nc = 0; polygon; polygon = polygon->next) {
+    if (polygon->active) {
+      /* Count the vertices in the current contour */
+      nv = 0;
+      for (v = polygon->proxy->v[LEFT]; v; v = v->next) {
+        nv++;
+      }
+
+      /*
Record valid vertex counts in the active field */
+      if (nv > 2) {
+        polygon->active = nv;
+        nc++;
+      } else {
+        /* Invalid contour: just free the heap */
+        for (v = polygon->proxy->v[LEFT]; v; v = nextv) {
+          nextv = v->next;
+          gpc_free(v);
+        }
+        polygon->active = 0;
+      }
+    }
+  }
+  return nc;
+}
+
+/*
+ * Prepends the vertex (x, y) to the vertex list of p's proxy polygon.
+ *
+ * p is dereferenced without a check, so it must not be NULL. v[RIGHT] of the
+ * proxy is not touched.
+ */
+static void add_left(polygon_node *p, double x, double y) {
+  polygon_node *owner = p->proxy;
+  vertex_node *fresh = NULL;
+
+  /* Create a new vertex node and set its fields */
+  gpc_malloc(fresh, sizeof(vertex_node),
+             const_cast<char *>("vertex node creation"));
+  fresh->x = x;
+  fresh->y = y;
+
+  /* Add the vertex to the left end of the polygon's vertex list */
+  fresh->next = owner->v[LEFT];
+
+  /* Update proxy->v[LEFT] to point to the new vertex */
+  owner->v[LEFT] = fresh;
+}
+
+/*
+ * Joins the vertex list of p's proxy onto the left end of the list of q's
+ * proxy and marks q's proxy as a hole.
+ *
+ * p, q: polygon nodes whose proxies are merged; when both already share one
+ *       proxy only the hole flag is written.
+ * list: list of polygon nodes to scan; every node whose proxy was p's proxy
+ *       is deactivated (active = 0) and redirected to q's proxy.
+ */
+static void merge_left(polygon_node *p, polygon_node *q, polygon_node *list) {
+  polygon_node *absorbed = NULL;
+  polygon_node *survivor = NULL;
+  polygon_node *node = NULL;
+
+  /* Label contour as a hole */
+  q->proxy->hole = 1;
+
+  absorbed = p->proxy;
+  survivor = q->proxy;
+  if (absorbed == survivor) {
+    return;
+  }
+
+  /* Assign p's vertex list to the left end of q's list */
+  absorbed->v[RIGHT]->next = survivor->v[LEFT];
+  survivor->v[LEFT] = absorbed->v[LEFT];
+
+  /* Redirect any p->proxy references to q->proxy */
+  for (node = list; node; node = node->next) {
+    if (node->proxy == absorbed) {
+      node->active = 0;
+      node->proxy = survivor;
+    }
+  }
+}
+
+/*
+ * Appends the vertex (x, y) to the vertex list of p's proxy polygon.
+ *
+ * p and the proxy's current v[RIGHT] vertex are dereferenced without a check,
+ * so the list must already hold at least one vertex.
+ */
+static void add_right(polygon_node *p, double x, double y) {
+  polygon_node *owner = p->proxy;
+  vertex_node *fresh = NULL;
+
+  /* Create a new vertex node and set its fields */
+  gpc_malloc(fresh, sizeof(vertex_node),
+             const_cast<char *>("vertex node creation"));
+  fresh->x = x;
+  fresh->y = y;
+  fresh->next = NULL;
+
+  /* Add the vertex to the right end of the polygon's vertex list */
+  owner->v[RIGHT]->next = fresh;
+
+  /* Update proxy->v[RIGHT] to point to the new vertex */
+  owner->v[RIGHT] = fresh;
+}
+
+static void merge_right(polygon_node *p, polygon_node *q, polygon_node *list) {
+  polygon_node *target = NULL;
+
+  /* Label contour as external */
+  q->proxy->hole = 0;
+
+  if (p->proxy != q->proxy) {
+    /* Assign p's vertex list to the right end of q's list */
+
q->proxy->v[RIGHT]->next = p->proxy->v[LEFT];
+    q->proxy->v[RIGHT] = p->proxy->v[RIGHT];
+
+    /* Redirect any p->proxy references to q->proxy */
+    for (target = p->proxy; list; list = list->next) {
+      if (list->proxy == target) {
+        list->active = 0;
+        list->proxy = q->proxy;
+      }
+    }
+  }
+}
+
+/*
+ * Creates a polygon node holding the single vertex (x, y) and pushes it onto
+ * the front of the list at *p.
+ *
+ * p:    address of the list head; it is updated to the new node.
+ * edge: its outp[ABOVE] entry is set to the new node.
+ *
+ * The new node is its own proxy, is marked active, and has both v[LEFT] and
+ * v[RIGHT] pointing at the one new vertex. Its hole field is not assigned
+ * here.
+ */
+static void add_local_min(polygon_node **p, edge_node *edge, double x,
+                          double y) {
+  polygon_node *previous_head = *p;
+  polygon_node *created = NULL;
+  vertex_node *first = NULL;
+
+  gpc_malloc(created, sizeof(polygon_node),
+             const_cast<char *>("polygon node creation"));
+
+  /* Create a new vertex node and set its fields */
+  gpc_malloc(first, sizeof(vertex_node),
+             const_cast<char *>("vertex node creation"));
+  first->x = x;
+  first->y = y;
+  first->next = NULL;
+
+  /* Initialise proxy to point to the new node itself */
+  created->proxy = created;
+  created->active = 1;
+  created->next = previous_head;
+
+  /* Make v[LEFT] and v[RIGHT] point to the new vertex */
+  created->v[LEFT] = first;
+  created->v[RIGHT] = first;
+
+  /* Publish the node as the new list head and assign it to the edge */
+  *p = created;
+  edge->outp[ABOVE] = created;
+}
+
+/*
+ * Returns the number of nodes in the list tn whose active field exceeds 2.
+ */
+static int count_tristrips(polygon_node *tn) {
+  int strips = 0;
+
+  while (tn) {
+    if (tn->active > 2) {
+      strips++;
+    }
+    tn = tn->next;
+  }
+  return strips;
+}
+
+/*
+ * Appends a vertex (x, y) to the tail of the singly linked list at *t.
+ *
+ * t: address of the list head link; it is written when the list is empty.
+ *
+ * The tail is found with a loop rather than a recursive call, so stack use
+ * does not grow with the length of the list.
+ */
+void add_vertex(vertex_node **t, double x, double y) {
+  vertex_node **tail = t;
+
+  /* Head further down the list until the empty link is reached */
+  while (*tail) {
+    tail = &((*tail)->next);
+  }
+
+  gpc_malloc(*tail, sizeof(vertex_node),
+             const_cast<char *>("tristrip vertex creation"));
+  (*tail)->x = x;
+  (*tail)->y = y;
+  (*tail)->next = NULL;
+}
+
+/*
+ * Appends the vertex (x, y) to list v[s] of the polygon node e->outp[p] and
+ * increments that node's active counter.
+ */
+void gpc_vertex_create(edge_node *e, int p, int s, double x, double y) {
+  polygon_node *strip = e->outp[p];
+
+  add_vertex(&(strip->v[s]), x, y);
+  strip->active++;
+}
+
+static void new_tristrip(polygon_node **tn, edge_node *edge, double x,
+                         double y) {
+  if (!(*tn)) {
+    gpc_malloc(*tn, sizeof(polygon_node),
+               const_cast<char *>("tristrip node creation"));
+    (*tn)->next = NULL;
+    (*tn)->v[LEFT] = NULL;
+    (*tn)->v[RIGHT] = NULL;
+    (*tn)->active = 1;
+    add_vertex(&((*tn)->v[LEFT]), x, y);
+    edge->outp[ABOVE] = *tn;
+  } else {
+    /* Head further down
the list */
+    new_tristrip(&((*tn)->next), edge, x, y);
+  }
+}
+
+/*
+ * Builds one axis-aligned bounding box per contour of p.
+ *
+ * Returns an array of p->num_contours boxes obtained from gpc_malloc(); the
+ * caller releases it (minimax_test() does so with gpc_free()). A contour
+ * whose vertex loop does not run keeps the initial extent
+ * (min = DBL_MAX, max = -DBL_MAX).
+ */
+static bbox *create_contour_bboxes(gpc_polygon *p) {
+  bbox *boxes = NULL;
+  bbox *extent = NULL;
+  int ci = 0;
+  int vi = 0;
+  double vx = 0.0;
+  double vy = 0.0;
+
+  gpc_malloc(boxes, p->num_contours * sizeof(bbox),
+             const_cast<char *>("Bounding box creation"));
+
+  /* Construct contour bounding boxes */
+  for (ci = 0; ci < p->num_contours; ci++) {
+    extent = &boxes[ci];
+
+    /* Initialise bounding box extent */
+    extent->xmin = DBL_MAX;
+    extent->ymin = DBL_MAX;
+    extent->xmax = -DBL_MAX;
+    extent->ymax = -DBL_MAX;
+
+    for (vi = 0; vi < p->contour[ci].num_vertices; vi++) {
+      vx = p->contour[ci].vertex[vi].x;
+      vy = p->contour[ci].vertex[vi].y;
+
+      /* Adjust bounding box */
+      if (vx < extent->xmin) {
+        extent->xmin = vx;
+      }
+      if (vy < extent->ymin) {
+        extent->ymin = vy;
+      }
+      if (vx > extent->xmax) {
+        extent->xmax = vx;
+      }
+      if (vy > extent->ymax) {
+        extent->ymax = vy;
+      }
+    }
+  }
+  return boxes;
+}
+
+static void minimax_test(gpc_polygon *subj, gpc_polygon *clip, gpc_op op) {
+  bbox *s_bbox;
+  bbox *c_bbox;
+  int s = 0;
+  int c = 0;
+  int *o_table = NULL;
+  int overlap = 0;
+
+  s_bbox = create_contour_bboxes(subj);
+  c_bbox = create_contour_bboxes(clip);
+
+  gpc_malloc(o_table,
+             subj->num_contours * clip->num_contours * sizeof(int),
+             const_cast<char *>("overlap table creation"));
+
+  /* Check all subject contour bounding boxes against clip boxes */
+  for (s = 0; s < subj->num_contours; s++) {
+    for (c = 0; c < clip->num_contours; c++) {
+      o_table[c * subj->num_contours + s] =
+          (!((s_bbox[s].xmax < c_bbox[c].xmin) ||
+             (s_bbox[s].xmin > c_bbox[c].xmax))) &&
+          (!((s_bbox[s].ymax < c_bbox[c].ymin) ||
+             (s_bbox[s].ymin > c_bbox[c].ymax)));
+    }
+  }
+
+  /* For each clip contour, search for any subject contour overlaps */
+  for (c = 0; c < clip->num_contours; c++) {
+    overlap = 0;
+    for (s = 0; (!overlap) && (s < subj->num_contours); s++) {
+      overlap = o_table[c * subj->num_contours + s];
+    }
+
+    if (!overlap) {
+      /* Flag
non contributing status by negating vertex count */
+      clip->contour[c].num_vertices = -clip->contour[c].num_vertices;
+    }
+  }
+
+  if (op == GPC_INT) {
+    /* For each subject contour, search for any clip contour overlaps */
+    for (s = 0; s < subj->num_contours; s++) {
+      overlap = 0;
+      for (c = 0; (!overlap) && (c < clip->num_contours); c++) {
+        overlap = o_table[c * subj->num_contours + s];
+      }
+
+      if (!overlap) {
+        /* Flag non contributing status by negating vertex count */
+        subj->contour[s].num_vertices = -subj->contour[s].num_vertices;
+      }
+    }
+  }
+
+  gpc_free(s_bbox);
+  gpc_free(c_bbox);
+  gpc_free(o_table);
+}
+
+/*
+===========================================================================
+                             Public Functions
+===========================================================================
+*/
+
+/*
+ * Releases the storage owned by polygon p: the vertex array of every contour,
+ * then the hole array, then the contour array. num_contours is reset to 0.
+ * The gpc_polygon object itself is not released.
+ */
+void gpc_free_polygon(gpc_polygon *p) {
+  int contour_index = 0;
+
+  while (contour_index < p->num_contours) {
+    gpc_free(p->contour[contour_index].vertex);
+    contour_index++;
+  }
+  gpc_free(p->hole);
+  gpc_free(p->contour);
+  p->num_contours = 0;
+}
+
+/*
+void gpc_read_polygon(FILE *fp, int read_hole_flags, gpc_polygon *p) {
+  int c = 0;
+  int v = 0;
+
+  fscanf(fp, "%d", &(p->num_contours));
+  gpc_malloc(p->hole, p->num_contours * sizeof(int),
+             (char *)"hole flag array creation");
+  gpc_malloc(p->contour,
+             p->num_contours * sizeof(gpc_vertex_list),
+             (char *)"contour creation");
+  for (c = 0; c < p->num_contours; c++) {
+    fscanf(fp, "%d", &(p->contour[c].num_vertices));
+
+    if (read_hole_flags) {
+      fscanf(fp, "%d", &(p->hole[c]));
+    } else {
+      p->hole[c] = 0;  // Assume all contours to be external
+    }
+
+    gpc_malloc(p->contour[c].vertex,
+               p->contour[c].num_vertices * sizeof(gpc_vertex),
+               (char *)"vertex creation");
+    for (v = 0; v < p->contour[c].num_vertices; v++) {
+      fscanf(fp, "%lf %lf", &(p->contour[c].vertex[v].x),
+             &(p->contour[c].vertex[v].y));
+    }
+  }
+}
+
+void gpc_write_polygon(FILE *fp, int write_hole_flags, gpc_polygon *p) {
+  int c = 0;
+  int v = 0;
+
+  fprintf(fp, "%d\n", p->num_contours);
+  for
(c = 0; c < p->num_contours; c++) {
+    fprintf(fp, "%d\n", p->contour[c].num_vertices);
+
+    if (write_hole_flags) {
+      fprintf(fp, "%d\n", p->hole[c]);
+    }
+
+    for (v = 0; v < p->contour[c].num_vertices; v++) {
+      fprintf(fp, "% .*lf % .*lf\n", DBL_DIG, p->contour[c].vertex[v].x,
+              DBL_DIG, p->contour[c].vertex[v].y);
+    }
+  }
+}
+*/
+
+/*
+ * Appends a copy of new_contour to polygon p.
+ *
+ * p:           polygon to extend; its hole and contour arrays are replaced by
+ *              arrays one element longer and the old arrays are released.
+ * new_contour: contour to add; its vertices are copied into fresh storage, so
+ *              the caller keeps ownership of new_contour.
+ * hole:        value stored in the hole array for the new contour.
+ *
+ * Existing contours are copied shallowly: the new contour array reuses their
+ * vertex pointers.
+ */
+void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) {
+  const int old_count = p->num_contours;
+  int *grown_holes = NULL;
+  gpc_vertex_list *grown_contours = NULL;
+  gpc_vertex_list *added = NULL;
+  int idx = 0;
+
+  /* Create an extended hole array */
+  gpc_malloc(grown_holes, (old_count + 1) * sizeof(int),
+             const_cast<char *>("contour hole addition"));
+
+  /* Create an extended contour array */
+  gpc_malloc(grown_contours, (old_count + 1) * sizeof(gpc_vertex_list),
+             const_cast<char *>("contour addition"));
+
+  /* Copy the old contour and hole data into the extended arrays */
+  for (idx = 0; idx < old_count; idx++) {
+    grown_holes[idx] = p->hole[idx];
+    grown_contours[idx] = p->contour[idx];
+  }
+
+  /* Copy the new contour and hole onto the end of the extended arrays */
+  grown_holes[old_count] = hole;
+  added = &grown_contours[old_count];
+  added->num_vertices = new_contour->num_vertices;
+  gpc_malloc(added->vertex, new_contour->num_vertices * sizeof(gpc_vertex),
+             const_cast<char *>("contour addition"));
+  for (idx = 0; idx < new_contour->num_vertices; idx++) {
+    added->vertex[idx] = new_contour->vertex[idx];
+  }
+
+  /* Dispose of the old contour */
+  gpc_free(p->contour);
+  gpc_free(p->hole);
+
+  /* Update the polygon information */
+  p->num_contours = old_count + 1;
+  p->hole = grown_holes;
+  p->contour = grown_contours;
+}
+
+// gpc_polygon_clip
+void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
+                      gpc_polygon *result) {
+  sb_tree *sbtree = NULL;
+  it_node *it = NULL;
+  it_node *intersect = NULL;
+  edge_node *edge = NULL;
+  edge_node *prev_edge = NULL;
+  edge_node *next_edge = NULL;
+  edge_node *succ_edge = NULL;
+  edge_node *e0 = NULL;
+  edge_node *e1
= NULL; + edge_node *aet = NULL; + edge_node *c_heap = NULL; + edge_node *s_heap = NULL; + lmt_node *lmt = NULL; + lmt_node *local_min = NULL; + polygon_node *out_poly = NULL; + polygon_node *p = NULL; + polygon_node *q = NULL; + polygon_node *poly = NULL; + polygon_node *npoly = NULL; + polygon_node *cf = NULL; + vertex_node *vtx = NULL; + vertex_node *nv = NULL; + h_state horiz[2]; + int in[2]; + int exists[2]; + int parity[2] = {LEFT, LEFT}; + int c = 0; + int v = 0; + int contributing = 0; + int search = 0; + int scanbeam = 0; + int sbt_entries = 0; + int vclass = 0; + int bl = 0; + int br = 0; + int tl = 0; + int tr = 0; + double *sbt = NULL; + double xb = 0.0; + double px = 0.0; + double yb = 0.0; + double yt = 0.0; + double dy = 0.0; + double ix = 0.0; + double iy = 0.0; + + /* Test for trivial NULL result cases */ + if (((subj->num_contours == 0) && (clip->num_contours == 0)) || + ((subj->num_contours == 0) && ((op == GPC_INT) || (op == GPC_DIFF))) || + ((clip->num_contours == 0) && (op == GPC_INT))) { + result->num_contours = 0; + result->hole = NULL; + result->contour = NULL; + return; + } + /* Identify potentialy contributing contours */ + if (((op == GPC_INT) || (op == GPC_DIFF)) && (subj->num_contours > 0) && + (clip->num_contours > 0)) { + minimax_test(subj, clip, op); + } + /* Build LMT */ + if (subj->num_contours > 0) { + s_heap = build_lmt(&lmt, &sbtree, &sbt_entries, subj, SUBJ, op); + } + if (clip->num_contours > 0) { + c_heap = build_lmt(&lmt, &sbtree, &sbt_entries, clip, CLIP, op); + } + /* Return a NULL result if no contours contribute */ + if (lmt == NULL) { + result->num_contours = 0; + result->hole = NULL; + result->contour = NULL; + reset_lmt(&lmt); + gpc_free(s_heap); + gpc_free(c_heap); + return; + } + + /* Build scanbeam table from scanbeam tree */ + gpc_malloc(sbt, sbt_entries * sizeof(double), + const_cast("sbt creation")); + build_sbt(&scanbeam, sbt, sbtree); + scanbeam = 0; + free_sbtree(&sbtree); + /* Allow pointer re-use without 
causing memory leak */ + if (subj == result) { + gpc_free_polygon(subj); + } + if (clip == result) { + gpc_free_polygon(clip); + } + /* Invert clip polygon for difference operation */ + if (op == GPC_DIFF) { + parity[CLIP] = RIGHT; + } + local_min = lmt; + + // Process each scanbeam + while (scanbeam < sbt_entries) { + /* Set yb and yt to the bottom and top of the scanbeam */ + yb = sbt[scanbeam++]; + if (scanbeam < sbt_entries) { + yt = sbt[scanbeam]; + dy = yt - yb; + } + /* === SCANBEAM BOUNDARY PROCESSING ================================ */ + /* If LMT node corresponding to yb exists */ + if (local_min) { + if (local_min->y == yb) { + /* Add edges starting at this local minimum to the AET */ + for (edge = local_min->first_bound; edge; edge = edge->next_bound) { + add_edge_to_aet(&aet, edge, NULL); + } + local_min = local_min->next; + } + } + /* Set dummy previous x value */ + px = -DBL_MAX; + /* Create bundles within AET */ + e0 = aet; + e1 = aet; + /* Set up bundle fields of first edge */ + aet->bundle[ABOVE][aet->type] = (aet->top.y != yb); + aet->bundle[ABOVE][!aet->type] = 0; + aet->bstate[ABOVE] = UNBUNDLED; + + for (next_edge = aet->next; next_edge; next_edge = next_edge->next) { + /* Set up bundle fields of next edge */ + next_edge->bundle[ABOVE][next_edge->type] = (next_edge->top.y != yb); + next_edge->bundle[ABOVE][!next_edge->type] = 0; + next_edge->bstate[ABOVE] = UNBUNDLED; + /* Bundle edges above the scanbeam boundary if they coincide */ + if (next_edge->bundle[ABOVE][next_edge->type]) { + if (gpc_eq(e0->xb, next_edge->xb) && gpc_eq(e0->dx, next_edge->dx) && + (e0->top.y != yb)) { + next_edge->bundle[ABOVE][next_edge->type] ^= + e0->bundle[ABOVE][next_edge->type]; + next_edge->bundle[ABOVE][!next_edge->type] = + e0->bundle[ABOVE][!next_edge->type]; + next_edge->bstate[ABOVE] = BUNDLE_HEAD; + e0->bundle[ABOVE][CLIP] = 0; + e0->bundle[ABOVE][SUBJ] = 0; + e0->bstate[ABOVE] = BUNDLE_TAIL; + } + e0 = next_edge; + } + } + horiz[CLIP] = NH; + horiz[SUBJ] 
= NH; + + // Process each edge at this scanbeam boundary + for (edge = aet; edge; edge = edge->next) { + exists[CLIP] = + edge->bundle[ABOVE][CLIP] + (edge->bundle[BELOW][CLIP] << 1); + exists[SUBJ] = + edge->bundle[ABOVE][SUBJ] + (edge->bundle[BELOW][SUBJ] << 1); + if (exists[CLIP] || exists[SUBJ]) { + /* Set bundle side */ + edge->bside[CLIP] = parity[CLIP]; + edge->bside[SUBJ] = parity[SUBJ]; + /* Determine contributing status and quadrant occupancies */ + switch (op) { + case GPC_DIFF: + case GPC_INT: + contributing = (exists[CLIP] && (parity[SUBJ] || horiz[SUBJ])) || + (exists[SUBJ] && (parity[CLIP] || horiz[CLIP])) || + (exists[CLIP] && exists[SUBJ] && + (parity[CLIP] == parity[SUBJ])); + br = (parity[CLIP]) && (parity[SUBJ]); + bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) && + (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); + tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) && + (parity[SUBJ] ^ (horiz[SUBJ] != NH)); + tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ + edge->bundle[BELOW][CLIP]) && + (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ + edge->bundle[BELOW][SUBJ]); + break; + case GPC_XOR: + contributing = exists[CLIP] || exists[SUBJ]; + br = (parity[CLIP]) ^ (parity[SUBJ]); + bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ^ + (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); + tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ^ + (parity[SUBJ] ^ (horiz[SUBJ] != NH)); + tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ + edge->bundle[BELOW][CLIP]) ^ + (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ + edge->bundle[BELOW][SUBJ]); + break; + case GPC_UNION: + contributing = (exists[CLIP] && (!parity[SUBJ] || horiz[SUBJ])) || + (exists[SUBJ] && (!parity[CLIP] || horiz[CLIP])) || + (exists[CLIP] && exists[SUBJ] && + (parity[CLIP] == parity[SUBJ])); + br = (parity[CLIP]) || (parity[SUBJ]); + bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) || + (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); + tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) || + (parity[SUBJ] ^ (horiz[SUBJ] != NH)); + tl = (parity[CLIP] ^ 
(horiz[CLIP] != NH) ^ + edge->bundle[BELOW][CLIP]) || + (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ + edge->bundle[BELOW][SUBJ]); + break; + } + // Update parity + parity[CLIP] ^= edge->bundle[ABOVE][CLIP]; + parity[SUBJ] ^= edge->bundle[ABOVE][SUBJ]; + /* Update horizontal state */ + if (exists[CLIP]) { + horiz[CLIP] = next_h_state[horiz[CLIP]] + [((exists[CLIP] - 1) << 1) + parity[CLIP]]; + } + if (exists[SUBJ]) { + horiz[SUBJ] = next_h_state[horiz[SUBJ]] + [((exists[SUBJ] - 1) << 1) + parity[SUBJ]]; + } + vclass = tr + (tl << 1) + (br << 2) + (bl << 3); + if (contributing) { + xb = edge->xb; + switch (vclass) { + case EMN: + case IMN: + add_local_min(&out_poly, edge, xb, yb); + px = xb; + cf = edge->outp[ABOVE]; + break; + case ERI: + if (xb != px) { + add_right(cf, xb, yb); + px = xb; + } + edge->outp[ABOVE] = cf; + cf = NULL; + break; + case ELI: + add_left(edge->outp[BELOW], xb, yb); + px = xb; + cf = edge->outp[BELOW]; + break; + case EMX: + if (xb != px) { + add_left(cf, xb, yb); + px = xb; + } + merge_right(cf, edge->outp[BELOW], out_poly); + cf = NULL; + break; + case ILI: + if (xb != px) { + add_left(cf, xb, yb); + px = xb; + } + edge->outp[ABOVE] = cf; + cf = NULL; + break; + case IRI: + add_right(edge->outp[BELOW], xb, yb); + px = xb; + cf = edge->outp[BELOW]; + edge->outp[BELOW] = NULL; + break; + case IMX: + if (xb != px) { + add_right(cf, xb, yb); + px = xb; + } + merge_left(cf, edge->outp[BELOW], out_poly); + cf = NULL; + edge->outp[BELOW] = NULL; + break; + case IMM: + if (xb != px) { + add_right(cf, xb, yb); + px = xb; + } + merge_left(cf, edge->outp[BELOW], out_poly); + edge->outp[BELOW] = NULL; + add_local_min(&out_poly, edge, xb, yb); + cf = edge->outp[ABOVE]; + break; + case EMM: + if (xb != px) { + add_left(cf, xb, yb); + px = xb; + } + merge_right(cf, edge->outp[BELOW], out_poly); + edge->outp[BELOW] = NULL; + add_local_min(&out_poly, edge, xb, yb); + cf = edge->outp[ABOVE]; + break; + case LED: + if (edge->bot.y == yb) { + 
add_left(edge->outp[BELOW], xb, yb); + } + edge->outp[ABOVE] = edge->outp[BELOW]; + px = xb; + break; + case RED: + if (edge->bot.y == yb) { + add_right(edge->outp[BELOW], xb, yb); + } + edge->outp[ABOVE] = edge->outp[BELOW]; + px = xb; + break; + default: + break; + } /* End of switch */ + } /* End of contributing conditional */ + } /* End of edge exists conditional */ + } // End of AET loop + + /* Delete terminating edges from the AET, otherwise compute xt */ + for (edge = aet; edge; edge = edge->next) { + if (edge->top.y == yb) { + prev_edge = edge->prev; + next_edge = edge->next; + if (prev_edge) { + prev_edge->next = next_edge; + } else { + aet = next_edge; + } + if (next_edge) { + next_edge->prev = prev_edge; + } + /* Copy bundle head state to the adjacent tail edge if required */ + if ((edge->bstate[BELOW] == BUNDLE_HEAD) && prev_edge) { + if (prev_edge->bstate[BELOW] == BUNDLE_TAIL) { + prev_edge->outp[BELOW] = edge->outp[BELOW]; + prev_edge->bstate[BELOW] = UNBUNDLED; + if (prev_edge->prev) { + if (prev_edge->prev->bstate[BELOW] == BUNDLE_TAIL) { + prev_edge->bstate[BELOW] = BUNDLE_HEAD; + } + } + } + } + } else { + if (edge->top.y == yt) { + edge->xt = edge->top.x; + } else { + edge->xt = edge->bot.x + edge->dx * (yt - edge->bot.y); + } + } + } + + if (scanbeam < sbt_entries) { + /* === SCANBEAM INTERIOR PROCESSING ============================== */ + build_intersection_table(&it, aet, dy); + /* Process each node in the intersection table */ + for (intersect = it; intersect; intersect = intersect->next) { + e0 = intersect->ie[0]; + e1 = intersect->ie[1]; + /* Only generate output for contributing intersections */ + if ((e0->bundle[ABOVE][CLIP] || e0->bundle[ABOVE][SUBJ]) && + (e1->bundle[ABOVE][CLIP] || e1->bundle[ABOVE][SUBJ])) { + p = e0->outp[ABOVE]; + q = e1->outp[ABOVE]; + ix = intersect->point.x; + iy = intersect->point.y + yb; + + in[CLIP] = (e0->bundle[ABOVE][CLIP] && !e0->bside[CLIP]) || + (e1->bundle[ABOVE][CLIP] && e1->bside[CLIP]) || + 
(!e0->bundle[ABOVE][CLIP] && !e1->bundle[ABOVE][CLIP] && + e0->bside[CLIP] && e1->bside[CLIP]); + in[SUBJ] = (e0->bundle[ABOVE][SUBJ] && !e0->bside[SUBJ]) || + (e1->bundle[ABOVE][SUBJ] && e1->bside[SUBJ]) || + (!e0->bundle[ABOVE][SUBJ] && !e1->bundle[ABOVE][SUBJ] && + e0->bside[SUBJ] && e1->bside[SUBJ]); + + // Determine quadrant occupancies + switch (op) { + case GPC_DIFF: + case GPC_INT: + tr = (in[CLIP]) && (in[SUBJ]); + tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) && + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); + br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) && + (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); + bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ + e0->bundle[ABOVE][CLIP]) && + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ + e0->bundle[ABOVE][SUBJ]); + break; + case GPC_XOR: + tr = (in[CLIP]) ^ (in[SUBJ]); + tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ^ + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); + br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ^ + (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); + bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ + e0->bundle[ABOVE][CLIP]) ^ + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ + e0->bundle[ABOVE][SUBJ]); + break; + case GPC_UNION: + tr = (in[CLIP]) || (in[SUBJ]); + tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) || + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); + br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) || + (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); + bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ + e0->bundle[ABOVE][CLIP]) || + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ + e0->bundle[ABOVE][SUBJ]); + break; + } + vclass = tr + (tl << 1) + (br << 2) + (bl << 3); + switch (vclass) { + case EMN: + add_local_min(&out_poly, e0, ix, iy); + e1->outp[ABOVE] = e0->outp[ABOVE]; + break; + case ERI: + if (p) { + add_right(p, ix, iy); + e1->outp[ABOVE] = p; + e0->outp[ABOVE] = NULL; + } + break; + case ELI: + if (q) { + add_left(q, ix, iy); + e0->outp[ABOVE] = q; + e1->outp[ABOVE] = NULL; + } + break; + case EMX: + if (p && q) { + add_left(p, ix, iy); + merge_right(p, q, out_poly); + e0->outp[ABOVE] = NULL; + 
e1->outp[ABOVE] = NULL; + } + break; + case IMN: + add_local_min(&out_poly, e0, ix, iy); + e1->outp[ABOVE] = e0->outp[ABOVE]; + break; + case ILI: + if (p) { + add_left(p, ix, iy); + e1->outp[ABOVE] = p; + e0->outp[ABOVE] = NULL; + } + break; + case IRI: + if (q) { + add_right(q, ix, iy); + e0->outp[ABOVE] = q; + e1->outp[ABOVE] = NULL; + } + break; + case IMX: + if (p && q) { + add_right(p, ix, iy); + merge_left(p, q, out_poly); + e0->outp[ABOVE] = NULL; + e1->outp[ABOVE] = NULL; + } + break; + case IMM: + if (p && q) { + add_right(p, ix, iy); + merge_left(p, q, out_poly); + add_local_min(&out_poly, e0, ix, iy); + e1->outp[ABOVE] = e0->outp[ABOVE]; + } + break; + case EMM: + if (p && q) { + add_left(p, ix, iy); + merge_right(p, q, out_poly); + add_local_min(&out_poly, e0, ix, iy); + e1->outp[ABOVE] = e0->outp[ABOVE]; + } + break; + default: + break; + } // End of switch + } /* End of contributing intersection conditional */ + + /* Swap bundle sides in response to edge crossing */ + if (e0->bundle[ABOVE][CLIP]) { + e1->bside[CLIP] = !e1->bside[CLIP]; + } + if (e1->bundle[ABOVE][CLIP]) { + e0->bside[CLIP] = !e0->bside[CLIP]; + } + if (e0->bundle[ABOVE][SUBJ]) { + e1->bside[SUBJ] = !e1->bside[SUBJ]; + } + if (e1->bundle[ABOVE][SUBJ]) { + e0->bside[SUBJ] = !e0->bside[SUBJ]; + } + + /* Swap e0 and e1 bundles in the AET */ + prev_edge = e0->prev; + next_edge = e1->next; + if (next_edge) { + next_edge->prev = e0; + } + if (e0->bstate[ABOVE] == BUNDLE_HEAD) { + search = 1; + while (search) { + prev_edge = prev_edge->prev; + if (prev_edge) { + if (prev_edge->bstate[ABOVE] != BUNDLE_TAIL) { + search = 0; + } + } else { + search = 0; + } + } + } + if (!prev_edge) { + aet->prev = e1; + e1->next = aet; + aet = e0->next; + } else { + prev_edge->next->prev = e1; + e1->next = prev_edge->next; + prev_edge->next = e0->next; + } + e0->next->prev = prev_edge; + e1->next->prev = e1; + e0->next = next_edge; + } /* End of IT loop*/ + + // Prepare for next scanbeam + for (edge = aet; 
edge; edge = next_edge) { + next_edge = edge->next; + succ_edge = edge->succ; + if ((edge->top.y == yt) && succ_edge) { + /* Replace AET edge by its successor */ + succ_edge->outp[BELOW] = edge->outp[ABOVE]; + succ_edge->bstate[BELOW] = edge->bstate[ABOVE]; + succ_edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP]; + succ_edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ]; + prev_edge = edge->prev; + if (prev_edge) { + prev_edge->next = succ_edge; + } else { + aet = succ_edge; + } + if (next_edge) { + next_edge->prev = succ_edge; + } + succ_edge->prev = prev_edge; + succ_edge->next = next_edge; + } else { + /* Update this edge */ + edge->outp[BELOW] = edge->outp[ABOVE]; + edge->bstate[BELOW] = edge->bstate[ABOVE]; + edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP]; + edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ]; + edge->xb = edge->xt; + } + edge->outp[ABOVE] = NULL; + } + } + } /* === END OF SCANBEAM PROCESSING ================================== */ + // Generate result polygon from out_poly + result->contour = NULL; + result->hole = NULL; + result->num_contours = count_contours(out_poly); + if (result->num_contours > 0) { + gpc_malloc(result->hole, result->num_contours * sizeof(int), + const_cast("hole flag table creation")); + gpc_malloc(result->contour, + result->num_contours * sizeof(gpc_vertex_list), + const_cast("contour creation")); + + c = 0; + for (poly = out_poly; poly; poly = npoly) { + npoly = poly->next; + if (poly->active) { + result->hole[c] = poly->proxy->hole; + result->contour[c].num_vertices = poly->active; + gpc_malloc( + result->contour[c].vertex, + result->contour[c].num_vertices * sizeof(gpc_vertex), + const_cast("vertex creation")); + + v = result->contour[c].num_vertices - 1; + for (vtx = poly->proxy->v[LEFT]; vtx; vtx = nv) { + nv = vtx->next; + result->contour[c].vertex[v].x = vtx->x; + result->contour[c].vertex[v].y = vtx->y; + gpc_free(vtx); + v--; + } + c++; + } + gpc_free(poly); + } + } else { + for (poly = 
out_poly; poly; poly = npoly) {
+      npoly = poly->next;
+      gpc_free(poly);
+    }
+  }
+
+  // Tidy up
+  reset_it(&it);
+  reset_lmt(&lmt);
+  gpc_free(c_heap);
+  gpc_free(s_heap);
+  gpc_free(sbt);
+}  // NOLINT
+
+/*
+ * Releases the storage owned by tristrip t: the vertex array of every strip,
+ * then the strip array. num_strips is reset to 0. The gpc_tristrip object
+ * itself is not released.
+ */
+void gpc_free_tristrip(gpc_tristrip *t) {
+  int strip_index = 0;
+
+  while (strip_index < t->num_strips) {
+    gpc_free(t->strip[strip_index].vertex);
+    strip_index++;
+  }
+  gpc_free(t->strip);
+  t->num_strips = 0;
+}
+
+/*
+ * Produces the tristrip t for polygon s by calling gpc_tristrip_clip() with
+ * the GPC_DIFF operation and an empty clip polygon (no contours).
+ */
+void gpc_polygon_to_tristrip(gpc_polygon *s, gpc_tristrip *t) {
+  gpc_polygon empty_clip;
+
+  empty_clip.contour = NULL;
+  empty_clip.hole = NULL;
+  empty_clip.num_contours = 0;
+  gpc_tristrip_clip(GPC_DIFF, s, &empty_clip, t);
+}
+
+// gpc_tristrip_clip
+void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
+                       gpc_tristrip *result) {
+  sb_tree *sbtree = NULL;
+  it_node *it = NULL;
+  it_node *intersect = NULL;
+  edge_node *edge = NULL;
+  edge_node *prev_edge = NULL;
+  edge_node *next_edge = NULL;
+  edge_node *succ_edge = NULL;
+  edge_node *e0 = NULL;
+  edge_node *e1 = NULL;
+  edge_node *aet = NULL;
+  edge_node *c_heap = NULL;
+  edge_node *s_heap = NULL;
+  edge_node *cf = NULL;
+  lmt_node *lmt = NULL;
+  lmt_node *local_min = NULL;
+  polygon_node *tlist = NULL;
+  polygon_node *tn = NULL;
+  polygon_node *tnn = NULL;
+  polygon_node *p = NULL;
+  polygon_node *q = NULL;
+  vertex_node *lt = NULL;
+  vertex_node *ltn = NULL;
+  vertex_node *rt = NULL;
+  vertex_node *rtn = NULL;
+  h_state horiz[2];
+  vertex_type cft = NUL;
+  int in[2];
+  int exists[2];
+  int parity[2] = {LEFT, LEFT};
+  int s = 0;
+  int v = 0;
+  int contributing = 0;
+  int search = 0;
+  int scanbeam = 0;
+  int sbt_entries = 0;
+  int vclass = 0;
+  int bl = 0;
+  int br = 0;
+  int tl = 0;
+  int tr = 0;
+  double *sbt = NULL;
+  double xb = 0.0;
+  double px = 0.0;
+  double nx = 0.0;
+  double yb = 0.0;
+  double yt = 0.0;
+  double dy = 0.0;
+  double ix = 0.0;
+  double iy = 0.0;
+
+  /* Test for trivial NULL result cases */
+  if (((subj->num_contours == 0) && (clip->num_contours == 0)) ||
+      ((subj->num_contours == 0) && ((op == GPC_INT) || (op == GPC_DIFF))) ||
+
((clip->num_contours == 0) && (op == GPC_INT))) { + result->num_strips = 0; + result->strip = NULL; + return; + } + + /* Identify potentialy contributing contours */ + if (((op == GPC_INT) || (op == GPC_DIFF)) && (subj->num_contours > 0) && + (clip->num_contours > 0)) { + minimax_test(subj, clip, op); + } + /* Build LMT */ + if (subj->num_contours > 0) { + s_heap = build_lmt(&lmt, &sbtree, &sbt_entries, subj, SUBJ, op); + } + if (clip->num_contours > 0) { + c_heap = build_lmt(&lmt, &sbtree, &sbt_entries, clip, CLIP, op); + } + /* Return a NULL result if no contours contribute */ + if (lmt == NULL) { + result->num_strips = 0; + result->strip = NULL; + reset_lmt(&lmt); + gpc_free(s_heap); + gpc_free(c_heap); + return; + } + + /* Build scanbeam table from scanbeam tree */ + gpc_malloc(sbt, sbt_entries * sizeof(double), + const_cast("sbt creation")); + build_sbt(&scanbeam, sbt, sbtree); + scanbeam = 0; + free_sbtree(&sbtree); + + /* Invert clip polygon for difference operation */ + if (op == GPC_DIFF) { + parity[CLIP] = RIGHT; + } + local_min = lmt; + + // Process each scanbeam + while (scanbeam < sbt_entries) { + /* Set yb and yt to the bottom and top of the scanbeam */ + yb = sbt[scanbeam++]; + if (scanbeam < sbt_entries) { + yt = sbt[scanbeam]; + dy = yt - yb; + } + + /* === SCANBEAM BOUNDARY PROCESSING ================================ */ + /* If LMT node corresponding to yb exists */ + if (local_min) { + if (local_min->y == yb) { + /* Add edges starting at this local minimum to the AET */ + for (edge = local_min->first_bound; edge; edge = edge->next_bound) { + add_edge_to_aet(&aet, edge, NULL); + } + local_min = local_min->next; + } + } + /* Set dummy previous x value */ + /* Create bundles within AET */ + px = -DBL_MAX; + e0 = aet; + e1 = aet; + + /* Set up bundle fields of first edge */ + aet->bundle[ABOVE][aet->type] = (aet->top.y != yb); + aet->bundle[ABOVE][!aet->type] = 0; + aet->bstate[ABOVE] = UNBUNDLED; + + for (next_edge = aet->next; next_edge; next_edge 
= next_edge->next) { + /* Set up bundle fields of next edge */ + next_edge->bundle[ABOVE][next_edge->type] = (next_edge->top.y != yb); + next_edge->bundle[ABOVE][!next_edge->type] = 0; + next_edge->bstate[ABOVE] = UNBUNDLED; + + /* Bundle edges above the scanbeam boundary if they coincide */ + if (next_edge->bundle[ABOVE][next_edge->type]) { + if (gpc_eq(e0->xb, next_edge->xb) && gpc_eq(e0->dx, next_edge->dx) && + (e0->top.y != yb)) { + next_edge->bundle[ABOVE][next_edge->type] ^= + e0->bundle[ABOVE][next_edge->type]; + next_edge->bundle[ABOVE][!next_edge->type] = + e0->bundle[ABOVE][!next_edge->type]; + next_edge->bstate[ABOVE] = BUNDLE_HEAD; + e0->bundle[ABOVE][CLIP] = 0; + e0->bundle[ABOVE][SUBJ] = 0; + e0->bstate[ABOVE] = BUNDLE_TAIL; + } + e0 = next_edge; + } + } + horiz[CLIP] = NH; + horiz[SUBJ] = NH; + + /* Process each edge at this scanbeam boundary */ + for (edge = aet; edge; edge = edge->next) { + exists[CLIP] = + edge->bundle[ABOVE][CLIP] + (edge->bundle[BELOW][CLIP] << 1); + exists[SUBJ] = + edge->bundle[ABOVE][SUBJ] + (edge->bundle[BELOW][SUBJ] << 1); + + if (exists[CLIP] || exists[SUBJ]) { + /* Set bundle side */ + edge->bside[CLIP] = parity[CLIP]; + edge->bside[SUBJ] = parity[SUBJ]; + + /* Determine contributing status and quadrant occupancies */ + switch (op) { + case GPC_DIFF: + case GPC_INT: + contributing = (exists[CLIP] && (parity[SUBJ] || horiz[SUBJ])) || + (exists[SUBJ] && (parity[CLIP] || horiz[CLIP])) || + (exists[CLIP] && exists[SUBJ] && + (parity[CLIP] == parity[SUBJ])); + br = (parity[CLIP]) && (parity[SUBJ]); + bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) && + (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); + tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) && + (parity[SUBJ] ^ (horiz[SUBJ] != NH)); + tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ + edge->bundle[BELOW][CLIP]) && + (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ + edge->bundle[BELOW][SUBJ]); + break; + case GPC_XOR: + contributing = exists[CLIP] || exists[SUBJ]; + br = (parity[CLIP]) ^ 
(parity[SUBJ]); + bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ^ + (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); + tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ^ + (parity[SUBJ] ^ (horiz[SUBJ] != NH)); + tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ + edge->bundle[BELOW][CLIP]) ^ + (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ + edge->bundle[BELOW][SUBJ]); + break; + case GPC_UNION: + contributing = (exists[CLIP] && (!parity[SUBJ] || horiz[SUBJ])) || + (exists[SUBJ] && (!parity[CLIP] || horiz[CLIP])) || + (exists[CLIP] && exists[SUBJ] && + (parity[CLIP] == parity[SUBJ])); + br = (parity[CLIP]) || (parity[SUBJ]); + bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) || + (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); + tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) || + (parity[SUBJ] ^ (horiz[SUBJ] != NH)); + tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ + edge->bundle[BELOW][CLIP]) || + (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ + edge->bundle[BELOW][SUBJ]); + break; + } + + // Update parity + parity[CLIP] ^= edge->bundle[ABOVE][CLIP]; + parity[SUBJ] ^= edge->bundle[ABOVE][SUBJ]; + + /* Update horizontal state */ + if (exists[CLIP]) { + horiz[CLIP] = next_h_state[horiz[CLIP]] + [((exists[CLIP] - 1) << 1) + parity[CLIP]]; + } + if (exists[SUBJ]) { + horiz[SUBJ] = next_h_state[horiz[SUBJ]] + [((exists[SUBJ] - 1) << 1) + parity[SUBJ]]; + } + vclass = tr + (tl << 1) + (br << 2) + (bl << 3); + + if (contributing) { + xb = edge->xb; + switch (vclass) { + case EMN: + new_tristrip(&tlist, edge, xb, yb); + cf = edge; + break; + case ERI: + edge->outp[ABOVE] = cf->outp[ABOVE]; + if (xb != cf->xb) { + gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb); + } + cf = NULL; + break; + case ELI: + gpc_vertex_create(edge, BELOW, LEFT, xb, yb); + edge->outp[ABOVE] = NULL; + cf = edge; + break; + case EMX: + if (xb != cf->xb) { + gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); + } + edge->outp[ABOVE] = NULL; + cf = NULL; + break; + case IMN: + if (cft == LED) { + if (cf->bot.y != yb) { + gpc_vertex_create(cf, BELOW, LEFT, 
cf->xb, yb); + } + new_tristrip(&tlist, cf, cf->xb, yb); + } + edge->outp[ABOVE] = cf->outp[ABOVE]; + gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb); + break; + case ILI: + new_tristrip(&tlist, edge, xb, yb); + cf = edge; + cft = ILI; + break; + case IRI: + if (cft == LED) { + if (cf->bot.y != yb) { + gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb); + } + new_tristrip(&tlist, cf, cf->xb, yb); + } + gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); + edge->outp[ABOVE] = NULL; + break; + case IMX: + gpc_vertex_create(edge, BELOW, LEFT, xb, yb); + edge->outp[ABOVE] = NULL; + cft = IMX; + break; + case IMM: + gpc_vertex_create(edge, BELOW, LEFT, xb, yb); + edge->outp[ABOVE] = cf->outp[ABOVE]; + if (xb != cf->xb) { + gpc_vertex_create(cf, ABOVE, RIGHT, xb, yb); + } + cf = edge; + break; + case EMM: + gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); + edge->outp[ABOVE] = NULL; + new_tristrip(&tlist, edge, xb, yb); + cf = edge; + break; + case LED: + if (edge->bot.y == yb) { + gpc_vertex_create(edge, BELOW, LEFT, xb, yb); + } + edge->outp[ABOVE] = edge->outp[BELOW]; + cf = edge; + cft = LED; + break; + case RED: + edge->outp[ABOVE] = cf->outp[ABOVE]; + if (cft == LED) { + if (cf->bot.y == yb) { + gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); + } else { + if (edge->bot.y == yb) { + gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb); + gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); + } + } + } else { + gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); + gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb); + } + cf = NULL; + break; + default: + break; + } /* End of switch */ + } /* End of contributing conditional */ + } /* End of edge exists conditional */ + } // End of AET loop + + /* Delete terminating edges from the AET, otherwise compute xt */ + for (edge = aet; edge; edge = edge->next) { + if (edge->top.y == yb) { + prev_edge = edge->prev; + next_edge = edge->next; + if (prev_edge) { + prev_edge->next = next_edge; + } else { + aet = next_edge; + } + if (next_edge) { + next_edge->prev = 
prev_edge; + } + + /* Copy bundle head state to the adjacent tail edge if required */ + if ((edge->bstate[BELOW] == BUNDLE_HEAD) && prev_edge) { + if (prev_edge->bstate[BELOW] == BUNDLE_TAIL) { + prev_edge->outp[BELOW] = edge->outp[BELOW]; + prev_edge->bstate[BELOW] = UNBUNDLED; + if (prev_edge->prev) { + if (prev_edge->prev->bstate[BELOW] == BUNDLE_TAIL) { + prev_edge->bstate[BELOW] = BUNDLE_HEAD; + } + } + } + } + } else { + if (edge->top.y == yt) { + edge->xt = edge->top.x; + } else { + edge->xt = edge->bot.x + edge->dx * (yt - edge->bot.y); + } + } + } + + if (scanbeam < sbt_entries) { + /* === SCANBEAM INTERIOR PROCESSING ============================== */ + build_intersection_table(&it, aet, dy); + /* Process each node in the intersection table */ + for (intersect = it; intersect; intersect = intersect->next) { + e0 = intersect->ie[0]; + e1 = intersect->ie[1]; + + /* Only generate output for contributing intersections */ + if ((e0->bundle[ABOVE][CLIP] || e0->bundle[ABOVE][SUBJ]) && + (e1->bundle[ABOVE][CLIP] || e1->bundle[ABOVE][SUBJ])) { + p = e0->outp[ABOVE]; + q = e1->outp[ABOVE]; + ix = intersect->point.x; + iy = intersect->point.y + yb; + + in[CLIP] = (e0->bundle[ABOVE][CLIP] && !e0->bside[CLIP]) || + (e1->bundle[ABOVE][CLIP] && e1->bside[CLIP]) || + (!e0->bundle[ABOVE][CLIP] && !e1->bundle[ABOVE][CLIP] && + e0->bside[CLIP] && e1->bside[CLIP]); + in[SUBJ] = (e0->bundle[ABOVE][SUBJ] && !e0->bside[SUBJ]) || + (e1->bundle[ABOVE][SUBJ] && e1->bside[SUBJ]) || + (!e0->bundle[ABOVE][SUBJ] && !e1->bundle[ABOVE][SUBJ] && + e0->bside[SUBJ] && e1->bside[SUBJ]); + + switch (op) { // Determine quadrant occupancies + case GPC_DIFF: + case GPC_INT: + tr = (in[CLIP]) && (in[SUBJ]); + tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) && + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); + br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) && + (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); + bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ + e0->bundle[ABOVE][CLIP]) && + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ + 
e0->bundle[ABOVE][SUBJ]); + break; + case GPC_XOR: + tr = (in[CLIP]) ^ (in[SUBJ]); + tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ^ + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); + br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ^ + (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); + bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ + e0->bundle[ABOVE][CLIP]) ^ + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ + e0->bundle[ABOVE][SUBJ]); + break; + case GPC_UNION: + tr = (in[CLIP]) || (in[SUBJ]); + tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) || + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); + br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) || + (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); + bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ + e0->bundle[ABOVE][CLIP]) || + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ + e0->bundle[ABOVE][SUBJ]); + break; + } + + vclass = tr + (tl << 1) + (br << 2) + (bl << 3); + switch (vclass) { + case EMN: + new_tristrip(&tlist, e1, ix, iy); + e0->outp[ABOVE] = e1->outp[ABOVE]; + break; + case ERI: + if (p) { + gpc_p_edge(prev_edge, e0, ABOVE); + gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy); + gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy); + e1->outp[ABOVE] = e0->outp[ABOVE]; + e0->outp[ABOVE] = NULL; + } + break; + case ELI: + if (q) { + gpc_n_edge(next_edge, e1, ABOVE); + gpc_vertex_create(e1, ABOVE, LEFT, ix, iy); + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + e0->outp[ABOVE] = e1->outp[ABOVE]; + e1->outp[ABOVE] = NULL; + } + break; + case EMX: + if (p && q) { + gpc_vertex_create(e0, ABOVE, LEFT, ix, iy); + e0->outp[ABOVE] = NULL; + e1->outp[ABOVE] = NULL; + } + break; + case IMN: + gpc_p_edge(prev_edge, e0, ABOVE); + gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy); + gpc_n_edge(next_edge, e1, ABOVE); + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + new_tristrip(&tlist, prev_edge, px, iy); + e1->outp[ABOVE] = prev_edge->outp[ABOVE]; + gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy); + new_tristrip(&tlist, e0, ix, iy); + next_edge->outp[ABOVE] = e0->outp[ABOVE]; + gpc_vertex_create(next_edge, 
ABOVE, RIGHT, nx, iy); + break; + case ILI: + if (p) { + gpc_vertex_create(e0, ABOVE, LEFT, ix, iy); + gpc_n_edge(next_edge, e1, ABOVE); + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + e1->outp[ABOVE] = e0->outp[ABOVE]; + e0->outp[ABOVE] = NULL; + } + break; + case IRI: + if (q) { + gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy); + gpc_p_edge(prev_edge, e0, ABOVE); + gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy); + e0->outp[ABOVE] = e1->outp[ABOVE]; + e1->outp[ABOVE] = NULL; + } + break; + case IMX: + if (p && q) { + gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy); + gpc_vertex_create(e1, ABOVE, LEFT, ix, iy); + e0->outp[ABOVE] = NULL; + e1->outp[ABOVE] = NULL; + gpc_p_edge(prev_edge, e0, ABOVE); + gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy); + new_tristrip(&tlist, prev_edge, px, iy); + gpc_n_edge(next_edge, e1, ABOVE); + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + next_edge->outp[ABOVE] = prev_edge->outp[ABOVE]; + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + } + break; + case IMM: + if (p && q) { + gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy); + gpc_vertex_create(e1, ABOVE, LEFT, ix, iy); + gpc_p_edge(prev_edge, e0, ABOVE); + gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy); + new_tristrip(&tlist, prev_edge, px, iy); + gpc_n_edge(next_edge, e1, ABOVE); + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + e1->outp[ABOVE] = prev_edge->outp[ABOVE]; + gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy); + new_tristrip(&tlist, e0, ix, iy); + next_edge->outp[ABOVE] = e0->outp[ABOVE]; + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + } + break; + case EMM: + if (p && q) { + gpc_vertex_create(e0, ABOVE, LEFT, ix, iy); + new_tristrip(&tlist, e1, ix, iy); + e0->outp[ABOVE] = e1->outp[ABOVE]; + } + break; + default: + break; + } /* End of switch */ + } /* End of contributing intersection conditional */ + + // Swap bundle sides in response to edge crossing + if (e0->bundle[ABOVE][CLIP]) { + e1->bside[CLIP] = !e1->bside[CLIP]; + } + if 
(e1->bundle[ABOVE][CLIP]) { + e0->bside[CLIP] = !e0->bside[CLIP]; + } + if (e0->bundle[ABOVE][SUBJ]) { + e1->bside[SUBJ] = !e1->bside[SUBJ]; + } + if (e1->bundle[ABOVE][SUBJ]) { + e0->bside[SUBJ] = !e0->bside[SUBJ]; + } + + /* Swap e0 and e1 bundles in the AET */ + prev_edge = e0->prev; + next_edge = e1->next; + if (e1->next) { + e1->next->prev = e0; + } + + if (e0->bstate[ABOVE] == BUNDLE_HEAD) { + search = 1; + while (search) { + prev_edge = prev_edge->prev; + if (prev_edge) { + if (prev_edge->bundle[ABOVE][CLIP] || + prev_edge->bundle[ABOVE][SUBJ] || + (prev_edge->bstate[ABOVE] == BUNDLE_HEAD)) { + search = 0; + } + } else { + search = 0; + } + } + } + if (!prev_edge) { + e1->next = aet; + aet = e0->next; + } else { + e1->next = prev_edge->next; + prev_edge->next = e0->next; + } + e0->next->prev = prev_edge; + e1->next->prev = e1; + e0->next = next_edge; + } /* End of IT loop*/ + + /* Prepare for next scanbeam */ + for (edge = aet; edge; edge = next_edge) { + next_edge = edge->next; + succ_edge = edge->succ; + + if ((edge->top.y == yt) && succ_edge) { + /* Replace AET edge by its successor */ + succ_edge->outp[BELOW] = edge->outp[ABOVE]; + succ_edge->bstate[BELOW] = edge->bstate[ABOVE]; + succ_edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP]; + succ_edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ]; + prev_edge = edge->prev; + if (prev_edge) { + prev_edge->next = succ_edge; + } else { + aet = succ_edge; + } + if (next_edge) { + next_edge->prev = succ_edge; + } + succ_edge->prev = prev_edge; + succ_edge->next = next_edge; + } else { + /* Update this edge */ + edge->outp[BELOW] = edge->outp[ABOVE]; + edge->bstate[BELOW] = edge->bstate[ABOVE]; + edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP]; + edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ]; + edge->xb = edge->xt; + } + edge->outp[ABOVE] = NULL; + } + } + } /* === END OF SCANBEAM PROCESSING ================================== */ + + // Generate result tristrip from tlist + result->strip = 
NULL; + result->num_strips = count_tristrips(tlist); + if (result->num_strips > 0) { + gpc_malloc(result->strip, + result->num_strips * sizeof(gpc_vertex_list), + const_cast("tristrip list creation")); + + s = 0; + for (tn = tlist; tn; tn = tnn) { + tnn = tn->next; + if (tn->active > 2) { + /* Valid tristrip: copy the vertices and free the heap */ + result->strip[s].num_vertices = tn->active; + gpc_malloc(result->strip[s].vertex, + tn->active * sizeof(gpc_vertex), + const_cast("tristrip creation")); + v = 0; + if (0) { + lt = tn->v[RIGHT]; + rt = tn->v[LEFT]; + } else { + lt = tn->v[LEFT]; + rt = tn->v[RIGHT]; + } + while (lt || rt) { + if (lt) { + ltn = lt->next; + result->strip[s].vertex[v].x = lt->x; + result->strip[s].vertex[v].y = lt->y; + v++; + gpc_free(lt); + lt = ltn; + } + if (rt) { + rtn = rt->next; + result->strip[s].vertex[v].x = rt->x; + result->strip[s].vertex[v].y = rt->y; + v++; + gpc_free(rt); + rt = rtn; + } + } + s++; + } else { + /* Invalid tristrip: just free the heap */ + for (lt = tn->v[LEFT]; lt; lt = ltn) { + ltn = lt->next; + gpc_free(lt); + } + for (rt = tn->v[RIGHT]; rt; rt = rtn) { + rtn = rt->next; + gpc_free(rt); + } + } + gpc_free(tn); + } + } + // Tidy up + reset_it(&it); + reset_lmt(&lmt); + gpc_free(c_heap); + gpc_free(s_heap); + gpc_free(sbt); +} // NOLINT + +} // namespace gpc + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ diff --git a/paddle/fluid/operators/detection/gpc.h b/paddle/fluid/operators/detection/gpc.h new file mode 100644 index 0000000000..ee86262ef2 --- /dev/null +++ b/paddle/fluid/operators/detection/gpc.h @@ -0,0 +1,246 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/*************************************************************************** + * + * Copyright (c) 2015 Baidu.com, Inc. All Rights Reserved + * + **************************************************************************/ + +/** + * @file include/gpc.h + * @author huhan02(com@baidu.com) + * @date 2015/12/18 13:52:10 + * @brief + * + * @modified by sunyipeng + * @email sunyipeng@baidu.com + * @date 2018/6/12 + **/ + +#ifndef PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_ // GPC_H_ +#define PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_ // GPC_H_ + +#include +#include +#include +#include + +namespace gpc { + +typedef enum { // Set operation type + GPC_DIFF, // Difference + GPC_INT, // Intersection + GPC_XOR, // Exclusive or + GPC_UNION // Union +} gpc_op; + +typedef struct { // Polygon vertex structure + double x; // Vertex x component + double y; // vertex y component +} gpc_vertex; + +typedef struct { // Vertex list structure + int num_vertices; // Number of vertices in list + gpc_vertex *vertex; // Vertex array pointer +} gpc_vertex_list; + +typedef struct { // Polygon set structure + int num_contours; // Number of contours in polygon + int *hole; // Hole external contour flags + gpc_vertex_list *contour; // Contour array pointer +} gpc_polygon; + +typedef struct { // Tristrip set structure + int num_strips; // Number of tristrips + gpc_vertex_list *strip; // Tristrip array pointer +} gpc_tristrip; + +typedef enum { LEFT, RIGHT } gpc_left_right; + +typedef enum { ABOVE, BELOW } gpc_above_below; + +typedef enum { CLIP, SUBJ } gpc_clip_subj; + +typedef enum { 
/* Edge intersection classes */ + NUL, /* Empty non-intersection */ + EMX, /* External maximum */ + ELI, /* External left intermediate */ + TED, /* Top edge */ + ERI, /* External right intermediate */ + RED, /* Right edge */ + IMM, /* Internal maximum and minimum */ + IMN, /* Internal minimum */ + EMN, /* External minimum */ + EMM, /* External maximum and minimum */ + LED, /* Left edge */ + ILI, /* Internal left intermediate */ + BED, /* Bottom edge */ + IRI, /* Internal right intermediate */ + IMX, /* Internal maximum */ + FUL /* Full non-intersection */ +} vertex_type; + +typedef enum { /* Horizontal edge states */ + NH, /* No horizontal edge */ + BH, /* Bottom horizontal edge */ + TH /* Top horizontal edge */ +} h_state; + +typedef enum { /* Edge bundle state */ + UNBUNDLED, /* Isolated edge not within a bundle */ + BUNDLE_HEAD, /* Bundle head node */ + BUNDLE_TAIL /* Passive bundle tail node */ +} bundle_state; + +typedef struct v_shape { /* Internal vertex list datatype */ + double x; /* X coordinate component */ + double y; /* Y coordinate component */ + struct v_shape *next; /* Pointer to next vertex in list */ +} vertex_node; + +typedef struct p_shape { /* Internal contour / tristrip type */ + int active; /* Active flag / vertex count */ + int hole; /* Hole / external contour flag */ + vertex_node *v[2]; /* Left and right vertex list ptrs */ + struct p_shape *next; /* Pointer to next polygon contour */ + struct p_shape *proxy; /* Pointer to actual structure used */ +} polygon_node; + +typedef struct edge_shape { + gpc_vertex vertex; /* Piggy-backed contour vertex data */ + gpc_vertex bot; /* Edge lower (x, y) coordinate */ + gpc_vertex top; /* Edge upper (x, y) coordinate */ + double xb; /* Scanbeam bottom x coordinate */ + double xt; /* Scanbeam top x coordinate */ + double dx; /* Change in x for a unit y increase */ + int type; /* Clip / subject edge flag */ + int bundle[2][2]; /* Bundle edge flags */ + int bside[2]; /* Bundle left / right indicators */ + 
bundle_state bstate[2]; /* Edge bundle state */ + polygon_node *outp[2]; /* Output polygon / tristrip pointer */ + struct edge_shape *prev; /* Previous edge in the AET */ + struct edge_shape *next; /* Next edge in the AET */ + struct edge_shape *pred; /* Edge connected at the lower end */ + struct edge_shape *succ; /* Edge connected at the upper end */ + struct edge_shape *next_bound; /* Pointer to next bound in LMT */ +} edge_node; + +inline bool gpc_eq(float a, float b) { return (fabs(a - b) <= 1e-6); } + +inline bool gpc_prev_index(float a, float b) { return (fabs(a - b) <= 1e-6); } + +inline int gpc_prev_index(int i, int n) { return ((i - 1 + n) % n); } + +inline int gpc_next_index(int i, int n) { return ((i + 1) % n); } + +inline int gpc_optimal(gpc_vertex *v, int i, int n) { + return (v[(i + 1) % n].y != v[i].y || v[(i - 1 + n) % n].y != v[i].y); +} + +inline int gpc_fwd_min(edge_node *v, int i, int n) { + return (v[(i + 1) % n].vertex.y > v[i].vertex.y && + v[(i - 1 + n) % n].vertex.y >= v[i].vertex.y); +} + +inline int gpc_not_fmax(edge_node *v, int i, int n) { + return (v[(i + 1) % n].vertex.y > v[i].vertex.y); +} + +inline int gpc_rev_min(edge_node *v, int i, int n) { + return (v[(i + 1) % n].vertex.y >= v[i].vertex.y && + v[(i - 1 + n) % n].vertex.y > v[i].vertex.y); +} + +inline int gpc_not_rmax(edge_node *v, int i, int n) { + return (v[(i - 1 + n) % n].vertex.y > v[i].vertex.y); +} + +// inline void gpc_p_edge(edge_node *d, edge_node *e, int p, double i, double j) +// { +inline void gpc_p_edge(edge_node *d, edge_node *e, int p) { + d = e; + do { + d = d->prev; + } while (!d->outp[p]); + // i = d->bot.x + d->dx * (j - d->bot.y); +} + +// inline void gpc_n_edge(edge_node *d, edge_node *e, int p, double i, double j) +// { +inline void gpc_n_edge(edge_node *d, edge_node *e, int p) { + d = e; + do { + d = d->next; + } while (!d->outp[p]); + // i = d->bot.x + d->dx * (j - d->bot.y); +} + +template +void gpc_malloc(T *&p, int b, char *s) { + if (b > 0) { + p 
= (T *)malloc(b); + + if (!p) { + fprintf(stderr, "gpc malloc failure: %s\n", s); + exit(0); + } + } else { + p = NULL; + } +} +template +void gpc_free(T *&p) { + if (p) { + free(p); + p = NULL; + } +} + +/* +=========================================================================== + Public Function Prototypes +=========================================================================== +*/ + +void add_vertex(vertex_node **t, double x, double y); + +void gpc_vertex_create(edge_node *e, int p, int s, double x, double y); + +/* +void gpc_read_polygon(FILE *infile_ptr, int read_hole_flags, + gpc_polygon *polygon); + +void gpc_write_polygon(FILE *outfile_ptr, int write_hole_flags, + gpc_polygon *polygon); +*/ +void gpc_add_contour(gpc_polygon *polygon, gpc_vertex_list *contour, int hole); + +void gpc_polygon_clip(gpc_op set_operation, gpc_polygon *subject_polygon, + gpc_polygon *clip_polygon, gpc_polygon *result_polygon); + +void gpc_tristrip_clip(gpc_op set_operation, gpc_polygon *subject_polygon, + gpc_polygon *clip_polygon, + gpc_tristrip *result_tristrip); + +void gpc_polygon_to_tristrip(gpc_polygon *polygon, gpc_tristrip *tristrip); + +void gpc_free_polygon(gpc_polygon *polygon); + +void gpc_free_tristrip(gpc_tristrip *tristrip); + +} // namespace gpc + +#endif // PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_ +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 60b93efdce..9e78b28a60 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -9,10 +9,11 @@ http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and + limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/poly_util.h" namespace paddle { namespace operators { @@ -20,9 +21,6 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -constexpr int64_t kOutputDim = 6; -constexpr int64_t kBBoxSize = 4; - class MultiClassNMSOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -42,10 +40,15 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { "The rank of Input(BBoxes) must be 3."); PADDLE_ENFORCE_EQ(score_dims.size(), 3, "The rank of Input(Scores) must be 3."); - PADDLE_ENFORCE_EQ(box_dims[2], 4, - "The 2nd dimension of Input(BBoxes) must be 4, " - "represents the layout of coordinate " - "[xmin, ymin, xmax, ymax]"); + PADDLE_ENFORCE(box_dims[2] == 4 || box_dims[2] == 8 || box_dims[2] == 16 || + box_dims[2] == 24 || box_dims[2] == 32, + "The 2nd dimension of Input(BBoxes) must be 4 or 8, " + "represents the layout of coordinate " + "[xmin, ymin, xmax, ymax] or " + "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or " + "8 points: [xi, yi] i= 1,2,...,8 or " + "12 points: [xi, yi] i= 1,2,...,12 or " + "16 points: [xi, yi] i= 1,2,...,16"); PADDLE_ENFORCE_EQ(box_dims[1], score_dims[2], "The 1st dimensiong of Input(BBoxes) must be equal to " "3rd dimension of Input(Scores), which represents the " @@ -53,7 +56,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { // Here the box_dims[0] is not the real dimension of output. // It will be rewritten in the computing kernel. 
- ctx->SetOutputDim("Out", {box_dims[1], 6}); + ctx->SetOutputDim("Out", {box_dims[1], box_dims[2] + 2}); } protected: @@ -128,6 +131,21 @@ static inline T JaccardOverlap(const T* box1, const T* box2, } } +template +T PolyIoU(const T* box1, const T* box2, const size_t box_size, + const bool normalized) { + T bbox1_area = PolyArea(box1, box_size, normalized); + T bbox2_area = PolyArea(box2, box_size, normalized); + T inter_area = PolyOverlapArea(box1, box2, box_size, normalized); + if (bbox1_area == 0 || bbox2_area == 0 || inter_area == 0) { + // If coordinate values are is invalid + // if area size <= 0, return 0. + return T(0.); + } else { + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + template class MultiClassNMSKernel : public framework::OpKernel { public: @@ -137,6 +155,8 @@ class MultiClassNMSKernel : public framework::OpKernel { // The total boxes for each instance. int64_t num_boxes = bbox.dims()[0]; // 4: [xmin ymin xmax ymax] + // 8: [x1 y1 x2 y2 x3 y3 x4 y4] + // 16, 24, or 32: [x1 y1 x2 y2 ... 
xn yn], n = 8, 12 or 16 int64_t box_size = bbox.dims()[1]; std::vector scores_data(num_boxes); @@ -154,8 +174,19 @@ class MultiClassNMSKernel : public framework::OpKernel { for (size_t k = 0; k < selected_indices->size(); ++k) { if (keep) { const int kept_idx = (*selected_indices)[k]; - T overlap = JaccardOverlap(bbox_data + idx * box_size, + T overlap = T(0.); + // 4: [xmin ymin xmax ymax] + if (box_size == 4) { + overlap = JaccardOverlap(bbox_data + idx * box_size, bbox_data + kept_idx * box_size, true); + } + // 8: [x1 y1 x2 y2 x3 y3 x4 y4] or 16, 24, 32 + if (box_size == 8 || box_size == 16 || box_size == 24 || + box_size == 32) { + overlap = + PolyIoU(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, box_size, true); + } keep = overlap <= adaptive_threshold; } else { break; @@ -228,7 +259,9 @@ class MultiClassNMSKernel : public framework::OpKernel { void MultiClassOutput(const Tensor& scores, const Tensor& bboxes, const std::map>& selected_indices, Tensor* outs) const { - int predict_dim = scores.dims()[1]; + int64_t predict_dim = scores.dims()[1]; + int64_t box_size = bboxes.dims()[1]; + int64_t out_dim = bboxes.dims()[1] + 2; auto* scores_data = scores.data(); auto* bboxes_data = bboxes.data(); auto* odata = outs->data(); @@ -240,11 +273,11 @@ class MultiClassNMSKernel : public framework::OpKernel { const std::vector& indices = it.second; for (size_t j = 0; j < indices.size(); ++j) { int idx = indices[j]; - const T* bdata = bboxes_data + idx * kBBoxSize; - odata[count * kOutputDim] = label; // label - odata[count * kOutputDim + 1] = sdata[idx]; // score - // xmin, ymin, xmax, ymax - std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T)); + const T* bdata = bboxes_data + idx * box_size; + odata[count * out_dim] = label; // label + odata[count * out_dim + 1] = sdata[idx]; // score + // xmin, ymin, xmax, ymax or multi-points coordinates + std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T)); count++; } } @@ -261,6 +294,7 
@@ class MultiClassNMSKernel : public framework::OpKernel { int64_t class_num = score_dims[1]; int64_t predict_dim = score_dims[2]; int64_t box_dim = boxes->dims()[2]; + int64_t out_dim = boxes->dims()[2] + 2; std::vector>> all_indices; std::vector batch_starts = {0}; @@ -283,7 +317,7 @@ class MultiClassNMSKernel : public framework::OpKernel { T* od = outs->mutable_data({1}, ctx.GetPlace()); od[0] = -1; } else { - outs->mutable_data({num_kept, kOutputDim}, ctx.GetPlace()); + outs->mutable_data({num_kept, out_dim}, ctx.GetPlace()); for (int64_t i = 0; i < batch_size; ++i) { Tensor ins_score = scores->Slice(i, i + 1); ins_score.Resize({class_num, predict_dim}); @@ -311,10 +345,11 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("BBoxes", - "(Tensor) A 3-D Tensor with shape [N, M, 4] represents the " + "(Tensor) A 3-D Tensor with shape " + "[N, M, 4 or 8 16 24 32] represents the " "predicted locations of M bounding bboxes, N is the batch size. " "Each bounding box has four coordinate values and the layout is " - "[xmin, ymin, xmax, ymax]."); + "[xmin, ymin, xmax, ymax], when box size equals to 4."); AddInput("Scores", "(Tensor) A 3-D Tensor with shape [N, C, M] represents the " "predicted confidence predictions. N is the batch size, C is the " @@ -351,8 +386,12 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the " "detections. Each row has 6 values: " - "[label, confidence, xmin, ymin, xmax, ymax], No is the total " - "number of detections in this mini-batch. For each instance, " + "[label, confidence, xmin, ymin, xmax, ymax] or " + "(LoDTensor) A 2-D LoDTensor with shape [No, 10] represents the " + "detections. Each row has 10 values: " + "[label, confidence, x1, y1, x2, y2, x3, y3, x4, y4]. No is the " + "total number of detections in this mini-batch." 
+ "For each instance, " "the offsets in first dimension are called LoD, the number of " "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is " "no detected bbox."); diff --git a/paddle/fluid/operators/detection/poly_util.cc b/paddle/fluid/operators/detection/poly_util.cc new file mode 100644 index 0000000000..1af2c95c6c --- /dev/null +++ b/paddle/fluid/operators/detection/poly_util.cc @@ -0,0 +1,132 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef POLY_UTIL_CC_ +#define POLY_UTIL_CC_ + +#include "paddle/fluid/operators/detection/poly_util.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using gpc::gpc_polygon_clip; +using gpc::gpc_free_polygon; + +template +void Array2PointVec(const T*& box, const size_t box_size, + std::vector>& vec) { + size_t pts_num = box_size / 2; + vec.resize(pts_num); + for (size_t i = 0; i < pts_num; i++) { + vec.at(i).x = box[2 * i]; + vec.at(i).y = box[2 * i + 1]; + } +} + +template +void Array2Poly(const T*& box, const size_t box_size, gpc::gpc_polygon& poly) { + size_t pts_num = box_size / 2; + poly.num_contours = 1; + poly.hole = (int*)malloc(sizeof(int)); + poly.hole[0] = 0; + poly.contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list)); + poly.contour->num_vertices = pts_num; + poly.contour->vertex = + (gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num); + for (size_t i = 0; i < pts_num; ++i) { + poly.contour->vertex[i].x = box[2 * i]; + poly.contour->vertex[i].y = box[2 * i + 1]; + } +} + +template +void PointVec2Poly(const std::vector>& vec, gpc::gpc_polygon& poly) { + int pts_num = vec.size(); + poly.num_contours = 1; + poly.hole = (int*)malloc(sizeof(int)); + poly.hole[0] = 0; + poly.contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list)); + poly.contour->num_vertices = pts_num; + poly.contour->vertex = + (gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num); + for (size_t i = 0; i < pts_num; ++i) { + poly.contour->vertex[i].x = vec[i].x; + poly.contour->vertex[i].y = vec[i].y; + } +} + +template +void Poly2PointVec(const gpc::gpc_vertex_list& contour, + std::vector>& vec) { + int pts_num = contour.num_vertices; + vec.resize(pts_num); + for (int i = 0; i < pts_num; i++) { + vec.at(i).x = contour.vertex[i].x; + vec.at(i).y = contour.vertex[i].y; + } +} + +template +T GetContourArea(std::vector>& vec) { + size_t pts_num = vec.size(); + if (pts_num < 3) return T(0.); + T 
area = T(0.); + for (size_t i = 0; i < pts_num; ++i) { + area += vec[i].x * vec[(i + 1) % pts_num].y - + vec[i].y * vec[(i + 1) % pts_num].x; + } + return std::fabs(area / 2.0); +} + +template +T PolyArea(const T* box, const size_t box_size, const bool normalized) { + // If coordinate values are is invalid + // if area size <= 0, return 0. + std::vector> vec; + Array2PointVec(box, box_size, vec); + return GetContourArea(vec); +} + +template +T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size, + const bool normalized) { + gpc::gpc_polygon poly1; + gpc::gpc_polygon poly2; + Array2Poly(box1, box_size, poly1); + Array2Poly(box2, box_size, poly2); + gpc::gpc_polygon respoly; + gpc::gpc_op op = gpc::GPC_INT; + gpc::gpc_polygon_clip(op, &poly2, &poly1, &respoly); + + T inter_area = T(0.); + int contour_num = respoly.num_contours; + for (int i = 0; i < contour_num; ++i) { + std::vector> resvec; + Poly2PointVec(respoly.contour[i], resvec); + // inter_area += std::fabs(cv::contourArea(resvec)) + 0.5f * + // (cv::arcLength(resvec, true)); + inter_area += GetContourArea(resvec); + } + + gpc::gpc_free_polygon(&poly1); + gpc::gpc_free_polygon(&poly2); + gpc::gpc_free_polygon(&respoly); + return inter_area; +} + +} // namespace operators +} // namespace paddle + +#endif diff --git a/paddle/fluid/operators/detection/poly_util.h b/paddle/fluid/operators/detection/poly_util.h new file mode 100644 index 0000000000..f07baf72d9 --- /dev/null +++ b/paddle/fluid/operators/detection/poly_util.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef POLY_UTIL_H_ +#define POLY_UTIL_H_ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/gpc.h" + +namespace paddle { +namespace operators { + +template +class Point_ { + public: + // default constructor + Point_() {} + Point_(T _x, T _y) {} + Point_(const Point_& pt) {} + + Point_& operator=(const Point_& pt); + // conversion to another data type + // template operator Point_<_T>() const; + // conversion to the old-style C structures + // operator Vec() const; + + // checks whether the point is inside the specified rectangle + // bool inside(const Rect_& r) const; + T x; //!< x coordinate of the point + T y; //!< y coordinate of the point +}; + +template +void Array2PointVec(const T*& box, const size_t box_size, + std::vector>& vec); + +template +void Array2Poly(const T*& box, const size_t box_size, gpc::gpc_polygon& poly); + +template +void PointVec2Poly(const std::vector>& vec, gpc::gpc_polygon& poly); + +template +void Poly2PointVec(const gpc::gpc_vertex_list& contour, + std::vector>& vec); + +template +T GetContourArea(std::vector>& vec); + +template +T PolyArea(const T* box, const size_t box_size, const bool normalized); + +template +T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size, + const bool normalized); +} // namespace operators +} // namespace paddle + +#include "paddle/fluid/operators/detection/poly_util.cc" + +#endif // POLY_UTIL_H_ diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cc b/paddle/fluid/operators/detection/polygon_box_transform_op.cc index 
568d50d457..4b3bc2edb5 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cc +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc @@ -41,9 +41,9 @@ class PolygonBoxTransformCPUKernel : public framework::OpKernel { for (int id_w = 0; id_w < width; ++id_w) { id = id_n * height * width + width * id_h + id_w; if (id_n % 2 == 0) { - out_data[id] = id_w - in_data[id]; + out_data[id] = id_w * 4 - in_data[id]; } else { - out_data[id] = id_h - in_data[id]; + out_data[id] = id_h * 4 - in_data[id]; } } } diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu index 6187ac6622..e1eaf084a3 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu @@ -32,9 +32,9 @@ __global__ void PolygonBoxTransformKernel(const int n, const int h, const int w, if (id_n < n && id_h < h && id_w < w) { int id = id_n * h * w + w * id_h + id_w; if (id_n % 2 == 0) { - output[id] = id_w - input[id]; + output[id] = id_w * 4 - input[id]; } else { - output[id] = id_h - input[id]; + output[id] = id_h * 4 - input[id]; } } } diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 56734b81e8..21db93958a 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -20,7 +20,7 @@ if(WITH_GRPC) DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) - cc_test(varhandle_test SRCS varhandle_test.cc) + cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) return() endif() diff --git a/paddle/fluid/operators/distributed/grpc_client.cc 
b/paddle/fluid/operators/distributed/grpc_client.cc index 13682b78f0..f5d5627815 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/distributed/grpc_client.h" - #include - #include #include "glog/logging.h" // For VLOG #include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/distributed/grpc_client.h" #include "paddle/fluid/operators/distributed/grpc_serde.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/platform/profiler.h" @@ -73,10 +71,11 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); SendProcessor* s = new SendProcessor(ch); - VarHandlePtr h(new VarHandle(ep, "Send", var_name_val, p_ctx, p_scope)); + const std::string method = "SendRPC"; + VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); s->Prepare(h, time_out); - framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] { + framework::AsyncIO([var_name_val, p_scope, p_ctx, s, method, h, this] { auto* var = p_scope->FindVar(var_name_val); ::grpc::ByteBuffer req; @@ -87,10 +86,16 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, // stub context s->response_call_back_ = nullptr; + platform::RecordRPCEvent record_event(method, p_ctx); + auto call = s->stub_g_.PrepareUnaryCall( s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_); call->StartCall(); call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } }); req_count_++; @@ -122,10 +127,11 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, const framework::Scope* p_scope = 
&scope; const auto ch = GetChannel(ep_val); GetProcessor* s = new GetProcessor(ch); - VarHandlePtr h(new VarHandle(ep, "Get", var_name_val, p_ctx, p_scope)); + const std::string method = "GetRPC"; + VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); s->Prepare(h, time_out); - framework::AsyncIO([var_name_val, s, this] { + framework::AsyncIO([var_name_val, s, method, p_ctx, h, this] { // prepare input sendrecv::VariableMessage req; req.set_varname(var_name_val); @@ -137,10 +143,16 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, // stub context s->response_call_back_ = ProcGetResponse; + platform::RecordRPCEvent record_event(method, p_ctx); + auto call = s->stub_g_.PrepareUnaryCall( s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_); call->StartCall(); call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } }); req_count_++; @@ -161,12 +173,14 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); GetProcessor* s = new GetProcessor(ch); - VarHandlePtr h( - new VarHandle(ep, "Prefetch", out_var_name_val, p_ctx, p_scope)); + + const std::string method = "PrefetchRPC"; + + VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); s->Prepare(h, time_out); framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - s, this] { + s, method, h, this] { auto* var = p_scope->FindVar(in_var_name_val); ::grpc::ByteBuffer req; @@ -177,11 +191,17 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, // stub context s->response_call_back_ = ProcGetResponse; + platform::RecordRPCEvent record_event(method, p_ctx); + auto call = s->stub_g_.PrepareUnaryCall( s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, &cq_); call->StartCall(); call->Finish(&s->reply_, &s->status_, static_cast(s)); + + if 
(UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } }); req_count_++; @@ -193,15 +213,24 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - VarHandlePtr h(new VarHandle(ep, "BatchBarrier", BATCH_BARRIER_MESSAGE, - nullptr, nullptr)); + const std::string method = "BatchBarrierRPC"; + VarHandlePtr h( + new VarHandle(ep, method, BATCH_BARRIER_MESSAGE, nullptr, nullptr)); s->Prepare(h, time_out); sendrecv::VariableMessage req; req.set_varname(BATCH_BARRIER_MESSAGE); + + platform::RecordRPCEvent record_event(method, nullptr); + auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + return h; } @@ -209,15 +238,24 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) { const auto ch = GetChannel(ep); FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); - VarHandlePtr h(new VarHandle(ep, "FetchBarrier", FETCH_BARRIER_MESSAGE, - nullptr, nullptr)); + const std::string method = "FetchBarrierRPC"; + VarHandlePtr h( + new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); s->Prepare(h, time_out); sendrecv::VariableMessage req; req.set_varname(FETCH_BARRIER_MESSAGE); + + platform::RecordRPCEvent record_event(method, nullptr); + auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + return h; } @@ -226,15 +264,23 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - VarHandlePtr h( - new VarHandle(ep, "SendComplete", COMPLETE_MESSAGE, nullptr, nullptr)); + const std::string method = 
"SendCompleteRPC"; + VarHandlePtr h(new VarHandle(ep, method, COMPLETE_MESSAGE, nullptr, nullptr)); s->Prepare(h, time_out); sendrecv::VariableMessage req; req.set_varname(COMPLETE_MESSAGE); + + platform::RecordRPCEvent record_event(method, nullptr); + auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + return h; } @@ -244,17 +290,27 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, const auto ch = GetChannel(ep); CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); - VarHandlePtr h(new VarHandle(ep, "CheckPointNotify", CHECKPOINT_SAVE_MESSAGE, - nullptr, nullptr)); + + const std::string method = "CheckPointNotifyRPC"; + + VarHandlePtr h( + new VarHandle(ep, method, CHECKPOINT_SAVE_MESSAGE, nullptr, nullptr)); s->Prepare(h, time_out); sendrecv::VariableMessage req; req.set_varname(CHECKPOINT_SAVE_MESSAGE); req.set_out_varname(dir); + platform::RecordRPCEvent record_event(method, nullptr); + auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + return h; } @@ -273,12 +329,16 @@ void GRPCClient::Proceed() { BaseProcessor* c = static_cast(tag); GPR_ASSERT(ok); PADDLE_ENFORCE(c); + if (c->status_.ok()) { VLOG(3) << c->GetVarHandlePtr()->String() << " process"; c->Process(); } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) { + // FIXME(gongwb): parse error_details? 
LOG(ERROR) << c->GetVarHandlePtr()->String() - << " meets grpc error:" << c->status_.error_message(); + << " meets grpc error, error_code:" << c->status_.error_code() + << " error_message:" << c->status_.error_message() + << " error_details:" << c->status_.error_details(); { std::lock_guard lk(sync_mutex_); ok_ = false; @@ -286,7 +346,10 @@ void GRPCClient::Proceed() { c->Finish(false); } else { LOG(FATAL) << c->GetVarHandlePtr()->String() - << " meets grpc error:" << c->status_.error_message(); + << " meets grpc error, error_code:" << c->status_.error_code() + << " error_message:" << c->status_.error_message() + << " error_details:" << c->status_.error_details(); + c->Finish(false); } diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc index 3f8796713a..bac098b892 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc_serde.cc @@ -36,6 +36,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, const platform::DeviceContext& ctx, ::grpc::ByteBuffer* msg, const std::string& out_name) { + platform::RecordRPCEvent record_event("serial", &ctx); // Default DestroyCallback does nothing, When using GPU // the CPU buffer need to be freed. 
DestroyCallback destroy_callback = [](void* backing) {}; @@ -147,6 +148,7 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, const platform::DeviceContext& ctx, const framework::Scope* scope, framework::Variable** var) { + platform::RecordRPCEvent record_event("deserial", &ctx); operators::distributed::GRPCVariableResponse resp(scope, &ctx); PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!"); *var = resp.GetVar(); diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 2826b82117..e04a68717b 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -70,6 +70,12 @@ class FillConstantOp : public framework::OperatorBase { } }; +class FillConstantOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override {} +}; + class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -102,4 +108,5 @@ Fill up a variable with specified constant value. 
namespace ops = paddle::operators; REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantInferShape, ops::FillConstantOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + ops::FillConstantOpVarTypeInference); diff --git a/paddle/fluid/operators/ftrl_op.cc b/paddle/fluid/operators/ftrl_op.cc index 70ba25c213..b77e12d650 100644 --- a/paddle/fluid/operators/ftrl_op.cc +++ b/paddle/fluid/operators/ftrl_op.cc @@ -34,6 +34,16 @@ class FTRLOp : public framework::OperatorWithKernel { "Input(Grad) of FTRL should not be null."); PADDLE_ENFORCE(ctx->HasInput("LearningRate"), "Input(LearningRate) of FTRL should not be null."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Param").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Grad").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front()); PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(ParamOut) of FTRL should not be null."); diff --git a/paddle/fluid/operators/ftrl_op.h b/paddle/fluid/operators/ftrl_op.h index 6f821e7e99..8f812c9a03 100644 --- a/paddle/fluid/operators/ftrl_op.h +++ b/paddle/fluid/operators/ftrl_op.h @@ -28,6 +28,17 @@ template class FTRLOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + const auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE(grad_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + 
ctx.Inputs("Grad").front(), grad_var->Type().name()); + auto* param_out = ctx.Output("ParamOut"); auto* sq_accum_out = ctx.Output("SquaredAccumOut"); auto* lin_accum_out = ctx.Output("LinearAccumOut"); diff --git a/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc index 0b917a4036..fdc9cb4888 100644 --- a/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc @@ -93,11 +93,7 @@ void FusedEmbeddingFCLSTMOp::InferShape( ctx->SetOutputDim("Cell", out_dims); ctx->ShareLoD("Ids", "Hidden"); ctx->ShareLoD("Ids", "Cell"); - int xx_width; - if (ctx->Attrs().Get("use_seq")) { - xx_width = wh_dims[1]; - } else { - xx_width = x_dims[1] > wh_dims[1] ? wh_dims[1] : x_dims[1]; + if (!ctx->Attrs().Get("use_seq")) { PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"), "Assert only one Output(BatchedInput) of LSTM."); PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"), @@ -112,7 +108,7 @@ void FusedEmbeddingFCLSTMOp::InferShape( ctx->SetOutputDim("BatchedHidden", out_dims); ctx->SetOutputDim("BatchedCell", out_dims); } - ctx->SetOutputDim("XX", {x_dims[0], xx_width}); + ctx->SetOutputDim("XX", {x_dims[0], wh_dims[1]}); ctx->ShareLoD("Ids", "XX"); } @@ -435,8 +431,6 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { INIT_VEC_FUNC INIT_BASE_INPUT_DATAS - // std::cout << "===> Batch Compute" << std::endl; - auto* reordered_h0 = ctx.Output("ReorderedH0"); auto* reordered_c0 = ctx.Output("ReorderedC0"); auto* batched_input = ctx.Output("BatchedInput"); diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index ae1f6d8e48..067e6a3e7c 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -15,11 +15,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fusion_lstm_op.h" #include #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/cpu_lstm_compute.h" -#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/jit_kernel.h" #include "paddle/fluid/operators/math/sequence2batch.h" -#include "paddle/fluid/platform/cpu_info.h" namespace paddle { namespace operators { @@ -219,121 +217,55 @@ This operator fuse the X into LSTM, more details can refer to LSTM op. template class FuisonLSTMKernel : public framework::OpKernel { public: -#define INIT_VEC_FUNC \ - std::function act_gate, act_cell, act_cand; \ - auto& act_gate_str = ctx.Attr("gate_activation"); \ - auto& act_cell_str = ctx.Attr("cell_activation"); \ - auto& act_cand_str = ctx.Attr("candidate_activation"); \ - if (platform::jit::MayIUse(platform::jit::avx)) { \ - math::VecActivations act_functor; \ - act_gate = act_functor(act_gate_str); \ - act_cell = act_functor(act_cell_str); \ - act_cand = act_functor(act_cand_str); \ - } else { \ - math::VecActivations act_functor; \ - act_gate = act_functor(act_gate_str); \ - act_cell = act_functor(act_cell_str); \ - act_cand = act_functor(act_cand_str); \ - } - -#define INIT_BASE_INPUT_OUTPUT \ - auto* x = ctx.Input("X"); \ - auto* h0 = ctx.Input("H0"); \ - auto* c0 = ctx.Input("C0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* wh = ctx.Input("WeightH"); \ - auto* bias = ctx.Input("Bias"); \ - auto* xx = ctx.Output("XX"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - auto* cell_out = ctx.Output("Cell"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - bool use_peepholes = ctx.Attr("use_peepholes"); - -#define INIT_BASE_SIZES \ - auto x_dims = x->dims(); /* T x M*/ \ - auto wh_dims = wh->dims(); /* D x 4D*/ \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const int D3 = D * 3; \ - const int D4 = wh_dims[1]; - -#define 
INIT_BASE_INPUT_DATAS \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wc_data = bias->data() + D4; \ - /* for peephole only*/ \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - auto* checked_cell = ctx.Output("CheckedCell"); \ - checked_cell_data = checked_cell->mutable_data(place); \ - } - -/// Compute LSTM +#define INIT_BASE_DEFINES \ + using DeviceContext = paddle::platform::CPUDeviceContext; \ + auto* x = ctx.Input("X"); \ + auto* h0 = ctx.Input("H0"); \ + auto* c0 = ctx.Input("C0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* wh = ctx.Input("WeightH"); \ + auto* bias = ctx.Input("Bias"); \ + auto* xx = ctx.Output("XX"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + auto* cell_out = ctx.Output("Cell"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + bool use_peepholes = ctx.Attr("use_peepholes"); \ + auto x_dims = x->dims(); /* T x M*/ \ + auto wh_dims = wh->dims(); /* D x 4D*/ \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D4 = wh_dims[1] + +#define INIT_OTHER_DEFINES \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wp_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ + } \ + const auto& ker = \ + math::jitkernel::KernelPool::Instance() \ + .template Get, const std::string&, \ + const std::string&, const std::string&>( \ + ctx.Attr("gate_activation"), \ + ctx.Attr("candidate_activation"), \ + ctx.Attr("cell_activation"), D, use_peepholes) + +// Wh GEMM #define GEMM_WH_ADDON(bs, prev, out) \ 
blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast(1), prev, D, \ wh_data, D4, static_cast(1), out, D4) -#define GET_Ct(ct_1, gates, ct) \ - /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ - act_cand(D, gates, gates); \ - blas.VMUL(D, gates, gates + D, gates + D); \ - blas.VMUL(D, ct_1, gates + D2, gates + D2); \ - blas.VADD(D, gates + D, gates + D2, ct) - -#define GET_Ht(ct, gates, ht) \ - /* H_t = act_cell(C_t) * ogated */ \ - act_cell(D, ct, gates + D2); \ - blas.VMUL(D, gates + D2, gates + D3, ht) - -#define GET_Ct_NOH0C0(gates, ct) \ - /* C_t = igated * cgated*/ \ - act_gate(D, gates + D, gates + D); \ - act_cand(D, gates, gates); \ - blas.VMUL(D, gates, gates + D, ct) - -#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \ - GET_Ct_NOH0C0(gates, ct); \ - act_gate(D, gates + D3, gates + D3); \ - GET_Ht(ct, gates, ht) - -#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \ - GET_Ct_NOH0C0(gates, ct); \ - /* get outgated, put W_oc * C_t on igated */ \ - blas.VMUL(D, wc_data + D2, ct, gates + D); \ - blas.VADD(D, gates + D, gates + D3, gates + D3); \ - act_gate(D, gates + D3, gates + D3); \ - GET_Ht(ct, gates, ht) - -#define COMPUTE_CtHt(gates, ct_1, ct, ht) \ - act_gate(D3, gates + D, gates + D); \ - GET_Ct(ct_1, gates, ct); \ - GET_Ht(ct, gates, ht) - -#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht) \ - /* get fgated and igated*/ \ - blas.VMUL(D, wc_data, ct_1, checked_cell_data); \ - blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \ - blas.VADD(D2, checked_cell_data, gates + D, gates + D); \ - act_gate(D2, gates + D, gates + D); \ - GET_Ct(ct_1, gates, ct); \ - /* get ogated*/ \ - blas.VMUL(D, wc_data + D2, ct, gates + D); \ - blas.VADD(D, gates + D, gates + D3, gates + D3); \ - act_gate(D, gates + D3, gates + D3); \ - GET_Ht(ct, gates, ht) - void SeqCompute(const framework::ExecutionContext& ctx) const { - using DeviceContext = paddle::platform::CPUDeviceContext; - INIT_BASE_INPUT_OUTPUT - INIT_BASE_SIZES - INIT_VEC_FUNC - 
INIT_BASE_INPUT_DATAS - + INIT_BASE_DEFINES; + INIT_OTHER_DEFINES; auto x_lod = x->lod(); const int total_T = x_dims[0]; const int N = x_lod[0].size() - 1; @@ -357,89 +289,47 @@ class FuisonLSTMKernel : public framework::OpKernel { gate_offset = -D; } -#define MOVE_ONE_STEP \ - prev_h_data = h_out_data; \ - prev_c_data = c_out_data; \ - xx_data = xx_data + xx_offset; \ - h_out_data = h_out_data + gate_offset; \ - c_out_data = c_out_data + gate_offset - -#define PROCESS_H0C0_DEFINES \ - int bid = is_reverse ? N - 1 - i : i; \ - int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; \ - const T* prev_c_data = nullptr; \ - const T* prev_h_data = nullptr; \ - int tstart = 0 - -#define PROCESS_H0C0_PEEPHOLE \ - PROCESS_H0C0_DEFINES; \ - if (h0_data) { \ - prev_h_data = h0_data + bid * D; \ - prev_c_data = c0_data + bid * D; \ - } else { \ - COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \ - MOVE_ONE_STEP; \ - tstart = 1; \ - } - -#define PROCESS_H0C0 \ - PROCESS_H0C0_DEFINES; \ - if (h0_data) { \ - prev_h_data = h0_data + bid * D; \ - prev_c_data = c0_data + bid * D; \ - } else { \ - COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \ - MOVE_ONE_STEP; \ - tstart = 1; \ - } - - if (use_peepholes) { - for (int i = 0; i < N; ++i) { - PROCESS_H0C0_PEEPHOLE - for (int step = tstart; step < seq_len; ++step) { - GEMM_WH_ADDON(1, prev_h_data, xx_data); - COMPUTE_CtHt_PEEPHOLE(xx_data, prev_c_data, c_out_data, h_out_data); - MOVE_ONE_STEP; - } - } - } else { - // TODO(TJ): unly workaround, clean me - std::function compute_ctht; - if (platform::jit::MayIUse(platform::jit::avx) && - act_gate_str == "sigmoid" && act_cand_str == "tanh" && - act_cell_str == "tanh" && D == 8) { - compute_ctht = math::lstm_compute_ctht; + for (int i = 0; i < N; ++i) { + int bid = is_reverse ? 
N - 1 - i : i; + int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; + const T* prev_c_data = nullptr; + const T* prev_h_data = nullptr; + int tstart = 0; + if (h0_data) { + prev_h_data = h0_data + bid * D; + prev_c_data = c0_data + bid * D; } else { - compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) { - COMPUTE_CtHt(gates, ct_1, ct, ht); - }; + ker->ComputeC1H1(xx_data, c_out_data, h_out_data, wp_data); + tstart = 1; + // move one step + prev_h_data = h_out_data; + prev_c_data = c_out_data; + xx_data = xx_data + xx_offset; + h_out_data = h_out_data + gate_offset; + c_out_data = c_out_data + gate_offset; } - for (int i = 0; i < N; ++i) { - PROCESS_H0C0 - for (int step = tstart; step < seq_len; ++step) { - GEMM_WH_ADDON(1, prev_h_data, xx_data); - compute_ctht(xx_data, prev_c_data, c_out_data, h_out_data); - MOVE_ONE_STEP; - } + for (int step = tstart; step < seq_len; ++step) { + GEMM_WH_ADDON(1, prev_h_data, xx_data); + ker->ComputeCtHt(xx_data, prev_c_data, c_out_data, h_out_data, wp_data, + checked_cell_data); + // move one step + prev_h_data = h_out_data; + prev_c_data = c_out_data; + xx_data = xx_data + xx_offset; + h_out_data = h_out_data + gate_offset; + c_out_data = c_out_data + gate_offset; } } -#undef PROCESS_H0C0_DEFINES -#undef PROCESS_H0C0_PEEPHOLE -#undef PROCESS_H0C0 -#undef MOVE_ONE_STEP } void BatchCompute(const framework::ExecutionContext& ctx) const { - using DeviceContext = platform::CPUDeviceContext; - INIT_BASE_INPUT_OUTPUT - INIT_BASE_SIZES + INIT_BASE_DEFINES; if (x->lod()[0].size() == 2) { xx->Resize({x_dims[0], D4}); SeqCompute(ctx); return; } - INIT_VEC_FUNC - INIT_BASE_INPUT_DATAS + INIT_OTHER_DEFINES; auto* reordered_h0 = ctx.Output("ReorderedH0"); auto* reordered_c0 = ctx.Output("ReorderedC0"); @@ -487,8 +377,8 @@ class FuisonLSTMKernel : public framework::OpKernel { prev_c_data = reordered_c0_data; - size_t sz = sizeof(T) * D; + size_t sz = D; /* blas.VCOPY takes an element count, not a byte count */ for (int i = 0; i < max_bs; ++i) { - std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz); - 
std::memcpy(reordered_c0_data, c0_data + seq_order[i] * D, sz); + blas.VCOPY(sz, h0_data + seq_order[i] * D, reordered_h0_data); + blas.VCOPY(sz, c0_data + seq_order[i] * D, reordered_c0_data); reordered_h0_data += D; reordered_c0_data += D; } @@ -498,13 +388,7 @@ class FuisonLSTMKernel : public framework::OpKernel { T* cur_h_out_data = batched_h_out_data; T* cur_c_out_data = batched_c_out_data; for (int i = 0; i < max_bs; ++i) { - GET_Ct_NOH0C0(cur_in_data, cur_c_out_data); - if (use_peepholes) { - blas.VMUL(D, wc_data + D2, cur_c_out_data, cur_in_data + D); - blas.VADD(D, cur_in_data + D, cur_in_data + D3, cur_in_data + D3); - } - act_gate(D, cur_in_data + D3, cur_in_data + D3); - GET_Ht(cur_c_out_data, cur_in_data, cur_h_out_data); + ker->ComputeC1H1(cur_in_data, cur_c_out_data, cur_h_out_data, wp_data); cur_in_data += D4; cur_c_out_data += D; cur_h_out_data += D; @@ -513,71 +397,37 @@ class FuisonLSTMKernel : public framework::OpKernel { prev_h_data = batched_h_out_data; prev_c_data = batched_c_out_data; } + + // compute kernel part const auto& batch_starts = batched_lod[0]; const int max_seq_len = batch_starts.size() - 1; const int offset = tstart * max_bs * D; batched_input_data = batched_input_data + offset * 4; batched_h_out_data = batched_h_out_data + offset; batched_c_out_data = batched_c_out_data + offset; - -#define DEFINE_CUR \ - T* cur_in_data = batched_input_data; \ - T* cur_prev_c_data = prev_c_data; \ - T* cur_c_out_data = batched_c_out_data; \ - T* cur_h_out_data = batched_h_out_data - -#define MOVE_ONE_BATCH \ - cur_in_data += D4; \ - cur_prev_c_data += D; \ - cur_c_out_data += D; \ - cur_h_out_data += D - -#define MOVE_ONE_STEP \ - prev_c_data = batched_c_out_data; \ - prev_h_data = batched_h_out_data; \ - batched_c_out_data = cur_c_out_data; \ - batched_h_out_data = cur_h_out_data; \ - batched_input_data = cur_in_data - - if (use_peepholes) { - for (int step = tstart; step < max_seq_len; ++step) { - const int cur_bs = batch_starts[step + 1] - 
batch_starts[step]; - GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); - DEFINE_CUR; - for (int i = 0; i < cur_bs; ++i) { - COMPUTE_CtHt_PEEPHOLE(cur_in_data, cur_prev_c_data, cur_c_out_data, - cur_h_out_data); - MOVE_ONE_BATCH; - } - MOVE_ONE_STEP; - } - } else { - // TODO(TJ): unly workaround, clean me - std::function compute_ctht; - if (platform::jit::MayIUse(platform::jit::avx) && - act_gate_str == "sigmoid" && act_cand_str == "tanh" && - act_cell_str == "tanh" && D == 8) { - compute_ctht = math::lstm_compute_ctht; - } else { - compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) { - COMPUTE_CtHt(gates, ct_1, ct, ht); - }; - } - for (int step = tstart; step < max_seq_len; ++step) { - const int cur_bs = batch_starts[step + 1] - batch_starts[step]; - GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); - DEFINE_CUR; - for (int i = 0; i < cur_bs; ++i) { - compute_ctht(cur_in_data, cur_prev_c_data, cur_c_out_data, - cur_h_out_data); - MOVE_ONE_BATCH; - } - MOVE_ONE_STEP; + for (int step = tstart; step < max_seq_len; ++step) { + const int cur_bs = batch_starts[step + 1] - batch_starts[step]; + GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); + T* cur_in_data = batched_input_data; + T* cur_prev_c_data = prev_c_data; + T* cur_c_out_data = batched_c_out_data; + T* cur_h_out_data = batched_h_out_data; + for (int i = 0; i < cur_bs; ++i) { + ker->ComputeCtHt(cur_in_data, cur_prev_c_data, cur_c_out_data, + cur_h_out_data, wp_data, checked_cell_data); + // move one batch + cur_in_data += D4; + cur_prev_c_data += D; + cur_c_out_data += D; + cur_h_out_data += D; } + // move one step + prev_c_data = batched_c_out_data; + prev_h_data = batched_h_out_data; + batched_c_out_data = cur_c_out_data; + batched_h_out_data = cur_h_out_data; + batched_input_data = cur_in_data; } -#undef MOVE_ONE_STEP -#undef MOVE_ONE_BATCH -#undef DEFINE_CUR math::Batch2LoDTensorFunctor to_seq; batched_h_out->set_lod(batched_lod); @@ -594,18 +444,9 @@ class FuisonLSTMKernel : public 
framework::OpKernel { } } -#undef COMPUTE_CtHt_PEEPHOLE -#undef COMPUTE_CtHt -#undef GET_Ct_NOH0C0 -#undef COMPUTE_CtHt_NOH0C0 -#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0 -#undef GET_Ht -#undef GET_Ct #undef GEMM_WH_ADDON -#undef INIT_BASE_INPUT_DATAS -#undef INIT_BASE_SIZES -#undef INIT_BASE_INPUT_OUTPUT -#undef INIT_VEC_FUNC +#undef INIT_OTHER_DEFINES +#undef INIT_BASE_DEFINES }; } // namespace operators diff --git a/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.cc new file mode 100644 index 0000000000..b0910dc19e --- /dev/null +++ b/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.cc @@ -0,0 +1,229 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.h" +#include // for min, max +#include +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/fc_compute.h" + +namespace paddle { +namespace operators { + +void FusionSeqConvEltAddReluOp::InferShape( + framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of FusionSeqConvEltAddReluOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("Filter"), + "Input(Filter) of FusionSeqConvEltAddReluOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("Bias"), + "Input(Bias) of FusionSeqConvEltAddReluOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FusionSeqConvEltAddReluOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("ColMat"), + "Output(ColMat) of FusionSeqConvEltAddReluOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto w_dims = ctx->GetInputDim("Filter"); + int context_length = ctx->Attrs().Get("contextLength"); + PADDLE_ENFORCE( + ctx->Attrs().Get("contextStride") == 1, + "Currently, FusionSeqConvEltAddReluOp only supports contextStride=1."); + PADDLE_ENFORCE(x_dims.size() == 2 && w_dims.size() == 2, + "Input(X, Filter) should be 2-D tensor."); + PADDLE_ENFORCE(x_dims.size() == 2 && w_dims.size() == 2, + "Input(X, Filter) should be 2-D tensor."); + PADDLE_ENFORCE(w_dims[0] == context_length * x_dims[1], + "Filter's height should be context_length * " + "input_hidden_size ."); + PADDLE_ENFORCE_GT(context_length + ctx->Attrs().Get("contextStart"), 0, + "contextStart size should be smaller than contextLength."); + + ctx->SetOutputDim("Out", {x_dims[0], w_dims[1]}); + ctx->SetOutputDim("ColMat", {x_dims[0], w_dims[0]}); + ctx->ShareLoD("X", "Out"); +} + +framework::OpKernelType FusionSeqConvEltAddReluOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), 
+ ctx.device_context()); +} + +void FusionSeqConvEltAddReluOpMaker::Make() { + AddInput("X", + "(LoDTensor) the input is a LodTensor, which support " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T X M), where T is the " + "total time steps in this mini-batch, M is the dim size of x."); + // PaddingData only support false yet, should be ensured at pass. + AddInput("Filter", + "(Tensor) same as the input(Filter) of sequence conv op is a " + "learnable parameter. " + "This is a tensor with shape (K, N), where K is the " + "context_length * dim size of x, N is the output feature size."); + AddInput("Bias", + "(Tensor) the learnable weights. shape (1, N), where N is the " + "output feature size"); + AddOutput( + "Out", + "(LoDTensor) the output(Out) is a LodTensor, which support " + "variable-time length output sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T, N), where, T is the " + "total time steps in this mini-batch, N is the output feature size."); + AddOutput("ColMat", + "(Tensor) (T, K), where T is the " + "total time steps in this mini-batch, K is height of Filter") + .AsIntermediate(); + AddAttr("contextLength", + "(int) the contextLength of FusionSeqConvEltAddReluOp is the " + "height of the convolution kernel.") + .GreaterThan(0); + AddAttr("contextStart", + "(int, default:0) the contextStart of FusionSeqConvEltAddReluOp " + "represents the beginning of the convolution of the number of " + "rows of sequence, which can be negative. The negative number " + "means to pad contextStart time-steps of zeros or learnable " + "parameters at the beginning of each instance. The positive " + "number means to skip contextStart time-steps of each " + "instance.") + .SetDefault(0); + AddAttr( + "contextStride", + "(int, default:1) the contextStride of FusionSeqConvEltAddReluOp " + "represents the stride length of convolution kernel. 
" + "Currently, FusionSeqConvEltAddReluOp only supports" + "contextStride=1.") + .SetDefault(1) + .GreaterThan(0); + AddComment(R"DOC( +Fusion Sequence Conv and ElementwiseAdd Operator. +)DOC"); +} + +template +class FusionSeqConvEltAddReluKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using DeviceContext = paddle::platform::CPUDeviceContext; + auto* x = ctx.Input("X"); + auto* w = ctx.Input("Filter"); + auto* b = ctx.Input("Bias"); + auto* y = ctx.Output("Out"); + auto* col = ctx.Output("ColMat"); + + auto x_lod = x->lod(); + auto x_dims = x->dims(); + auto w_dims = w->dims(); + PADDLE_ENFORCE_EQ(b->numel(), w_dims[1], + "bias size should be equal to output feature size."); + PADDLE_ENFORCE_EQ(x_lod.size(), 1UL, + "Only support one level sequence now."); + + const T* x_data = x->data(); + const T* w_data = w->data(); + const T* b_data = b->data(); + T* y_data = y->mutable_data(ctx.GetPlace()); + T* col_data = col->mutable_data(ctx.GetPlace()); + + int context_start = ctx.Attr("contextStart"); + int context_length = ctx.Attr("contextLength"); + int up_pad = std::max(0, -context_start); + int down_pad = std::max(0, context_start + context_length - 1); + // im2col + int src_mat_w = static_cast(x_dims[1]); + int src_mat_w_sz = src_mat_w * sizeof(T); + int col_mat_w = static_cast(w_dims[0]); + int col_mat_w_sz = col_mat_w * sizeof(T); + for (int i = 0; i < static_cast(x_lod[0].size()) - 1; ++i) { + int st = x_lod[0][i]; + int ed = x_lod[0][i + 1]; + const T* src_data = x_data + st * src_mat_w; + T* dst_data = col_data + st * col_mat_w; + int seq_len = ed - st; + if (seq_len > up_pad + down_pad) { + // zero all up_pad and fill data + std::memset(dst_data, 0, up_pad * col_mat_w_sz); + dst_data = dst_data + up_pad * src_mat_w; + int copy_size = col_mat_w_sz - up_pad * src_mat_w_sz; + for (int j = 0; j < up_pad; ++j) { + // blas.VCOPY? 
+ std::memcpy(dst_data, src_data, copy_size); + dst_data += (col_mat_w - src_mat_w); + copy_size += src_mat_w_sz; + } + // fill data + for (int j = 0; j < seq_len - up_pad - down_pad; ++j) { + std::memcpy(dst_data, src_data, copy_size); + dst_data += col_mat_w; + src_data += src_mat_w; + } + // zero all down_pad and fill data + std::memset(dst_data, 0, down_pad * col_mat_w_sz); + copy_size -= src_mat_w_sz; + for (int j = 0; j < down_pad; ++j) { + std::memcpy(dst_data, src_data, copy_size); + dst_data += col_mat_w; + src_data += src_mat_w; + copy_size -= src_mat_w_sz; + } + } else { + PADDLE_ENFORCE_GE(context_length, up_pad + down_pad + 1); + std::memset(dst_data, 0, seq_len * col_mat_w_sz); + dst_data = dst_data + up_pad * src_mat_w; + int zero_sz = up_pad * src_mat_w_sz; + int cur_src_sz = seq_len * src_mat_w_sz; + for (int j = 0; j < std::min(up_pad, seq_len); ++j) { + int copy_size = std::min(cur_src_sz, col_mat_w_sz - zero_sz); + std::memcpy(dst_data, src_data, copy_size); + dst_data += (col_mat_w - src_mat_w); + zero_sz -= src_mat_w_sz; + } + // from bottom + dst_data = col_data + ed * col_mat_w; + src_data = x_data + st * src_mat_w; + zero_sz = down_pad * src_mat_w_sz; + for (int j = 1; j <= std::min(down_pad, seq_len); ++j) { + int copy_size = std::min(cur_src_sz, col_mat_w_sz - zero_sz); + std::memcpy(dst_data - (zero_sz + copy_size) / sizeof(T), + src_data + std::max(seq_len - j - up_pad, 0) * src_mat_w, + copy_size); + dst_data -= col_mat_w; + zero_sz -= src_mat_w_sz; + } + } + } + auto& dev_ctx = ctx.template device_context(); + auto blas = math::GetBlas(dev_ctx); + math::FCCompute(blas, x_dims[0], w_dims[1], w_dims[0], + col_data, w_data, y_data, b_data, true); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fusion_seqconv_eltadd_relu, ops::FusionSeqConvEltAddReluOp, + ops::FusionSeqConvEltAddReluOpMaker, + paddle::framework::DefaultGradOpDescMaker); + 
+REGISTER_OP_CPU_KERNEL(fusion_seqconv_eltadd_relu, + ops::FusionSeqConvEltAddReluKernel, + ops::FusionSeqConvEltAddReluKernel); diff --git a/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.h b/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.h new file mode 100644 index 0000000000..028d79dc2a --- /dev/null +++ b/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +class FusionSeqConvEltAddReluOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusionSeqConvEltAddReluOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc index 0cd3d3887c..8d2f055d53 100644 --- a/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc @@ 
-136,9 +136,9 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel { // since infershape can not get lod info PADDLE_ENFORCE_EQ(ref_lod.size(), 1UL, "Only support input lod size is 1."); PADDLE_ENFORCE_EQ(in1_lod.size(), 1UL, "Only support input lod size is 1."); - PADDLE_ENFORCE_EQ(in1_lod[0].size() - 1, N, + PADDLE_ENFORCE_EQ(static_cast(in1_lod[0].size() - 1), N, "Batch size of all inputs should be equal."); - PADDLE_ENFORCE_EQ(in1_lod[0][N], N, + PADDLE_ENFORCE_EQ(static_cast(in1_lod[0][N]), N, "Seq_length of other inputs should be 1."); PADDLE_ENFORCE_EQ(in1_dims[0], N, "input height should be batch size."); for (size_t i = 2; i < ins.size(); ++i) { diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h index d15cb55647..d72e07d76c 100644 --- a/paddle/fluid/operators/gather.h +++ b/paddle/fluid/operators/gather.h @@ -39,11 +39,9 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src, PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); // check index of shape 1-D PADDLE_ENFORCE(index.dims().size() == 1); - int index_size = index.dims()[0]; + int64_t index_size = index.dims()[0]; auto src_dims = src.dims(); - framework::DDim output_dims(src_dims); - output_dims[0] = index_size; const T* p_src = src.data(); const int* p_index = index.data(); @@ -55,7 +53,7 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src, const size_t slice_bytes = slice_size * sizeof(T); - for (int i = 0; i < index_size; ++i) { + for (int64_t i = 0; i < index_size; ++i) { int index_ = p_index[i]; memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes); } diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index 248c779356..7b42efd623 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -60,7 +60,7 @@ class OverflowOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) 1-dim tensor, contains a bool 
scalar. The output " "tensor of overflow operator."); AddComment(string::Sprintf(R"DOC( -Overflow operator. +Overflow %s operator. $$Out = any(X)$$ @@ -69,6 +69,8 @@ Out = Inf if any X contains Inf, Out = Nan if any X contains Nan, Out = 0 if no Inf/Nan detected. If X contains both Inf/Nan, it will return the first indicator it meeted. + +%s )DOC", GetName(), GetComments())); } diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index dc008d1697..26f09c46c2 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -66,7 +66,7 @@ static void ParallelExecuteBlocks( << "pointer: " << prepared[run_block].get(); executor->RunPreparedContext(prepared[run_block].get(), scope); } catch (const std::exception &e) { - LOG(ERROR) << "run sub program error " << e.what(); + LOG(FATAL) << "run sub program:" << idx << " error " << e.what(); } })); } diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index 8eab83fcd2..e72337a3e6 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" -#include "paddle/fluid/operators/math/concat.h" +#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/port.h" @@ -79,7 +79,7 @@ struct LoDTensorToArrayFunctor : public boost::static_visitor { template template void LoDTensorToArrayFunctorImpl::apply() { - math::ConcatGradFunctor func; + math::SplitFunctor func; func(*dev_ctx_, prev_functor_->input_, prev_functor_->ref_inputs_, 0, &prev_functor_->outputs_); } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 9110135643..5d0c0b4228 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -1,10 +1,10 @@ if (NOT WIN32) -add_subdirectory(detail) + add_subdirectory(detail) endif(NOT WIN32) function(math_library TARGET) - # math_library is a function to create math library. - # The interface is the same as cc_library. + # math_library is a function to create math library. + # The interface is the same as cc_library. # But it handle split GPU/CPU code and link some common library. set(cc_srcs) set(cu_srcs) @@ -35,7 +35,7 @@ function(math_library TARGET) endfunction() # please add new math_library in alphabetical order -math_library(concat) +math_library(concat_and_split) math_library(context_project DEPS im2col math_function) math_library(cross_entropy) math_library(cos_sim_functor) @@ -43,24 +43,22 @@ math_library(depthwise_conv) math_library(im2col) if (NOT WIN32) # windows do not support avx functions yet. 
-math_library(gru_compute DEPS activation_functions math_function) -math_library(lstm_compute DEPS activation_functions) -# TODO(TJ): ugly workaround, clean me -cc_library(cpu_lstm_compute SRCS cpu_lstm_compute.cc DEPS activation_functions cblas cpu_info) + math_library(gru_compute DEPS activation_functions math_function) + math_library(lstm_compute DEPS activation_functions) endif (NOT WIN32) cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) math_library(math_function DEPS blas) math_library(maxouting) math_library(pooling) -math_library(selected_rows_functor DEPS selected_rows math_function) +math_library(selected_rows_functor DEPS selected_rows math_function blas) math_library(sequence2batch) math_library(sequence_padding) math_library(sequence_pooling DEPS math_function) math_library(sequence_scale) math_library(softmax DEPS math_function) if (NOT WIN32) -math_library(matrix_bit_code) + math_library(matrix_bit_code) endif (NOT WIN32) math_library(unpooling) math_library(vol2col) @@ -74,5 +72,9 @@ if(WITH_GPU) nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function) nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor math_function) endif() -cc_test(concat_test SRCS concat_test.cc DEPS concat) +cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) +cc_library(jit_kernel + SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_lstm.cc + DEPS cpu_info cblas) +cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/math/algorithm.h b/paddle/fluid/operators/math/algorithm.h new file mode 100644 index 0000000000..262469beea --- /dev/null +++ b/paddle/fluid/operators/math/algorithm.h @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include // for int64_t +#include + +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +template +HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) { + int64_t beg = 0, end = num - 1; + while (beg <= end) { + auto mid = ((beg + end) >> 1); + if (x[mid] == val) + return mid; + else if (x[mid] < val) + beg = mid + 1; + else + end = mid - 1; + } + return -1; +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat_and_split.cc similarity index 95% rename from paddle/fluid/operators/math/concat.cc rename to paddle/fluid/operators/math/concat_and_split.cc index 7b79f10e33..c6e17fd042 100644 --- a/paddle/fluid/operators/math/concat.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/concat.h" +#include "paddle/fluid/operators/math/concat_and_split.h" #include namespace paddle { @@ -67,7 +67,7 @@ class ConcatFunctor { * each dimension must be the same, except the axis dimension. 
*/ template -class ConcatGradFunctor { +class SplitFunctor { public: void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, @@ -111,7 +111,7 @@ class ConcatGradFunctor { }; #define DEFINE_FUNCTOR(type) \ template class ConcatFunctor; \ - template class ConcatGradFunctor; + template class SplitFunctor; FOR_ALL_TYPES(DEFINE_FUNCTOR); diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat_and_split.cu similarity index 90% rename from paddle/fluid/operators/math/concat.cu rename to paddle/fluid/operators/math/concat_and_split.cu index b59d86e661..760a065c10 100644 --- a/paddle/fluid/operators/math/concat.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/mixed_vector.h" -#include "paddle/fluid/operators/math/concat.h" +#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/float16.h" @@ -24,7 +24,7 @@ namespace operators { namespace math { template -__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, +__global__ void ConcatKernel(T** inputs, const int* input_cols, int col_size, const int output_rows, const int output_cols, T* output) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; @@ -50,7 +50,7 @@ __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, } template -__global__ void KernelConcat(T** inputs_data, const int fixed_in_col, +__global__ void ConcatKernel(T** inputs_data, const int fixed_in_col, const int out_rows, const int out_cols, T* output_data) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; @@ -67,9 +67,9 @@ __global__ void KernelConcat(T** inputs_data, const int fixed_in_col, } template -__global__ void KernelConcatGrad(const T* input_data, const int in_row, - const int in_col, const int* out_cols, - int out_cols_size, T** outputs_data) 
{ +__global__ void SplitKernel(const T* input_data, const int in_row, + const int in_col, const int* out_cols, + int out_cols_size, T** outputs_data) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int curr_segment = 0; int curr_offset = out_cols[0]; @@ -94,9 +94,9 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row, } template -__global__ void KernelConcatGrad(const T* input_data, const int in_row, - const int in_col, const int fixed_out_col, - T** outputs_data) { +__global__ void SplitKernel(const T* input_data, const int in_row, + const int in_col, const int fixed_out_col, + T** outputs_data) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { int split = tid_x / fixed_out_col; @@ -170,11 +170,11 @@ class ConcatFunctor { dim3 grid_size = dim3(grid_cols, grid_rows, 1); if (sameShape) { - KernelConcat<<>>( + ConcatKernel<<>>( dev_ins_data, in_col, out_row, out_col, output->data()); } else { const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace()); - KernelConcat<<>>( + ConcatKernel<<>>( dev_ins_data, dev_ins_col_data, static_cast(inputs_col.size()), out_row, out_col, output->data()); } @@ -189,7 +189,7 @@ class ConcatFunctor { * each dimension must be the same, except the axis dimension. 
*/ template -class ConcatGradFunctor { +class SplitFunctor { public: void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, @@ -248,11 +248,11 @@ class ConcatGradFunctor { dim3 grid_size = dim3(grid_cols, grid_rows, 1); if (sameShape) { - KernelConcatGrad<<>>( + SplitKernel<<>>( input.data(), in_row, in_col, out0_col, dev_out_gpu_data); } else { const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace()); - KernelConcatGrad<<>>( + SplitKernel<<>>( input.data(), in_row, in_col, dev_outs_col_data, static_cast(outputs_cols.size()), dev_out_gpu_data); } @@ -264,7 +264,7 @@ class ConcatGradFunctor { #define DEFINE_FUNCTOR(type) \ template class ConcatFunctor; \ - template class ConcatGradFunctor + template class SplitFunctor FOR_ALL_TYPES(DEFINE_FUNCTOR); diff --git a/paddle/fluid/operators/math/concat.h b/paddle/fluid/operators/math/concat_and_split.h similarity index 98% rename from paddle/fluid/operators/math/concat.h rename to paddle/fluid/operators/math/concat_and_split.h index 867a84fa87..3a5eddcbf4 100644 --- a/paddle/fluid/operators/math/concat.h +++ b/paddle/fluid/operators/math/concat_and_split.h @@ -54,7 +54,7 @@ class ConcatFunctor { * Output[1] = [[5,6]] */ template -class ConcatGradFunctor { +class SplitFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, const std::vector& ref_inputs, diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index a46f2d51ca..8ba9e8e8ec 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/math/concat.h" #include #include #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/concat_and_split.h" template void testConcat() { diff --git a/paddle/fluid/operators/math/cpu_lstm_compute.cc b/paddle/fluid/operators/math/cpu_lstm_compute.cc deleted file mode 100644 index e96d187933..0000000000 --- a/paddle/fluid/operators/math/cpu_lstm_compute.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/math/cpu_lstm_compute.h" - -namespace paddle { -namespace operators { -namespace math { -#ifdef __AVX__ -template <> -void lstm_compute_ctht(float* gates, const float* ct_1, float* ct, - float* ht) { - namespace act = detail::forward::avx; - // gates: W_ch, W_ih, W_fh, W_oh - __m256 c, i, f, o; - c = _mm256_loadu_ps(gates); - i = _mm256_loadu_ps(gates + 8); - f = _mm256_loadu_ps(gates + 16); - o = _mm256_loadu_ps(gates + 24); - - /* C_t = C_t-1 * fgated + cand_gated * igated*/ - c = _mm256_mul_ps(act::Tanh(c), act::Sigmoid(i)); - i = _mm256_loadu_ps(ct_1); - f = _mm256_mul_ps(i, act::Sigmoid(f)); - f = _mm256_add_ps(c, f); - _mm256_storeu_ps(ct, f); - - /* H_t = act_cell(C_t) * ogated */ - o = _mm256_mul_ps(act::Tanh(f), act::Sigmoid(o)); - _mm256_storeu_ps(ht, o); -} -#endif -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/cpu_lstm_compute.h b/paddle/fluid/operators/math/cpu_lstm_compute.h deleted file mode 100644 index 169a9e4b47..0000000000 --- a/paddle/fluid/operators/math/cpu_lstm_compute.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "paddle/fluid/operators/math/cpu_vec.h" -#include "paddle/fluid/platform/cpu_info.h" -#ifdef __AVX__ -#include -#endif - -namespace paddle { -namespace operators { -namespace math { - -// TODO(TJ): ugly workaround, clean me -template -void lstm_compute_ctht(T* gates, const T* ct_1, T* ct, T* ht) { - // gates: W_ch, W_ih, W_fh, W_oh - vec_sigmoid(24, gates + 8, gates + 8); - vec_tanh(8, gates, gates); - const T *i = gates + 8, *f = gates + 16, *o = gates + 24; - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int d = 0; d < 8; ++d) { - // C_t = C_t-1 * fgated + cand_gated * igated - ct[d] = ct_1[d] * f[d] + gates[d] * i[d]; - // H_t = act_cell(C_t) * ogated - T tmp = ct[d] * 2; - tmp = static_cast(0) - ((tmp < min) ? min : ((tmp > max) ? max : tmp)); - vec_exp(1, &tmp, &tmp); - tmp = static_cast(2) / (static_cast(1) + tmp) - static_cast(1); - ht[d] = tmp * o[d]; - } -} - -#ifdef __AVX__ -namespace detail { -namespace forward { -namespace avx { -__m256 Sigmoid(const __m256 a); -__m256 Tanh(const __m256 a); - -} // namespace avx -} // namespace forward -} // namespace detail - -template <> -void lstm_compute_ctht(float* gates, const float* ct_1, float* ct, - float* ht); - -#endif - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 6a059968b7..0aed253c80 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -125,10 +125,8 @@ inline void vec_scal(const int n, const float a, } template <> -inline void vec_scal(const int n, - const float a, - const float* x, - float* y) { +inline void vec_scal(const int n, const float a, + const float* x, float* y) { // TODO(TJ): enable me vec_scal(n, a, x, y); } @@ -181,10 +179,10 @@ inline void vec_bias_sub(const int n, const float a, } template <> -inline void vec_bias_sub(const int n, - const float a, - 
const float* x, - float* y) { +inline void vec_bias_sub(const int n, + const float a, + const float* x, + float* y) { // TODO(TJ): enable me vec_bias_sub(n, a, x, y); } @@ -242,7 +240,7 @@ inline void vec_cross(const int n, const float* x, } template <> -inline void vec_cross( +inline void vec_cross( const int n, const float* x, const float* y, const float* z, float* out) { // TODO(TJ): enable me vec_cross(n, x, y, z, out); @@ -296,10 +294,10 @@ inline void vec_add_bias(const int n, const float a, } template <> -inline void vec_add_bias(const int n, - const float a, - const float* x, - float* y) { +inline void vec_add_bias(const int n, + const float a, + const float* x, + float* y) { // TODO(TJ): enable me vec_add_bias(n, a, x, y); } @@ -390,9 +388,9 @@ inline void vec_sigmoid(const int n, const float* x, } template <> -inline void vec_sigmoid(const int n, - const float* x, - float* y) { +inline void vec_sigmoid(const int n, + const float* x, + float* y) { // TODO(TJ): enable me vec_sigmoid(n, x, y); } @@ -454,9 +452,8 @@ inline void vec_relu(const int n, const float* x, } template <> -inline void vec_relu(const int n, - const float* x, - float* y) { +inline void vec_relu(const int n, const float* x, + float* y) { // TODO(TJ): enable me vec_relu(n, x, y); } diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc index 3ce66f49ed..cd40f1b2f9 100644 --- a/paddle/fluid/operators/math/cpu_vec_test.cc +++ b/paddle/fluid/operators/math/cpu_vec_test.cc @@ -110,7 +110,7 @@ TEST(CpuVecTest, sigmoid) { TestAndBench(sz, vec_sigmoid, ref_sigmoid); TestAndBench(sz, vec_sigmoid, ref_sigmoid); TestAndBench(sz, vec_sigmoid, ref_sigmoid); - TestAndBench(sz, vec_sigmoid, + TestAndBench(sz, vec_sigmoid, ref_sigmoid); } TestAndBench(30, vec_sigmoid, ref_sigmoid); @@ -123,8 +123,7 @@ TEST(CpuVecTest, tanh) { TestAndBench(sz, vec_tanh, ref_tanh); TestAndBench(sz, vec_tanh, ref_tanh); TestAndBench(sz, vec_tanh, ref_tanh); - TestAndBench(sz, 
vec_tanh, - ref_tanh); + TestAndBench(sz, vec_tanh, ref_tanh); } TestAndBench(30, vec_tanh, ref_tanh); } @@ -136,8 +135,7 @@ TEST(CpuVecTest, relu) { TestAndBench(sz, vec_relu, ref_relu); TestAndBench(sz, vec_relu, ref_relu); TestAndBench(sz, vec_relu, ref_relu); - TestAndBench(sz, vec_relu, - ref_relu); + TestAndBench(sz, vec_relu, ref_relu); } TestAndBench(30, vec_relu, ref_relu); } @@ -170,7 +168,7 @@ TEST(CpuVecTest, inplace_sigmoid) { TestInplace(sz, vec_sigmoid, ref_sigmoid); TestInplace(sz, vec_sigmoid, ref_sigmoid); TestInplace(sz, vec_sigmoid, ref_sigmoid); - TestInplace(sz, vec_sigmoid, + TestInplace(sz, vec_sigmoid, ref_sigmoid); } TestInplace(30, vec_sigmoid, ref_sigmoid); @@ -183,8 +181,7 @@ TEST(CpuVecTest, inplace_tanh) { TestInplace(sz, vec_tanh, ref_tanh); TestInplace(sz, vec_tanh, ref_tanh); TestInplace(sz, vec_tanh, ref_tanh); - TestInplace(sz, vec_tanh, - ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); } TestInplace(30, vec_tanh, ref_tanh); } @@ -196,8 +193,7 @@ TEST(CpuVecTest, inplace_relu) { TestInplace(sz, vec_relu, ref_relu); TestInplace(sz, vec_relu, ref_relu); TestInplace(sz, vec_relu, ref_relu); - TestInplace(sz, vec_relu, - ref_relu); + TestInplace(sz, vec_relu, ref_relu); } TestInplace(30, vec_relu, ref_relu); } diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index 3be3899123..66d37c3bf3 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -46,17 +46,20 @@ __forceinline__ __device__ unsigned warp_id() { return ret; } +#define ARG_DEFINE_KernelDepthwiseConv \ + const T *const input_data, const T *const filter_data, const int batch_size, \ + const int output_channels, const int output_height, \ + const int output_width, const int input_channels, \ + const int input_height, const int input_width, \ + const int filter_multiplier, const int filter_height, \ + const int filter_width, const int stride_height, const int 
stride_width, \ + const int padding_height, const int padding_width, \ + const int dilate_height, const int dilate_width, T *const output_data + // A Cuda kernel to compute the depthwise convolution forward pass // in NCHW format. template -__device__ __inline__ void KernelDepthwiseConv( - const T* const input_data, const T* const filter_data, const int batch_size, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const output_data) { +__device__ __inline__ void KernelDepthwiseConv(ARG_DEFINE_KernelDepthwiseConv) { for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) { for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) { const int batch = blockIdx.y; @@ -97,42 +100,105 @@ __device__ __inline__ void KernelDepthwiseConv( } } -template -__global__ void KernelDepthwiseConvSp( - const T* const input_data, const T* const filter_data, const int batch_size, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const output_data) { - if (c_filter_multiplier == 0) - KernelDepthwiseConv(input_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, - input_height, input_width, filter_multiplier, - filter_height, filter_width, stride_height, - stride_width, padding_height, padding_width, - dilate_height, dilate_width, output_data); 
+template +__device__ __inline__ void KernelDepthwiseConvCFilter( + ARG_DEFINE_KernelDepthwiseConv) { + const int kWeghtSize = c_filter * c_filter; + T r_weight[kWeghtSize]; + const int batch = blockIdx.y; + const int c_out = blockIdx.x; + const T* weight = filter_data + c_out * c_filter * c_filter; + for (int i = 0; i < c_filter * c_filter; i++) r_weight[i] = weight[i]; - else - KernelDepthwiseConv(input_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, - input_height, input_width, c_filter_multiplier, - filter_height, filter_height, c_stride, c_stride, - padding_height, padding_width, dilate_height, - dilate_width, output_data); + for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) { + for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) { + const int batch = blockIdx.y; + const int c_out = blockIdx.x; + + const int c_in = c_out / filter_multiplier; + T value = 0; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width + w_out * stride_width; + const int h_in_end = h_in_start + c_filter * dilate_height; + const int w_in_end = w_in_start + c_filter * dilate_width; + + const int in_offset = + ((batch * input_channels + c_in) * input_height) * input_width; + + const int h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ? 
w_in_start : 0; + + for (int h_in = h_in_start, h_f = 0; h_f < c_filter; + h_in += dilate_height, h_f++) { + for (int w_in = w_in_start, w_f = 0; w_f < c_filter; + w_in += dilate_width, w_f++) { + if (h_in >= 0 && h_in < input_height && w_in >= 0 && + w_in < input_width) { + const int offset = in_offset + h_in * input_width + w_in; + value += r_weight[h_f * c_filter + w_f] * input_data[offset]; + } + } + } + int index = + ((batch * gridDim.x + c_out) * output_height + h_out) * output_width + + w_out; + output_data[index] = value; + } + } +} + +template +__global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { + if (c_filter_multiplier == 0) { + if (c_filter == -1) + KernelDepthwiseConv( + input_data, filter_data, batch_size, output_channels, output_height, + output_width, input_channels, input_height, input_width, + filter_multiplier, filter_height, filter_width, stride_height, + stride_width, padding_height, padding_width, dilate_height, + dilate_width, output_data); + else + KernelDepthwiseConvCFilter( + input_data, filter_data, batch_size, output_channels, output_height, + output_width, input_channels, input_height, input_width, + filter_multiplier, filter_height, filter_width, stride_height, + stride_width, padding_height, padding_width, dilate_height, + dilate_width, output_data); + } else { + if (c_filter == -1) + KernelDepthwiseConv(input_data, filter_data, batch_size, + output_channels, output_height, output_width, + input_channels, input_height, input_width, + c_filter_multiplier, filter_height, filter_height, + c_stride, c_stride, padding_height, padding_width, + dilate_height, dilate_width, output_data); + else + KernelDepthwiseConvCFilter( + input_data, filter_data, batch_size, output_channels, output_height, + output_width, input_channels, input_height, input_width, + c_filter_multiplier, filter_height, filter_height, c_stride, c_stride, + padding_height, padding_width, dilate_height, dilate_width, + output_data); + } } // CUDA kernel to 
compute the depthwise convolution backprop w.r.t input. +#define ARG_DEFINE_KernelDepthwiseConvInputGrad \ + const T *const output_grad_data, const T *const filter_data, \ + const int batch_size, const int output_channels, \ + const int output_height, const int output_width, \ + const int input_channels, const int input_height, const int input_width, \ + const int filter_multiplier, const int filter_height, \ + const int filter_width, const int stride_height, const int stride_width, \ + const int padding_height, const int padding_width, \ + const int dilate_height, const int dilate_width, \ + T *const input_grad_data + template __device__ __inline__ void KernelDepthwiseConvInputGrad( - const T* const output_grad_data, const T* const filter_data, - const int batch_size, const int output_channels, const int output_height, - const int output_width, const int input_channels, const int input_height, - const int input_width, const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const input_grad_data) { + ARG_DEFINE_KernelDepthwiseConvInputGrad) { for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { const int batch = blockIdx.y; @@ -184,15 +250,67 @@ __device__ __inline__ void KernelDepthwiseConvInputGrad( } } -template +template +__device__ __inline__ void KernelDepthwiseConvInputGradCFilter( + ARG_DEFINE_KernelDepthwiseConvInputGrad) { + const int kWeghtSize = c_filter * c_filter * c_filter_multiplier + 1; + T r_weight[kWeghtSize]; + const int batch = blockIdx.y; + const int c_in = blockIdx.x; + + for (int c_i = 0; c_i < filter_multiplier; c_i++) { + int c_out = c_in * filter_multiplier + c_i; + const T* weight = filter_data + c_out * c_filter * c_filter; + for (int i = 0; i < c_filter * c_filter; i++) 
+ r_weight[i + c_i * c_filter * c_filter] = + weight[c_filter * c_filter - i - 1]; + } + + for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { + for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { + const int batch = blockIdx.y; + const int c_in = blockIdx.x; + + int h_out_start = h_in - (c_filter - 1) * dilate_height + padding_height; + + int w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width; + + T value = 0; + + for (int c_i = 0; c_i < filter_multiplier; c_i++) { + int c_out = c_in * filter_multiplier + c_i; + for (int h_out = h_out_start, h_f = 0; h_f < c_filter; + h_out += dilate_height, h_f++) { + for (int w_out = w_out_start, w_f = 0; w_f < c_filter; + w_out += dilate_width, w_f++) { + int s_h_out = h_out / stride_height; + int s_w_out = w_out / stride_width; + if (h_out % stride_height == 0 && w_out % stride_width == 0 && + s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && + s_w_out < output_width) { + const int output_grad_offset = + ((batch * output_channels + c_out) * output_height + + s_h_out) * + output_width + + s_w_out; + value += + output_grad_data[output_grad_offset] * + r_weight[h_f * c_filter + w_f + c_i * c_filter * c_filter]; + } + } + } + } + int index = + ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + + w_in; + input_grad_data[index] = value; + } + } +} + +template __global__ void KernelDepthwiseConvInputGradSp( - const T* const output_grad_data, const T* const filter_data, - const int batch_size, const int output_channels, const int output_height, - const int output_width, const int input_channels, const int input_height, - const int input_width, const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const input_grad_data) { + ARG_DEFINE_KernelDepthwiseConvInputGrad) { if 
(c_filter_multiplier == 0) KernelDepthwiseConvInputGrad( output_grad_data, filter_data, batch_size, output_channels, @@ -200,13 +318,20 @@ __global__ void KernelDepthwiseConvInputGradSp( filter_multiplier, filter_height, filter_width, stride_height, stride_width, padding_height, padding_width, dilate_height, dilate_width, input_grad_data); - else + else if (c_filter == -1) KernelDepthwiseConvInputGrad( output_grad_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, padding_height, padding_width, dilate_height, dilate_width, input_grad_data); + else + KernelDepthwiseConvInputGradCFilter( + output_grad_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, + padding_height, padding_width, dilate_height, dilate_width, + input_grad_data); } // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. 
@@ -325,12 +450,14 @@ class DepthwiseConvFunctor { dim3 threads(std::min(output_width, thread), blocks, 1); dim3 grid(output_channels, batch_size, 1); int filter_multiplier = output_channels / input_channels; -#define check_case(c_filter_multiplier, c_stride) \ +#define check_case(c_filter_multiplier, c_stride, c_filter) \ if (c_filter_multiplier == 0 || \ filter_multiplier == c_filter_multiplier && \ - stride_height == stride_width && stride_height == c_stride) { \ - KernelDepthwiseConvSp<<>>( \ + stride_height == stride_width && stride_height == c_stride && \ + (ksize_height == ksize_width && ksize_height == c_filter || \ + c_filter == -1)) { \ + KernelDepthwiseConvSp<<>>( \ input_data, filter_data, batch_size, output_channels, output_height, \ output_width, input_channels, input_height, input_width, \ filter_multiplier, ksize_height, ksize_width, stride_height, \ @@ -338,11 +465,17 @@ class DepthwiseConvFunctor { dilate_width, output_data); \ return; \ } - check_case(1, 1); - check_case(1, 2); - // NOTE(liangdun): 0,0 for other case - // add other case if needed, e.g. check_case(2^n,1) - check_case(0, 0); + check_case(1, 1, 3); + check_case(1, 1, 5); + check_case(1, 1, -1); + check_case(1, 2, 3); + check_case(1, 2, 5); + check_case(1, 2, -1); + check_case(0, 0, 3); + check_case(0, 0, 5); + check_case(0, 0, -1); +// NOTE(liangdun): 0,0 for other case +// add other case if needed, e.g. 
check_case(2^n,1) #undef check_case } }; @@ -384,13 +517,15 @@ class DepthwiseConvInputGradFunctor { dim3 grid(input_channels, batch_size, 1); int filter_multiplier = output_channels / input_channels; -#define check_case(c_filter_multiplier, c_stride) \ +#define check_case(c_filter_multiplier, c_stride, c_filter) \ if (c_filter_multiplier == 0 || \ filter_multiplier == c_filter_multiplier && \ - stride_height == stride_width && stride_height == c_stride) { \ + stride_height == stride_width && stride_height == c_stride && \ + (ksize_height == ksize_width && ksize_height == c_filter || \ + c_filter == -1)) { \ KernelDepthwiseConvInputGradSp< \ - T, c_filter_multiplier, \ - c_stride><<>>( \ + T, c_filter_multiplier, c_stride, \ + c_filter><<>>( \ output_grad_data, filter_data, batch_size, output_channels, \ output_height, output_width, input_channels, input_height, \ input_width, filter_multiplier, ksize_height, ksize_width, \ @@ -398,11 +533,21 @@ class DepthwiseConvInputGradFunctor { dilate_height, dilate_width, input_grad_data); \ return; \ } - check_case(1, 1); - check_case(1, 2); - // NOTE(liangdun): 0,0 for other case - // add other case if needed, e.g. check_case(2^n,1) - check_case(0, 0); + check_case(1, 1, 3); + check_case(1, 1, 5); + check_case(1, 1, -1); + check_case(1, 2, 3); + check_case(1, 2, 5); + check_case(1, 2, -1); + check_case(2, 1, 3); + check_case(2, 1, 5); + check_case(2, 1, -1); + check_case(2, 2, 3); + check_case(2, 2, 5); + check_case(2, 2, -1); + check_case(0, 0, -1); +// NOTE(liangdun): 0,0 for other case +// add other case if needed, e.g. check_case(2^n,1) #undef check_case } }; diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h index 1f5a49c0ab..87220d4019 100644 --- a/paddle/fluid/operators/math/fc_compute.h +++ b/paddle/fluid/operators/math/fc_compute.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/jit_kernel.h" DECLARE_int32(paddle_num_threads); @@ -30,20 +31,25 @@ inline void FCCompute(const BlasT& blas, const int M, if (B == NULL) { return; } + if (relu) { + const auto& vaddrelu = jitkernel::KernelPool::Instance() + .template Get>(N); + for (int i = 0; i < M; i++) { + T* dst = Y + i * N; + vaddrelu->Compute(B, dst, dst); + } + } else { + const auto& vadd = jitkernel::KernelPool::Instance() + .template Get>(N); #ifdef PADDLE_WITH_MKLML #pragma omp parallel for if (FLAGS_paddle_num_threads > 1) #endif - for (int i = 0; i < M; i++) { - blas.AXPY(N, static_cast(1), B, Y + i * N); + for (int i = 0; i < M; i++) { + T* dst = Y + i * N; + vadd->Compute(B, dst, dst); + } } - - if (!relu) { - return; - } - - // TODO(TJ): fuse relu - LOG(FATAL) << "Not implemented!"; } } // namespace math diff --git a/paddle/fluid/operators/math/jit_kernel.cc b/paddle/fluid/operators/math/jit_kernel.cc new file mode 100644 index 0000000000..68b708b345 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel.cc @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include +#include + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { + +namespace jit = platform::jit; + +KernelPool& KernelPool::Instance() { + static thread_local KernelPool g_jit_kernels; + return g_jit_kernels; +} + +std::shared_ptr KernelPool::Get(const std::string& key) const { + if (kers_.find(key) == kers_.end()) { + return nullptr; + } + return kers_.at(key); +} + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h new file mode 100644 index 0000000000..e91e4e8e5a --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -0,0 +1,148 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include // for shared_ptr +#include +#include +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/macros.h" + +// Note: Only support on CPU yet. 
+namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 +#define AVX_FLOAT_BLOCK 8 +#define AVX2_FLOAT_BLOCK 8 +#define AVX512_FLOAT_BLOCK 16 + +typedef enum { kLT8, kEQ8, kGT8LT16, kEQ16, kGT16 } jit_block; + +class Kernel { + public: + Kernel() = default; + virtual ~Kernel() = default; + int num_{0}; + int end_{0}; + int rest_{0}; + DISABLE_COPY_AND_ASSIGN(Kernel); +}; + +class KernelPool { + public: + static KernelPool &Instance(); + + template + std::shared_ptr Get(ARGS... args); + + std::shared_ptr Get(const std::string &key) const; + + private: + KernelPool() = default; + std::unordered_map> kers_; + + DISABLE_COPY_AND_ASSIGN(KernelPool); +}; + +template +class VMulKernel : public Kernel { + public: + virtual void Compute(const T *x, const T *y, T *z) const = 0; +}; + +template +class VAddKernel : public Kernel { + public: + virtual void Compute(const T *x, const T *y, T *z) const = 0; +}; + +template +class VScalKernel : public Kernel { + public: + virtual void Compute(const T a, const T *x, T *y) const = 0; + virtual void Compute(const T a, T *x) const = 0; +}; + +template +class VAddBiasKernel : public Kernel { + public: + virtual void Compute(const T a, const T *x, T *y) const = 0; +}; + +template +class VAddReluKernel : public Kernel { + public: + virtual void Compute(const T *x, const T *y, T *z) const = 0; +}; + +template +class VActKernel : public Kernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class VReluKernel : public VActKernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class VIdentityKernel : public VActKernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class VExpKernel : public VActKernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class VSigmoidKernel : public 
VActKernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class VTanhKernel : public VActKernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class LSTMKernel : public Kernel { + public: + virtual void ComputeCtHt(T *gates, const T *ct_1, T *ct, T *ht, + /* below only used in peephole*/ + const T *wp_data = nullptr, + T *checked = nullptr) const = 0; + + // compute c1 and h1 without c0 or h0 + virtual void ComputeC1H1(T *gates, T *ct, T *ht, + /* below only used in peephole*/ + const T *wp_data = nullptr) const = 0; +}; + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc new file mode 100644 index 0000000000..c88b17b012 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -0,0 +1,479 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include +#include "paddle/fluid/operators/math/jit_kernel_macro.h" +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef __AVX__ +#include +#endif + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { + +namespace jit = platform::jit; + +/* VMUL JitKernel */ +template +class VMulKernelImpl : public VMulKernel { + public: + explicit VMulKernelImpl(int d) : VMulKernel() { this->num_ = d; } + void Compute(const T* x, const T* y, T* z) const override { + for (int i = 0; i < this->num_; ++i) { + z[i] = x[i] * y[i]; + } + } +}; + +#ifdef PADDLE_WITH_MKLML +#define MKL_FLOAT(isa, block) \ + template <> \ + void VMulKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + platform::dynload::vsMul(this->num_, x, y, z); \ + } + +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VMulKernelImpl::Compute( \ + const double* x, const double* y, double* z) const { \ + platform::dynload::vdMul(this->num_, x, y, z); \ + } + +FOR_EACH_ISA(MKL_FLOAT, kGT16); +FOR_EACH_ISA_BLOCK(MKL_DOUBLE); +#endif + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VMulKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + __m256 tmpx, tmpy; \ + tmpx = _mm256_loadu_ps(x); \ + tmpy = _mm256_loadu_ps(y); \ + tmpx = _mm256_mul_ps(tmpx, tmpy); \ + _mm256_storeu_ps(z, tmpx); \ + } + +// avx > for > mkl +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +#endif +// TODO(TJ): eq16 test and complete avx512 +#undef INTRI8_FLOAT +#undef MKL_FLOAT +#undef MKL_DOUBLE + +/* VADD JitKernel */ +template +class VAddKernelImpl : public VAddKernel { + public: + explicit VAddKernelImpl(int d) : VAddKernel() { this->num_ = d; } + void Compute(const T* x, const T* y, T* z) const override { + for (int i = 0; i < this->num_; ++i) { + z[i] = 
x[i] + y[i]; + } + } +}; + +#ifdef PADDLE_WITH_MKLML +#define MKL_FLOAT(isa, block) \ + template <> \ + void VAddKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + platform::dynload::vsAdd(this->num_, x, y, z); \ + } + +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VAddKernelImpl::Compute( \ + const double* x, const double* y, double* z) const { \ + platform::dynload::vdAdd(this->num_, x, y, z); \ + } + +FOR_EACH_ISA(MKL_FLOAT, kGT16); +FOR_EACH_ISA_BLOCK(MKL_DOUBLE); +#endif + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VAddKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + __m256 tmpx, tmpy; \ + tmpx = _mm256_loadu_ps(x); \ + tmpy = _mm256_loadu_ps(y); \ + tmpx = _mm256_add_ps(tmpx, tmpy); \ + _mm256_storeu_ps(z, tmpx); \ + } +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +#endif +// TODO(TJ): eq16 test and complete avx512 + +#undef INTRI8_FLOAT +#undef MKL_FLOAT +#undef MKL_DOUBLE + +/* VSCAL JitKernel */ +template +class VScalKernelImpl : public VScalKernel { + public: + explicit VScalKernelImpl(int d) : VScalKernel() { this->num_ = d; } + void Compute(const T a, const T* x, T* y) const override { + for (int i = 0; i < this->num_; ++i) { + y[i] = a * x[i]; + } + } + void Compute(const T a, T* x) const override { + for (int i = 0; i < this->num_; ++i) { + x[i] = a * x[i]; + } + } +}; + +#ifdef PADDLE_WITH_MKLML +#define MKL_FLOAT(isa, block) \ + template <> \ + void VScalKernelImpl::Compute(const float a, float* x) \ + const { \ + platform::dynload::cblas_sscal(this->num_, a, x, 1); \ + } + +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VScalKernelImpl::Compute(const double a, double* x) \ + const { \ + platform::dynload::cblas_dscal(this->num_, a, x, 1); \ + } + +FOR_EACH_ISA(MKL_FLOAT, kGT16); +FOR_EACH_ISA_BLOCK(MKL_DOUBLE); +#endif + +#define INTRI8_FLOAT(isa) \ + template <> 
\ + void VScalKernelImpl::Compute( \ + const float a, const float* x, float* y) const { \ + __m256 tmp; \ + __m256 scalar = _mm256_set1_ps(a); \ + tmp = _mm256_loadu_ps(x); \ + tmp = _mm256_mul_ps(tmp, scalar); \ + _mm256_storeu_ps(y, tmp); \ + } +#define INTRI8_INPLACE_FLOAT(isa) \ + template <> \ + void VScalKernelImpl::Compute(const float a, float* x) \ + const { \ + __m256 tmp; \ + __m256 scalar = _mm256_set1_ps(a); \ + tmp = _mm256_loadu_ps(x); \ + tmp = _mm256_mul_ps(tmp, scalar); \ + _mm256_storeu_ps(x, tmp); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI8_INPLACE_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI8_INPLACE_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +INTRI8_INPLACE_FLOAT(jit::avx512f); +#endif +// TODO(TJ): eq16 test and complete avx512 + +#undef INTRI8_FLOAT +#undef INTRI8_INPLACE_FLOAT +#undef MKL_FLOAT +#undef MKL_DOUBLE + +/* VAddBias JitKernel */ +template +class VAddBiasKernelImpl : public VAddBiasKernel { + public: + explicit VAddBiasKernelImpl(int d) : VAddBiasKernel() { this->num_ = d; } + void Compute(const T a, const T* x, T* y) const override { + for (int i = 0; i < this->num_; ++i) { + y[i] = x[i] + a; + } + } +}; + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VAddBiasKernelImpl::Compute( \ + const float a, const float* x, float* y) const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + tmp = _mm256_add_ps(tmp, _mm256_set1_ps(a)); \ + _mm256_storeu_ps(y, tmp); \ + } + +#define INTRI16_FLOAT(isa) \ + template <> \ + void VAddBiasKernelImpl::Compute( \ + const float a, const float* x, float* y) const { \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + tmp0 = _mm256_add_ps(tmp0, _mm256_set1_ps(a)); \ + tmp1 = _mm256_add_ps(tmp1, _mm256_set1_ps(a)); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI16_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ 
+INTRI8_FLOAT(jit::avx2); +INTRI16_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +INTRI16_FLOAT(jit::avx512f); +#endif +// TODO(TJ): eq16 test and complete avx512 + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT + +/* VRelu JitKernel */ +template +class VReluKernelImpl : public VReluKernel { + public: + explicit VReluKernelImpl(int d) : VReluKernel() { this->num_ = d; } + void Compute(const T* x, T* y) const override { + for (int i = 0; i < this->num_; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } + } +}; + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VReluKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + tmp = _mm256_max_ps(tmp, _mm256_setzero_ps()); \ + _mm256_storeu_ps(y, tmp); \ + } + +#define INTRI16_FLOAT(isa) \ + template <> \ + void VReluKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 zeros = _mm256_setzero_ps(); \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + tmp0 = _mm256_max_ps(tmp0, zeros); \ + tmp1 = _mm256_max_ps(tmp1, zeros); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ + } + +#define INTRI_GT8LT16_FLOAT(isa) \ + template <> \ + VReluKernelImpl::VReluKernelImpl(int d) \ + : VReluKernel() { \ + this->num_ = d; \ + this->end_ = AVX_FLOAT_BLOCK; \ + this->rest_ = d - AVX_FLOAT_BLOCK; \ + } \ + template <> \ + void VReluKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 zeros = _mm256_setzero_ps(); \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + this->rest_); \ + tmp0 = _mm256_max_ps(tmp0, zeros); \ + tmp1 = _mm256_max_ps(tmp1, zeros); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + this->rest_, tmp1); \ + } + +#define INTRI_GT16_FLOAT(isa) \ + template <> \ + VReluKernelImpl::VReluKernelImpl(int d) \ + : VReluKernel() { \ + this->num_ = d; \ + this->end_ = d - d % AVX_FLOAT_BLOCK; \ + this->rest_ = d - AVX_FLOAT_BLOCK; \ + } \ + template <> \ + 
void VReluKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 zeros = _mm256_setzero_ps(); \ + for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ + __m256 tmp = _mm256_loadu_ps(x + i); \ + tmp = _mm256_max_ps(tmp, zeros); \ + _mm256_storeu_ps(y + i, tmp); \ + } \ + __m256 tmp = _mm256_loadu_ps(x + this->rest_); \ + tmp = _mm256_max_ps(tmp, zeros); \ + _mm256_storeu_ps(y + this->rest_, tmp); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI16_FLOAT(jit::avx); +INTRI_GT8LT16_FLOAT(jit::avx); +INTRI_GT16_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI16_FLOAT(jit::avx2); +INTRI_GT8LT16_FLOAT(jit::avx2); +INTRI_GT16_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +// TODO(TJ): refine avx512 +INTRI8_FLOAT(jit::avx512f); +INTRI16_FLOAT(jit::avx512f); +INTRI_GT8LT16_FLOAT(jit::avx512f); +INTRI_GT16_FLOAT(jit::avx512f); +#endif + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT +#undef INTRI_GT8LT16_FLOAT +#undef INTRI_GT16_FLOAT + +/* An empty JitKernel */ +template +class VIdentityKernelImpl : public VIdentityKernel { + public: + explicit VIdentityKernelImpl(int d) : VIdentityKernel() { this->num_ = d; } + void Compute(const T* x, T* y) const override {} +}; + +/* VAddRelu JitKernel */ +template +class VAddReluKernelImpl : public VAddReluKernel { + public: + explicit VAddReluKernelImpl(int d) : VAddReluKernel() { this->num_ = d; } + void Compute(const T* x, const T* y, T* z) const override { + for (int i = 0; i < this->num_; ++i) { + z[i] = x[i] + y[i]; + z[i] = z[i] > 0 ? 
z[i] : 0; + } + } +}; + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VAddReluKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + __m256 tmpx = _mm256_loadu_ps(x); \ + __m256 tmpy = _mm256_loadu_ps(y); \ + tmpy = _mm256_add_ps(tmpx, tmpy); \ + tmpy = _mm256_max_ps(tmpy, _mm256_setzero_ps()); \ + _mm256_storeu_ps(z, tmpy); \ + } + +#define INTRI16_FLOAT(isa) \ + template <> \ + void VAddReluKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + __m256 zeros = _mm256_setzero_ps(); \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(y); \ + tmp0 = _mm256_add_ps(tmp0, tmp1); \ + tmp0 = _mm256_max_ps(tmp0, zeros); \ + tmp1 = _mm256_loadu_ps(x + 8); \ + __m256 tmp2 = _mm256_loadu_ps(y + 8); \ + tmp1 = _mm256_add_ps(tmp1, tmp2); \ + tmp1 = _mm256_max_ps(tmp1, zeros); \ + _mm256_storeu_ps(z, tmp0); \ + _mm256_storeu_ps(z + 8, tmp1); \ + } + +#define INTRI_COMMON_FLOAT(isa, block) \ + template <> \ + VAddReluKernelImpl::VAddReluKernelImpl(int d) \ + : VAddReluKernel() { \ + this->num_ = d; \ + this->end_ = d - d % AVX_FLOAT_BLOCK; \ + this->rest_ = d - this->end_; \ + } \ + template <> \ + void VAddReluKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + __m256 zeros = _mm256_setzero_ps(); \ + for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ + __m256 tmpx = _mm256_loadu_ps(x + i); \ + __m256 tmpy = _mm256_loadu_ps(y + i); \ + tmpy = _mm256_add_ps(tmpx, tmpy); \ + tmpy = _mm256_max_ps(tmpy, zeros); \ + _mm256_storeu_ps(z + i, tmpy); \ + } \ + for (int i = this->end_; i < this->num_; ++i) { \ + z[i] = x[i] + y[i]; \ + z[i] = z[i] > 0 ? 
z[i] : 0; \ + } \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI16_FLOAT(jit::avx); +INTRI_COMMON_FLOAT(jit::avx, kGT16); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI16_FLOAT(jit::avx2); +INTRI_COMMON_FLOAT(jit::avx2, kGT16); +#endif +#ifdef __AVX512F__ +// TODO(TJ): refine avx512 +INTRI8_FLOAT(jit::avx512f); +INTRI16_FLOAT(jit::avx512f); +INTRI_COMMON_FLOAT(jit::avx512f, kGT16); +#endif + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT +#undef INTRI_COMMON_FLOAT + +REGISTER_JITKERNEL(vmul, VMulKernel); +REGISTER_JITKERNEL(vadd, VAddKernel); +REGISTER_JITKERNEL(vscal, VScalKernel); +REGISTER_JITKERNEL(vaddb, VAddBiasKernel); +REGISTER_JITKERNEL(vrelu, VReluKernel); +REGISTER_JITKERNEL(vaddrelu, VAddReluKernel); +REGISTER_JITKERNEL(videntity, VIdentityKernel); + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc new file mode 100644 index 0000000000..c4247580f4 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -0,0 +1,541 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include // for exp +#include +#include "paddle/fluid/operators/math/jit_kernel_macro.h" +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef __AVX__ +#include +#endif + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { +namespace jit = platform::jit; + +/* VExp JitKernel */ +template +class VExpKernelImpl : public VExpKernel { + public: + explicit VExpKernelImpl(int d) : VExpKernel() { this->num_ = d; } + void Compute(const T* x, T* y) const override { + for (int i = 0; i < this->num_; ++i) { + y[i] = std::exp(x[i]); + } + } +}; + +#ifdef PADDLE_WITH_MKLML +#define MKL_FLOAT(isa, block) \ + template <> \ + void VExpKernelImpl::Compute(const float* x, float* y) \ + const { \ + platform::dynload::vsExp(this->num_, x, y); \ + } + +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VExpKernelImpl::Compute(const double* x, double* y) \ + const { \ + platform::dynload::vdExp(this->num_, x, y); \ + } +FOR_EACH_ISA(MKL_FLOAT, kLT8); +FOR_EACH_ISA(MKL_FLOAT, kGT8LT16); +FOR_EACH_ISA(MKL_FLOAT, kGT16); +FOR_EACH_ISA_BLOCK(MKL_DOUBLE); +#endif + +namespace detail { + +#ifdef __AVX__ + +#define ALIGN32 __attribute__((aligned(32))) + +#define _PS256_CONST(Name, Val) \ + static const float _ps256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \ + Val, Val, Val, Val} + +#define _PI256_CONST(Name, Val) \ + static const int _pi256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \ + Val, Val, Val, Val} + +_PI256_CONST(0x7f, 0x7f); +_PS256_CONST(one, 1.f); +_PS256_CONST(0p5, 0.5f); +_PS256_CONST(exp_hi, 88.3762626647949f); +_PS256_CONST(exp_lo, -88.3762626647949f); +_PS256_CONST(cephes_LOG2EF, 1.44269504088896341); +_PS256_CONST(cephes_exp_C1, 0.693359375); +_PS256_CONST(cephes_exp_C2, -2.12194440e-4); +_PS256_CONST(cephes_exp_p0, 1.9875691500E-4); +_PS256_CONST(cephes_exp_p1, 1.3981999507E-3); +_PS256_CONST(cephes_exp_p2, 8.3334519073E-3); 
+_PS256_CONST(cephes_exp_p3, 4.1665795894E-2); +_PS256_CONST(cephes_exp_p4, 1.6666665459E-1); +_PS256_CONST(cephes_exp_p5, 5.0000001201E-1); + +typedef union imm_xmm_union { + __m256i imm; + __m128i xmm[2]; +} imm_xmm_union; + +#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \ + { \ + imm_xmm_union u ALIGN32; \ + u.imm = imm_; \ + xmm0_ = u.xmm[0]; \ + xmm1_ = u.xmm[1]; \ + } + +#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \ + { \ + imm_xmm_union u ALIGN32; \ + u.xmm[0] = xmm0_; \ + u.xmm[1] = xmm1_; \ + imm_ = u.imm; \ + } + +#define AVX2_BITOP_USING_SSE2(fn) \ + static inline __m256i avx2_mm256_##fn(__m256i x, int y) { \ + /* use SSE2 to perform the bitop AVX2 */ \ + __m128i x1, x2; \ + __m256i ret; \ + COPY_IMM_TO_XMM(x, x1, x2); \ + x1 = _mm_##fn(x1, y); \ + x2 = _mm_##fn(x2, y); \ + COPY_XMM_TO_IMM(x1, x2, ret); \ + return ret; \ + } + +#define AVX2_INTOP_USING_SSE2(fn) \ + static inline __m256i avx2_mm256_add_epi32(__m256i x, __m256i y) { \ + /* use SSE2 to perform the AVX2 integer operation */ \ + __m128i x1, x2; \ + __m128i y1, y2; \ + __m256i ret; \ + COPY_IMM_TO_XMM(x, x1, x2); \ + COPY_IMM_TO_XMM(y, y1, y2); \ + x1 = _mm_##fn(x1, y1); \ + x2 = _mm_##fn(x2, y2); \ + COPY_XMM_TO_IMM(x1, x2, ret); \ + return ret; \ + } + +AVX2_BITOP_USING_SSE2(slli_epi32); +AVX2_INTOP_USING_SSE2(add_epi32); + +#define AVXEXP_BASE \ + __m256 tmp = _mm256_setzero_ps(), fx; \ + __m256 one = *reinterpret_cast(_ps256_one); \ + __m256i imm0; \ + x = _mm256_min_ps(x, *reinterpret_cast(_ps256_exp_hi)); \ + x = _mm256_max_ps(x, *reinterpret_cast(_ps256_exp_lo)); \ + /* express exp(x) as exp(g + n*log(2)) */ \ + fx = _mm256_mul_ps(x, \ + *reinterpret_cast(_ps256_cephes_LOG2EF)); \ + fx = _mm256_add_ps(fx, *reinterpret_cast(_ps256_0p5)); \ + tmp = _mm256_floor_ps(fx); \ + /* if greater, substract 1 */ \ + __m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); \ + mask = _mm256_and_ps(mask, one); \ + fx = _mm256_sub_ps(tmp, mask); \ + tmp = _mm256_mul_ps(fx, \ + 
*reinterpret_cast(_ps256_cephes_exp_C1)); \ + __m256 z = _mm256_mul_ps( \ + fx, *reinterpret_cast(_ps256_cephes_exp_C2)); \ + x = _mm256_sub_ps(x, tmp); \ + x = _mm256_sub_ps(x, z); \ + z = _mm256_mul_ps(x, x); \ + __m256 y = *reinterpret_cast(_ps256_cephes_exp_p0); \ + y = _mm256_mul_ps(y, x); \ + y = _mm256_add_ps(y, \ + *reinterpret_cast(_ps256_cephes_exp_p1)); \ + y = _mm256_mul_ps(y, x); \ + y = _mm256_add_ps(y, \ + *reinterpret_cast(_ps256_cephes_exp_p2)); \ + y = _mm256_mul_ps(y, x); \ + y = _mm256_add_ps(y, \ + *reinterpret_cast(_ps256_cephes_exp_p3)); \ + y = _mm256_mul_ps(y, x); \ + y = _mm256_add_ps(y, \ + *reinterpret_cast(_ps256_cephes_exp_p4)); \ + y = _mm256_mul_ps(y, x); \ + y = _mm256_add_ps(y, \ + *reinterpret_cast(_ps256_cephes_exp_p5)); \ + y = _mm256_mul_ps(y, z); \ + y = _mm256_add_ps(y, x); \ + y = _mm256_add_ps(y, one); \ + /* build 2^n */ \ + imm0 = _mm256_cvttps_epi32(fx) + +__m256 ExpAVX(__m256 x) { + AVXEXP_BASE; + // two AVX2 instructions using SSE2 + imm0 = avx2_mm256_add_epi32(imm0, + *reinterpret_cast(_pi256_0x7f)); + imm0 = avx2_mm256_slli_epi32(imm0, 23); + __m256 pow2n = _mm256_castsi256_ps(imm0); + y = _mm256_mul_ps(y, pow2n); + return y; +} +#endif + +#ifdef __AVX2__ +__m256 ExpAVX2(__m256 x) { + AVXEXP_BASE; + // two AVX2 instructions + imm0 = _mm256_add_epi32(imm0, *reinterpret_cast(_pi256_0x7f)); + imm0 = _mm256_slli_epi32(imm0, 23); + __m256 pow2n = _mm256_castsi256_ps(imm0); + y = _mm256_mul_ps(y, pow2n); + return y; +} +#endif + +} // namespace detail + +#define INTRI8_FLOAT(isa, expisa) \ + template <> \ + void VExpKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + _mm256_storeu_ps(y, expisa(tmp)); \ + } + +#define INTRI16_FLOAT(isa, expisa) \ + template <> \ + void VExpKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + tmp0 = expisa(tmp0); \ + tmp1 = expisa(tmp1); \ + 
_mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx, detail::ExpAVX); +INTRI16_FLOAT(jit::avx, detail::ExpAVX); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2, detail::ExpAVX2); +INTRI16_FLOAT(jit::avx2, detail::ExpAVX2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f, detail::ExpAVX2); +INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2); +#endif +// TODO(TJ): eq16 test and complete avx512 + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT +#undef MKL_FLOAT +#undef MKL_DOUBLE + +REGISTER_JITKERNEL(vexp, VExpKernel); + +/* VSigmoid JitKernel */ +template +class VSigmoidKernelImpl : public VSigmoidKernel { + public: + explicit VSigmoidKernelImpl(int d) : VSigmoidKernel() { + this->num_ = d; + vexp_ = KernelPool::Instance().template Get>(d); + } + void Compute(const T* x, T* y) const override { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < this->num_; ++i) { + y[i] = (x[i] < min) ? min : ((x[i] > max) ? 
max : x[i]); + y[i] = static_cast(0) - y[i]; + } + vexp_->Compute(y, y); + for (int i = 0; i < this->num_; ++i) { + y[i] = static_cast(1) / (static_cast(1) + y[i]); + } + } + + private: + std::shared_ptr> vexp_; +}; + +#define INTRI_SIGMOID(tmp, min, max, expisa) \ + tmp = _mm256_max_ps(tmp, min); \ + tmp = _mm256_min_ps(tmp, max); \ + tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); \ + tmp = expisa(tmp); \ + tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \ + tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp) + +#define INTRI8_FLOAT(isa, expisa) \ + template <> \ + void VSigmoidKernelImpl::Compute(const float* x, float* y) \ + const { \ + /* TODO(TJ): try to use static const*/ \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + __m256 tmp = _mm256_loadu_ps(x); \ + INTRI_SIGMOID(tmp, min, max, expisa); \ + _mm256_storeu_ps(y, tmp); \ + } + +#define INTRI16_FLOAT(isa, expisa) \ + template <> \ + void VSigmoidKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + INTRI_SIGMOID(tmp0, min, max, expisa); \ + INTRI_SIGMOID(tmp1, min, max, expisa); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ + } + +#define INTRI_GT8LT16_FLOAT(isa, expisa) \ + template <> \ + VSigmoidKernelImpl::VSigmoidKernelImpl(int d) \ + : VSigmoidKernel() { \ + this->num_ = d; \ + this->end_ = AVX_FLOAT_BLOCK; \ + this->rest_ = d - this->end_; \ + vexp_ = \ + KernelPool::Instance().template Get>(this->rest_); \ + } \ + template <> \ + void VSigmoidKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + __m256 tmp = _mm256_loadu_ps(x); \ + INTRI_SIGMOID(tmp, min, max, expisa); \ + _mm256_storeu_ps(y, tmp); \ 
+ const float min_ = SIGMOID_THRESHOLD_MIN; \ + const float max_ = SIGMOID_THRESHOLD_MAX; \ + for (int i = this->end_; i < this->num_; ++i) { \ + y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \ + y[i] = 0.f - y[i]; \ + } \ + vexp_->Compute(y + this->end_, y + this->end_); \ + for (int i = this->end_; i < this->num_; ++i) { \ + y[i] = 1.f / (1.f + y[i]); \ + } \ + } + +#define INTRI_GT16_FLOAT(isa, expisa) \ + template <> \ + VSigmoidKernelImpl::VSigmoidKernelImpl(int d) \ + : VSigmoidKernel() { \ + this->num_ = d; \ + this->rest_ = d % AVX_FLOAT_BLOCK; \ + this->end_ = d - this->rest_; \ + vexp_ = \ + KernelPool::Instance().template Get>(this->rest_); \ + } \ + template <> \ + void VSigmoidKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ + __m256 tmp = _mm256_loadu_ps(x + i); \ + INTRI_SIGMOID(tmp, min, max, expisa); \ + _mm256_storeu_ps(y + i, tmp); \ + } \ + const float min_ = SIGMOID_THRESHOLD_MIN; \ + const float max_ = SIGMOID_THRESHOLD_MAX; \ + for (int i = this->end_; i < this->num_; ++i) { \ + y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? 
max_ : x[i]); \ + y[i] = 0.f - y[i]; \ + } \ + vexp_->Compute(y + this->end_, y + this->end_); \ + for (int i = this->end_; i < this->num_; ++i) { \ + y[i] = 1.f / (1.f + y[i]); \ + } \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx, detail::ExpAVX); +INTRI16_FLOAT(jit::avx, detail::ExpAVX); +INTRI_GT8LT16_FLOAT(jit::avx, detail::ExpAVX); +INTRI_GT16_FLOAT(jit::avx, detail::ExpAVX); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2, detail::ExpAVX2); +INTRI16_FLOAT(jit::avx2, detail::ExpAVX2); +// maybe use avx at gt8lt16 and gt16 +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f, detail::ExpAVX2); +INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2); +// maybe use avx2 at gt8lt16 and gt16 +#endif + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT +#undef INTRI_GT8LT16_FLOAT +#undef INTRI_GT16_FLOAT +#undef INTRI_VSIGMOID + +REGISTER_JITKERNEL(vsigmoid, VSigmoidKernel); + +/* VTanh JitKernel */ +template +class VTanhKernelImpl : public VTanhKernel { + public: + explicit VTanhKernelImpl(int d) : VTanhKernel() { + this->num_ = d; + vscal_ = KernelPool::Instance().template Get>(d); + vsigmoid_ = KernelPool::Instance().template Get>(d); + vaddbias_ = KernelPool::Instance().template Get>(d); + } + void Compute(const T* x, T* y) const override { + vscal_->Compute(static_cast(2), x, y); + vsigmoid_->Compute(y, y); + vscal_->Compute(static_cast(2), y); + vaddbias_->Compute(static_cast(-1), y, y); + } + + private: + std::shared_ptr> vscal_; + std::shared_ptr> vsigmoid_; + std::shared_ptr> vaddbias_; +}; + +#define INTRI_VTANH(tmp, expisa) \ + tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), tmp); \ + tmp = _mm256_min_ps(tmp, _mm256_set1_ps(EXP_MAX_INPUT)); \ + tmp = expisa(tmp); \ + tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \ + tmp = _mm256_div_ps(_mm256_set1_ps(2.0f), tmp); \ + tmp = _mm256_sub_ps(tmp, _mm256_set1_ps(1.0f)) + +#define INTRI8_FLOAT(isa, expisa) \ + template <> \ + void VTanhKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp = _mm256_loadu_ps(x); 
\ + INTRI_VTANH(tmp, expisa); \ + _mm256_storeu_ps(y, tmp); \ + } + +#define INTRI16_FLOAT(isa, expisa) \ + template <> \ + void VTanhKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + INTRI_VTANH(tmp0, expisa); \ + INTRI_VTANH(tmp1, expisa); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ + } + +#define INTRI_GT8LT16_FLOAT(isa, expisa) \ + template <> \ + VTanhKernelImpl::VTanhKernelImpl(int d) \ + : VTanhKernel() { \ + this->num_ = d; \ + this->end_ = AVX_FLOAT_BLOCK; \ + this->rest_ = d - this->end_; \ + vscal_ = \ + KernelPool::Instance().template Get>(this->rest_); \ + vsigmoid_ = KernelPool::Instance().template Get>( \ + this->rest_); \ + vaddbias_ = KernelPool::Instance().template Get>( \ + this->rest_); \ + } \ + template <> \ + void VTanhKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + INTRI_VTANH(tmp, expisa); \ + _mm256_storeu_ps(y, tmp); \ + x += AVX_FLOAT_BLOCK; \ + y += AVX_FLOAT_BLOCK; \ + vscal_->Compute(2.f, x, y); \ + vsigmoid_->Compute(y, y); \ + vscal_->Compute(2.f, y); \ + vaddbias_->Compute(-1.f, y, y); \ + } + +#define INTRI_GT16_FLOAT(isa, expisa) \ + template <> \ + VTanhKernelImpl::VTanhKernelImpl(int d) \ + : VTanhKernel() { \ + this->num_ = d; \ + this->rest_ = d % AVX_FLOAT_BLOCK; \ + this->end_ = d - this->rest_; \ + vscal_ = \ + KernelPool::Instance().template Get>(this->rest_); \ + vsigmoid_ = KernelPool::Instance().template Get>( \ + this->rest_); \ + vaddbias_ = KernelPool::Instance().template Get>( \ + this->rest_); \ + } \ + template <> \ + void VTanhKernelImpl::Compute(const float* x, float* y) \ + const { \ + for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ + __m256 tmp = _mm256_loadu_ps(x + i); \ + INTRI_VTANH(tmp, expisa); \ + _mm256_storeu_ps(y + i, tmp); \ + } \ + x += this->end_; \ + y += this->end_; \ + vscal_->Compute(2.f, x, y); \ + 
vsigmoid_->Compute(y, y); \ + vscal_->Compute(2.f, y); \ + vaddbias_->Compute(-1.f, y, y); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx, detail::ExpAVX); +INTRI16_FLOAT(jit::avx, detail::ExpAVX); +INTRI_GT8LT16_FLOAT(jit::avx, detail::ExpAVX); +INTRI_GT16_FLOAT(jit::avx, detail::ExpAVX); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2, detail::ExpAVX2); +INTRI16_FLOAT(jit::avx2, detail::ExpAVX2); +// maybe use avx at gt8lt16 and gt16 +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f, detail::ExpAVX2); +INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2); +// maybe use avx at gt8lt16 and gt16 +#endif + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT +#undef INTRI_GT8LT16_FLOAT +#undef INTRI_GT16_FLOAT +#undef INTRI_VTANH + +REGISTER_JITKERNEL(vtanh, VTanhKernel); + +#undef JITKERNEL_NEW_ACT_IMPL + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_lstm.cc b/paddle/fluid/operators/math/jit_kernel_lstm.cc new file mode 100644 index 0000000000..26bd26e2e1 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_lstm.cc @@ -0,0 +1,360 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include +#include "paddle/fluid/operators/math/jit_kernel_macro.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" + +#ifdef __AVX__ +#include +#endif + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { +namespace detail { +#ifdef __AVX__ +__m256 ExpAVX(__m256 x); +#endif + +#ifdef __AVX2__ +__m256 ExpAVX2(__m256 x); +#endif + +} // namespace detail + +namespace jit = platform::jit; + +#ifdef __AVX__ +typedef enum { kSigmoid, kRelu, kTanh, kIdentity } act_type; + +class AVXAct { + public: + virtual ~AVXAct() = default; + virtual __m256 Compute(__m256 x) const = 0; +}; + +template +class AVXActImpl : public AVXAct { + public: + __m256 Compute(__m256 x) const override { PADDLE_THROW("Unkown type!"); } +}; + +#define AVX_SIGMOID(isa, expisa) \ + template <> \ + __m256 AVXActImpl::Compute(__m256 x) const { \ + __m256 ones = _mm256_set1_ps(1.0f); \ + x = _mm256_max_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MIN)); \ + x = _mm256_min_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MAX)); \ + x = _mm256_sub_ps(_mm256_set1_ps(0.0f), x); \ + x = expisa(x); \ + x = _mm256_add_ps(ones, x); \ + return _mm256_div_ps(ones, x); \ + } + +#define AVX_TANH(isa, expisa) \ + template <> \ + __m256 AVXActImpl::Compute(__m256 x) const { \ + __m256 ones = _mm256_set1_ps(1.0f); \ + x = _mm256_mul_ps(_mm256_set1_ps(-2.0f), x); \ + x = _mm256_min_ps(x, _mm256_set1_ps(EXP_MAX_INPUT)); \ + x = expisa(x); \ + x = _mm256_add_ps(ones, x); \ + x = _mm256_div_ps(_mm256_set1_ps(2.0f), x); \ + return _mm256_sub_ps(x, ones); \ + } + +#define AVX_RELU(isa) \ + template <> \ + __m256 AVXActImpl::Compute(__m256 x) const { \ + return _mm256_max_ps(x, _mm256_setzero_ps()); \ + } + +#define AVX_IDENTITY(isa) \ + template <> \ + __m256 AVXActImpl::Compute(__m256 x) const { \ + return x; \ + } + +#define FOR_EACH_AVX_ISA(macro_) \ + macro_(jit::avx); \ + macro_(jit::avx2); \ + 
macro_(jit::avx512f) + +FOR_EACH_AVX_ISA(AVX_RELU); +FOR_EACH_AVX_ISA(AVX_IDENTITY); + +AVX_SIGMOID(jit::avx, detail::ExpAVX); +AVX_TANH(jit::avx, detail::ExpAVX); + +#ifdef __AVX2__ +AVX_SIGMOID(jit::avx2, detail::ExpAVX2); +AVX_SIGMOID(jit::avx512f, detail::ExpAVX2); +AVX_TANH(jit::avx2, detail::ExpAVX2); +AVX_TANH(jit::avx512f, detail::ExpAVX2); +#endif + +#undef FOR_EACH_AVX_ISA +#undef AVX_IDENTITY +#undef AVX_RELU +#undef AVX_TANH +#undef AVX_SIGMOID + +#endif + +template +static std::shared_ptr> GetActKernel( + const std::string& type, int n) { + if (type == "sigmoid") { + return std::dynamic_pointer_cast>( + KernelPool::Instance().template Get>(n)); + } else if (type == "relu") { + return std::dynamic_pointer_cast>( + KernelPool::Instance().template Get>(n)); + } else if (type == "tanh") { + return std::dynamic_pointer_cast>( + KernelPool::Instance().template Get>(n)); + } else if (type == "identity" || type == "") { + return std::dynamic_pointer_cast>( + KernelPool::Instance().template Get>(n)); + } + PADDLE_THROW("Not support type: %s", type); + return nullptr; +} + +/* LSTM JitKernel */ +template +class LSTMKernelImpl : public LSTMKernel { + public: + explicit LSTMKernelImpl(const std::string& act_gate, + const std::string& act_cand, + const std::string& act_cell, int d) + : LSTMKernel() { + d_ = d; + d2_ = d * 2; + d3_ = d * 3; + act_gate_d3_ = GetActKernel(act_gate, d3_); + act_gate_d_ = GetActKernel(act_gate, d); + act_cand_d_ = GetActKernel(act_cand, d); + act_cell_d_ = GetActKernel(act_cell, d); + vmul_d_ = KernelPool::Instance().template Get>(d); + vadd_d_ = KernelPool::Instance().template Get>(d); + } + + void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, + T* checked) const override { + // gates: W_ch, W_ih, W_fh, W_oh + act_gate_d3_->Compute(gates + d_, gates + d_); + + /* C_t = C_t-1 * fgated + cand_gated * igated */ + act_cand_d_->Compute(gates, gates); + vmul_d_->Compute(gates, gates + d_, gates + d_); + 
vmul_d_->Compute(ct_1, gates + d2_, gates + d2_); + vadd_d_->Compute(gates + d_, gates + d2_, ct); + + /* H_t = act_cell(C_t) * ogated */ + act_cell_d_->Compute(ct, gates + d2_); + vmul_d_->Compute(gates + d2_, gates + d3_, ht); + } + void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { + /* C_t = igated * cgated*/ + act_gate_d_->Compute(gates + d_, gates + d_); + act_cand_d_->Compute(gates, gates); + vmul_d_->Compute(gates, gates + d_, ct); + /* H_t = act_cell(C_t) * ogated */ + act_gate_d_->Compute(gates + d3_, gates + d3_); + act_cell_d_->Compute(ct, gates + d2_); + vmul_d_->Compute(gates + d2_, gates + d3_, ht); + } + + private: + int d_, d2_, d3_; + std::shared_ptr> act_gate_d3_, act_gate_d_, act_cand_d_, + act_cell_d_; + std::shared_ptr> vmul_d_; + std::shared_ptr> vadd_d_; +#ifdef __AVX__ + std::unique_ptr avx_act_gate_, avx_act_cand_, avx_act_cell_; +#endif +}; + +#define INTRI8_FLOAT(isa) \ + template <> \ + LSTMKernelImpl::LSTMKernelImpl( \ + const std::string& act_gate, const std::string& act_cand, \ + const std::string& act_cell, int d) \ + : LSTMKernel() { \ + auto GetAVXAct = [&](const std::string& type) -> std::unique_ptr { \ + if (type == "sigmoid") { \ + return std::unique_ptr(new AVXActImpl()); \ + } else if (type == "relu") { \ + return std::unique_ptr(new AVXActImpl()); \ + } else if (type == "tanh") { \ + return std::unique_ptr(new AVXActImpl()); \ + } else if (type == "identity" || type == "") { \ + return std::unique_ptr(new AVXActImpl()); \ + } \ + PADDLE_THROW("Not support type: %s", type); \ + }; \ + avx_act_gate_ = GetAVXAct(act_gate); \ + avx_act_cand_ = GetAVXAct(act_cand); \ + avx_act_cell_ = GetAVXAct(act_cell); \ + } \ + template <> \ + void LSTMKernelImpl::ComputeCtHt( \ + float* gates, const float* ct_1, float* ct, float* ht, \ + const float* wp_data, float* checked) const { \ + /* gates: W_ch, W_ih, W_fh, W_oh */ \ + __m256 c, i, f, o; \ + c = _mm256_loadu_ps(gates); \ + i = _mm256_loadu_ps(gates + 8); \ + f 
= _mm256_loadu_ps(gates + 16); \ + o = _mm256_loadu_ps(gates + 24); \ + /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ + c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \ + i = _mm256_loadu_ps(ct_1); \ + f = _mm256_mul_ps(i, avx_act_gate_->Compute(f)); \ + f = _mm256_add_ps(c, f); \ + _mm256_storeu_ps(ct, f); \ + /* H_t = act_cell(C_t) * ogated */ \ + o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \ + _mm256_storeu_ps(ht, o); \ + } \ + template <> \ + void LSTMKernelImpl::ComputeC1H1( \ + float* gates, float* ct, float* ht, const float* wp_data) const { \ + __m256 c, i, o; \ + c = _mm256_loadu_ps(gates); \ + i = _mm256_loadu_ps(gates + 8); \ + o = _mm256_loadu_ps(gates + 24); \ + /* C_t = igated * cgated*/ \ + c = _mm256_mul_ps(avx_act_gate_->Compute(i), avx_act_cand_->Compute(c)); \ + _mm256_storeu_ps(ct, c); \ + /* H_t = act_cell(C_t) * ogated */ \ + o = _mm256_mul_ps(avx_act_cell_->Compute(c), avx_act_gate_->Compute(o)); \ + _mm256_storeu_ps(ht, o); \ + } + +// TODO(TJ): optimize keq16 + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +#endif + +/* Peephole JitKernel */ +template +class PeepholeKernelImpl : public LSTMKernel { + public: + explicit PeepholeKernelImpl(const std::string& act_gate, + const std::string& act_cand, + const std::string& act_cell, int d) + : LSTMKernel() { + d_ = d; + d2_ = d * 2; + d3_ = d * 3; + act_gate_d_ = GetActKernel(act_gate, d); + act_cand_d_ = GetActKernel(act_cand, d); + act_cell_d_ = GetActKernel(act_cell, d); + vmul_d_ = KernelPool::Instance().template Get>(d); + vadd_d_ = KernelPool::Instance().template Get>(d); + vadd_d2_ = KernelPool::Instance().template Get>(d2_); + act_gate_d2_ = GetActKernel(act_gate, d2_); + } + + void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, + T* checked) const override { + /* get fgated and igated*/ + 
vmul_d_->Compute(wp_data, ct_1, checked); + vmul_d_->Compute(wp_data + d_, ct_1, checked + d_); + vadd_d2_->Compute(checked, gates + d_, gates + d_); + act_gate_d2_->Compute(gates + d_, gates + d_); + /* C_t = C_t-1 * fgated + cand_gated * igated*/ + act_cand_d_->Compute(gates, gates); + vmul_d_->Compute(gates, gates + d_, gates + d_); + vmul_d_->Compute(ct_1, gates + d2_, gates + d2_); + vadd_d_->Compute(gates + d_, gates + d2_, ct); + /* get ogated*/ + vmul_d_->Compute(wp_data + d2_, ct, gates + d_); + vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_); + act_gate_d_->Compute(gates + d3_, gates + d3_); + /* H_t = act_cell(C_t) * ogated */ + act_cell_d_->Compute(ct, gates + d2_); + vmul_d_->Compute(gates + d2_, gates + d3_, ht); + } + + void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { + /* C_t = igated * cgated*/ + act_gate_d_->Compute(gates + d_, gates + d_); + act_cand_d_->Compute(gates, gates); + vmul_d_->Compute(gates, gates + d_, ct); + /* get outgated, put W_oc * C_t on igated */ + vmul_d_->Compute(wp_data + d2_, ct, gates + d_); + vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_); + /* H_t = act_cell(C_t) * ogated */ + act_gate_d_->Compute(gates + d3_, gates + d3_); + act_cell_d_->Compute(ct, gates + d2_); + vmul_d_->Compute(gates + d2_, gates + d3_, ht); + } + + private: + int d_, d2_, d3_; + std::shared_ptr> act_gate_d2_, act_gate_d_, act_cand_d_, + act_cell_d_; + std::shared_ptr> vmul_d_; + std::shared_ptr> vadd_d_, vadd_d2_; +}; + +#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype) \ + template <> \ + std::shared_ptr> \ + KernelPool::Get, const std::string&, \ + const std::string&, const std::string&, int, bool>( \ + const std::string& act_gate, const std::string& act_cand, \ + const std::string& act_cell, int d, bool use_peephole) + +#define JITKERNEL_KEY_LSTM(ker_key, dtype_key) \ + #ker_key #dtype_key + std::to_string(d) + act_gate + act_cand + act_cell + \ + (use_peephole ? 
"p" : "n") + +#define JITKERNEL_NEW_LSTM_IMPL(ker, dtype, isa, k) \ + if (use_peephole) { \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>( \ + act_gate, act_cand, act_cell, d)); \ + } else { \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>(act_gate, act_cand, \ + act_cell, d)); \ + } + +REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DECLARE_LSTM, + JITKERNEL_KEY_LSTM, JITKERNEL_NEW_LSTM_IMPL); + +#undef INTRI8_FLOAT +#undef JITKERNEL_DECLARE_LSTM +#undef JITKERNEL_KEY_LSTM +#undef JITKERNEL_NEW_LSTM_IMPL +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h new file mode 100644 index 0000000000..d8e55f2673 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_macro.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { + +namespace jit = platform::jit; + +#define SEARCH_BLOCK(macro_, ker, dtype, isa) \ + if (d < AVX_FLOAT_BLOCK) { \ + macro_(ker, dtype, isa, kLT8); \ + } else if (d == AVX_FLOAT_BLOCK) { \ + macro_(ker, dtype, isa, kEQ8); \ + } else if (d > AVX_FLOAT_BLOCK && d < AVX512_FLOAT_BLOCK) { \ + macro_(ker, dtype, isa, kGT8LT16); \ + } else if (d == AVX512_FLOAT_BLOCK) { \ + macro_(ker, dtype, isa, kEQ16); \ + } else { \ + macro_(ker, dtype, isa, kGT16); \ + } + +#define SEARCH_ISA_BLOCK(macro_, ker, dtype) \ + if (jit::MayIUse(jit::avx512f)) { \ + SEARCH_BLOCK(macro_, ker, dtype, jit::avx512f); \ + } else if (jit::MayIUse(jit::avx2)) { \ + SEARCH_BLOCK(macro_, ker, dtype, jit::avx2); \ + } else if (jit::MayIUse(jit::avx)) { \ + SEARCH_BLOCK(macro_, ker, dtype, jit::avx); \ + } else { \ + SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any); \ + } + +#define JITKERNEL_DECLARE(ker_class, ker_dtype) \ + template <> \ + std::shared_ptr> \ + KernelPool::Get, int>(int d) + +#define JITKERNEL_KEY(ker_key, dtype_key) \ + #ker_key #dtype_key + std::to_string(d) + +#define JITKERNEL_NEW_IMPL(ker, dtype, isa, k) \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>(d)) + +#define JITKERNEL_WITH_DTYPE(ker_key, ker_class, ker_dtype, dtype_key, \ + marco_declare, macro_key, macro_impl) \ + marco_declare(ker_class, ker_dtype) { \ + std::string key = macro_key(ker_key, dtype_key); \ + if (kers_.find(key) == kers_.end()) { \ + std::shared_ptr> p; \ + SEARCH_ISA_BLOCK(macro_impl, ker_class, ker_dtype); \ + kers_.insert({key, std::dynamic_pointer_cast(p)}); \ + return p; \ + } \ + return std::dynamic_pointer_cast>( \ + kers_.at(key)); \ + } + +#define REGISTER_JITKERNEL(ker_key, ker_class) \ + JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, JITKERNEL_DECLARE, \ + JITKERNEL_KEY, JITKERNEL_NEW_IMPL); \ + 
JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, JITKERNEL_DECLARE, \ + JITKERNEL_KEY, JITKERNEL_NEW_IMPL) + +#define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_declare, macro_key, \ + macro_impl) \ + JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, marco_declare, macro_key, \ + macro_impl); \ + JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, marco_declare, \ + macro_key, macro_impl) + +#define FOR_EACH_ISA(macro_, block) \ + macro_(jit::avx512f, block); \ + macro_(jit::avx2, block); \ + macro_(jit::avx, block); \ + macro_(jit::isa_any, block) + +#define FOR_EACH_BLOCK(macro_, isa) \ + macro_(isa, kLT8); \ + macro_(isa, kEQ8); \ + macro_(isa, kGT8LT16); \ + macro_(isa, kEQ16); \ + macro_(isa, kGT16) + +#define FOR_EACH_ISA_BLOCK(macro_) \ + FOR_EACH_BLOCK(macro_, jit::avx512f); \ + FOR_EACH_BLOCK(macro_, jit::avx2); \ + FOR_EACH_BLOCK(macro_, jit::avx); \ + FOR_EACH_BLOCK(macro_, jit::isa_any) + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc new file mode 100644 index 0000000000..c9e6ab740d --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -0,0 +1,807 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include +#include // for exp +#include // for memcpy +#include +#include +#include +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef __AVX__ +#include +#endif + +constexpr int repeat = 20000; + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + +template +void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), + const T upper = static_cast(20.f)) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution uniform_dist(0, 1); + for (int i = 0; i < n; ++i) { + a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); + } +} + +void vrelu_ref(const int n, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0.f ? x[i] : 0.f; + } +} + +#if defined __AVX__ || defined __AVX2__ +void vrelu_intri8(const int n, const float* x, float* y) { + __m256 tmp = _mm256_loadu_ps(x); + tmp = _mm256_max_ps(tmp, _mm256_setzero_ps()); + _mm256_storeu_ps(y, tmp); +} +#endif + +TEST(JitKernel, vrelu) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 256, 512}) { + std::vector x(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data(), -10.f, 1.f); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vrelu_ref(d, x_data, zref_data); + } + auto trefe = GetCurrentUS(); +#if defined __AVX__ || defined __AVX2__ + if (d == 8) { + auto si0 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vrelu_intri8(d, x_data, zref_data); + } + auto si1 = GetCurrentUS(); + VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; + 
} +#endif + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void vaddbias_ref(const int n, const float a, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] + a; + } +} + +TEST(JitKernel, vaddbias) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 64, 100, 128, 256}) { + std::vector x(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data(), -2.f, 2.f); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float a = 2.f; + const float* x_data = x.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vaddbias_ref(d, a, x_data, zref_data); + } + auto trefe = GetCurrentUS(); + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(a, x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void vexp_ref(const int n, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } +} + +#ifdef PADDLE_WITH_MKLML +void vexp_mkl(const int n, const float* x, float* y) { + paddle::platform::dynload::vsExp(n, x, y); +} +#endif + +TEST(JitKernel, vexp) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 128, 256}) { + std::vector x(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data(), -2.f, 2.f); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const 
float* x_data = x.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vexp_ref(d, x_data, zref_data); + } + auto trefe = GetCurrentUS(); + +#ifdef PADDLE_WITH_MKLML + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vexp_mkl(d, x_data, zref_data); + } + auto tmkle = GetCurrentUS(); +#endif + + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat +#ifdef PADDLE_WITH_MKLML + << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " +#else + << " us, " +#endif + << "tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +inline float _sigmoid(float x) { + const float min = SIGMOID_THRESHOLD_MIN; + const float max = SIGMOID_THRESHOLD_MAX; + float tmp = (x < min) ? min : ((x > max) ? max : x); + return 1.f / (1.f + std::exp(-tmp)); +} + +void vsigmoid_ref(const int n, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = _sigmoid(x[i]); + } +} + +void vsigmoid_better( + const std::shared_ptr< + const paddle::operators::math::jitkernel::VExpKernel>& vexp, + const int n, const float* x, float* y) { + const float min = SIGMOID_THRESHOLD_MIN; + const float max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + y[i] = (x[i] < min) ? min : ((x[i] > max) ? 
max : x[i]); + y[i] = 0.f - y[i]; + } + vexp->Compute(y, y); + for (int i = 0; i < n; ++i) { + y[i] = 1.f / (1.f + y[i]); + } +} + +TEST(JitKernel, vsigmoid) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { + std::vector x(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data(), -2.f, 2.f); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const auto& vexp = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vsigmoid_better(vexp, d, x_data, zref_data); + } + auto tmkle = GetCurrentUS(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vsigmoid_ref(d, x_data, zref_data); + } + auto trefe = GetCurrentUS(); + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +inline float _tanh(float x) { return 2.f * _sigmoid(2.f * x) - 1.f; } + +void vtanh_ref(const int n, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = _tanh(x[i]); + } +} + +void vtanh_better( + const std::shared_ptr< + const paddle::operators::math::jitkernel::VScalKernel>& vscal, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VSigmoidKernel>& + vsigmoid, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VAddBiasKernel>& + vaddbias, + const int n, const float* x, float* y) { + vscal->Compute(2.f, x, y); + vsigmoid->Compute(y, y); + vscal->Compute(2.f, y); + vaddbias->Compute(-1.f, y, y); +} + 
+TEST(JitKernel, vtanh) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { + std::vector x(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data(), -2.f, 2.f); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const auto& vscal = + jit::KernelPool::Instance().template Get>(d); + const auto& vsigmoid = + jit::KernelPool::Instance().template Get>(d); + const auto& vaddbias = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vtanh_better(vscal, vsigmoid, vaddbias, d, x_data, zref_data); + } + auto tmkle = GetCurrentUS(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vtanh_ref(d, x_data, zref_data); + } + auto trefe = GetCurrentUS(); + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void lstm_ctht_ref( + const std::shared_ptr< + const paddle::operators::math::jitkernel::VSigmoidKernel>& + vsigmoid_3d, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VTanhKernel>& vtanh_d, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VExpKernel>& vexp_1, + const int d, float* gates, const float* ct_1, float* ct, float* ht) { + vsigmoid_3d->Compute(gates + d, gates + d); + vtanh_d->Compute(gates, gates); + const float *i = gates + d, *f = gates + d * 2, *o = gates + d * 3; + const float min = SIGMOID_THRESHOLD_MIN; + const float max = SIGMOID_THRESHOLD_MAX; + for (int k = 0; k < d; ++k) { + // 
C_t = C_t-1 * fgated + cand_gated * igated + ct[k] = ct_1[k] * f[k] + gates[k] * i[k]; + // H_t = act_cell(C_t) * ogated + float tmp = ct[k] * 2; + tmp = 0.f - ((tmp < min) ? min : ((tmp > max) ? max : tmp)); + vexp_1->Compute(&tmp, &tmp); + tmp = 2.f / (1.f + tmp) - 1.f; + ht[k] = tmp * o[k]; + } +} + +void lstm_ctht_better( + const std::shared_ptr< + const paddle::operators::math::jitkernel::VSigmoidKernel>& + vsigmoid_3d, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VTanhKernel>& vtanh_d, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VMulKernel>& vmul_d, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VAddKernel>& vadd_d, + const int d, float* gates, const float* ct_1, float* ct, float* ht) { + int d2 = d * 2; + vsigmoid_3d->Compute(gates + d, gates + d); + vtanh_d->Compute(gates, gates); + vmul_d->Compute(gates, gates + d, gates + d); + vmul_d->Compute(ct_1, gates + d2, gates + d2); + vadd_d->Compute(gates + d, gates + d2, ct); + /* H_t = act_cell(C_t) * ogated */ + vtanh_d->Compute(ct, gates + d2); + vmul_d->Compute(gates + d2, gates + d * 3, ht); +} + +TEST(JitKernel, lstm) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 32, 64, 100}) { + int d4 = d * 4; + int d3 = d * 3; + std::vector x(d4), xref(d4); + std::vector ct_1(d), ct_tgt(d), ht_tgt(d); + std::vector ct_ref(d), ht_ref(d); + RandomVec(d4, x.data(), -2.f, 2.f); + RandomVec(d, ct_1.data(), -2.f, 2.f); + memcpy(xref.data(), x.data(), sizeof(float) * d4); + std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh"; + const auto& ker = + jit::KernelPool::Instance() + .template Get, const std::string&, + const std::string&, const std::string&>( + act_gate, act_cand, act_cell, d, false); + // below kernels are used to compute refer + const auto& vsigmoid_3d = + jit::KernelPool::Instance().template Get>( + d3); + const auto& vtanh_d = + jit::KernelPool::Instance().template Get>(d); + 
const auto& vexp_1 = + jit::KernelPool::Instance().template Get>(1); + const auto& vmul_d = + jit::KernelPool::Instance().template Get>(d); + const auto& vadd_d = + jit::KernelPool::Instance().template Get>(d); + + float* x_data = x.data(); + float* xref_data = xref.data(); + const float* ct_1_data = ct_1.data(); + float* ct_tgt_data = ct_tgt.data(); + float* ht_tgt_data = ht_tgt.data(); + float* ct_ref_data = ct_ref.data(); + float* ht_ref_data = ht_ref.data(); + // compute once to check correctness + lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data, + ct_ref_data, ht_ref_data); + ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data); + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ct_tgt_data[i], ct_ref_data[i], 1e-3); + EXPECT_NEAR(ht_tgt_data[i], ht_ref_data[i], 1e-3); + } + + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + lstm_ctht_better(vsigmoid_3d, vtanh_d, vmul_d, vadd_d, d, xref_data, + ct_1_data, ct_ref_data, ht_ref_data); + } + auto tmkle = GetCurrentUS(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data, + ct_ref_data, ht_ref_data); + } + auto trefe = GetCurrentUS(); + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data); + } + auto ttgte = GetCurrentUS(); + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, better(jit) takes: " << (tmkle - tmkls) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; + } +} + +void vscal_ref(const int n, const float a, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = a * x[i]; + } +} +void vscal_inp_ref(const int n, const float a, float* x) { + for (int i = 0; i < n; ++i) { + x[i] = a * x[i]; + } +} +#if defined __AVX__ || defined __AVX2__ +void vscal_intri8(const int n, const float a, const float* x, float* y) { + __m256 tmp; + __m256 scalar = 
_mm256_set1_ps(a); + tmp = _mm256_loadu_ps(x); + tmp = _mm256_mul_ps(tmp, scalar); + _mm256_storeu_ps(y, tmp); +} +void vscal_inp_intri8(const int n, const float a, float* x) { + __m256 tmp; + __m256 scalar = _mm256_set1_ps(a); + tmp = _mm256_loadu_ps(x); + tmp = _mm256_mul_ps(tmp, scalar); + _mm256_storeu_ps(x, tmp); +} +#endif + +#ifdef PADDLE_WITH_MKLML +void vscal_inp_mkl(const int n, const float a, float* x) { + paddle::platform::dynload::cblas_sscal(n, a, x, 1); +} +#endif + +TEST(JitKernel, vscal) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 256, 512}) { + std::vector x(d), y(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data()); + std::memcpy(y.data(), x.data(), sizeof(float) * d); + float a = 2.f; + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + float* y_data = y.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vscal_ref(d, a, x_data, zref_data); + } + auto trefe = GetCurrentUS(); + auto trefs1 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vscal_inp_ref(d, a, y_data); + } + auto trefe1 = GetCurrentUS(); + +#ifdef PADDLE_WITH_MKLML + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vscal_inp_mkl(d, a, y_data); + } + auto tmkle = GetCurrentUS(); +#endif + +#if defined __AVX__ || defined __AVX2__ + if (d == 8) { + auto si0 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vscal_intri8(d, a, x_data, zref_data); + } + auto si1 = GetCurrentUS(); + auto si2 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vscal_inp_intri8(d, a, y_data); + } + auto si3 = GetCurrentUS(); + VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat + << " us, inplace: " << (si3 - si2) / repeat; + } +#endif + + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(a, x_data, ztgt_data); + } + auto ttgte = 
GetCurrentUS(); + auto ttgts1 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(a, y_data); + } + auto ttgte1 = GetCurrentUS(); + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, inplace takes: " << (trefe1 - trefs1) / repeat +#ifdef PADDLE_WITH_MKLML + << " us, mkl inplace takes: " << (tmkle - tmkls) / repeat << " us, " +#else + << " us, " +#endif + << "tgt takes: " << (ttgte - ttgts) / repeat + << "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void vmul_ref(const int n, const float* x, const float* y, float* z) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } +} + +#if defined __AVX__ || defined __AVX2__ +void vmul_intri8(const int n, const float* x, const float* y, float* z) { + __m256 tmpx, tmpy; + tmpx = _mm256_loadu_ps(x); + tmpy = _mm256_loadu_ps(y); + tmpx = _mm256_mul_ps(tmpx, tmpy); + _mm256_storeu_ps(z, tmpx); +} +#endif + +#ifdef PADDLE_WITH_MKLML +void vmul_mkl(const int n, const float* x, const float* y, float* z) { + paddle::platform::dynload::vsMul(n, x, y, z); +} +#endif + +TEST(JitKernel, vmul) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 256, 512}) { + std::vector x(d), y(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data()); + RandomVec(d, y.data()); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + const float* y_data = y.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vmul_ref(d, x_data, y_data, zref_data); + } + auto trefe = GetCurrentUS(); + +#ifdef PADDLE_WITH_MKLML + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vmul_mkl(d, x_data, y_data, zref_data); + } + auto tmkle = GetCurrentUS(); +#endif + +#if defined __AVX__ || defined __AVX2__ + if (d == 
8) { + auto si0 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vmul_intri8(d, x_data, y_data, zref_data); + } + auto si1 = GetCurrentUS(); + VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; + } +#endif + + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, y_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat +#ifdef PADDLE_WITH_MKLML + << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " +#else + << " us, " +#endif + << "tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void vadd_ref(const int n, const float* x, const float* y, float* z) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + } +} + +#if defined __AVX__ || defined __AVX2__ +void vadd_intri8(const int n, const float* x, const float* y, float* z) { + __m256 tmpx, tmpy; + tmpx = _mm256_loadu_ps(x); + tmpy = _mm256_loadu_ps(y); + tmpx = _mm256_add_ps(tmpx, tmpy); + _mm256_storeu_ps(z, tmpx); +} +#endif + +#ifdef PADDLE_WITH_MKLML +void vadd_mkl(const int n, const float* x, const float* y, float* z) { + paddle::platform::dynload::vsAdd(n, x, y, z); +} +#endif + +TEST(JitKernel, vadd) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 256, 512}) { + std::vector x(d), y(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data()); + RandomVec(d, y.data()); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + const float* y_data = y.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vadd_ref(d, x_data, y_data, zref_data); + } + auto trefe = GetCurrentUS(); + +#ifdef PADDLE_WITH_MKLML + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vadd_mkl(d, x_data, y_data, 
zref_data); + } + auto tmkle = GetCurrentUS(); +#endif + +#if defined __AVX__ || defined __AVX2__ + if (d == 8) { + auto si0 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vadd_intri8(d, x_data, y_data, zref_data); + } + auto si1 = GetCurrentUS(); + VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; + } +#endif + + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, y_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat +#ifdef PADDLE_WITH_MKLML + << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " +#else + << " us, " +#endif + << "tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void vaddrelu_ref(const int n, const float* x, const float* y, float* z) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + z[i] = z[i] > 0 ? z[i] : 0; + } +} +void vaddrelu_better( + const std::shared_ptr< + const paddle::operators::math::jitkernel::VAddKernel>& vadd, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VReluKernel>& vrelu, + const float* x, const float* y, float* z) { + vadd->Compute(x, y, z); + vrelu->Compute(z, z); +} + +TEST(JitKernel, vaddrelu) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 256, 512}) { + std::vector x(d), y(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data()); + RandomVec(d, y.data()); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const auto& vadd = + jit::KernelPool::Instance().template Get>(d); + const auto& vrelu = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + const float* y_data = y.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vadd_ref(d, x_data, y_data, zref_data); + } + auto 
trefe = GetCurrentUS(); + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vaddrelu_better(vadd, vrelu, x_data, y_data, zref_data); + } + auto tmkle = GetCurrentUS(); + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, y_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, better takes: " << (tmkle - tmkls) / repeat << " us, " + << "tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +TEST(JitKernel, pool) { + namespace jit = paddle::operators::math::jitkernel; + const int frame_size = 4; + std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh"; + const auto& plstm1 = + jit::KernelPool::Instance() + .template Get, const std::string&, + const std::string&, const std::string&>( + act_gate, act_cand, act_cell, frame_size, false); + const auto& plstm2 = + jit::KernelPool::Instance() + .template Get, const std::string&, + const std::string&, const std::string&>( + act_gate, act_cand, act_cell, frame_size, false); + const auto& peephole = + jit::KernelPool::Instance() + .template Get, const std::string&, + const std::string&, const std::string&>( + act_gate, act_cand, act_cell, frame_size, true); + EXPECT_TRUE(plstm1 != peephole); + + const auto& pvmul_f = + jit::KernelPool::Instance().template Get>(4); + EXPECT_TRUE(std::dynamic_pointer_cast(plstm2) != + std::dynamic_pointer_cast(pvmul_f)); + + const auto& pvmul_d = + jit::KernelPool::Instance().template Get>(4); + EXPECT_TRUE(std::dynamic_pointer_cast(pvmul_f) != + std::dynamic_pointer_cast(pvmul_d)); + + const auto& pvmul_from_key = jit::KernelPool::Instance().Get("vmulf4"); + EXPECT_EQ(pvmul_f, pvmul_from_key); + const auto& pvmul_from_key2 = jit::KernelPool::Instance().Get("vmulf5"); + EXPECT_TRUE(pvmul_from_key2 == nullptr); +} diff --git 
a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 8e8baf49b2..08f57dd45a 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" namespace paddle { @@ -150,6 +151,45 @@ template struct SelectedRowsAddTo; template struct SelectedRowsAddTo; template struct SelectedRowsAddTo; +template +struct SelectedRowsSumTo { + void operator()(const platform::CPUDeviceContext& context, + const std::vector& input1, + const std::vector& input2_offsets, + framework::SelectedRows* input2) { + // Ensure all selected rows have the same height + size_t size = 0u; + for (auto iter = input1.begin(); iter != input1.end(); ++iter) { + auto& in_rows = (*iter)->rows(); + size += in_rows.end() - in_rows.begin(); + auto in1_height = (*iter)->height(); + PADDLE_ENFORCE_EQ(in1_height, input2->height()); + } + // concat rows + std::vector in2_rows; + in2_rows.reserve(in2_rows.size() + size); + for (auto iter = input1.begin(); iter != input1.end(); ++iter) { + const framework::Vector& in_rows = (*iter)->rows(); + in2_rows.insert(in2_rows.end(), in_rows.begin(), in_rows.end()); + } + input2->set_rows(in2_rows); + + auto* in2_value = input2->mutable_value(); + auto* in2_data = in2_value->data(); + auto blas = math::GetBlas(context); + size_t offset = 0u; + for (size_t i = 0u; i != input1.size(); ++i) { + auto& in_value = input1[i]->value(); + const auto* in_data = in_value.data(); + offset += input2_offsets[i]; + blas.VCOPY(in_value.numel(), in_data, in2_data + offset); + } + } +}; + 
+template struct SelectedRowsSumTo; +template struct SelectedRowsSumTo; + template struct SelectedRowsAddToTensor { void operator()(const platform::CPUDeviceContext& context, @@ -207,35 +247,45 @@ struct MergeAdd { const framework::SelectedRows& input, framework::SelectedRows* output) { framework::SelectedRows& out = *output; - auto input_rows = input.rows(); - std::set row_set(input_rows.begin(), input_rows.end()); - std::vector merge_rows(row_set.begin(), row_set.end()); + std::vector input_rows(input.rows()); - auto input_width = input.value().dims()[1]; - out.set_rows(merge_rows); + std::map> merge_row_map; + for (size_t i = 0; i < input_rows.size(); ++i) { + merge_row_map[input_rows[i]].push_back(i); + } + + std::vector merge_rows(merge_row_map.size()); + size_t idx = 0; + int64_t input_width = input.value().dims()[1]; out.set_height(input.height()); - out.mutable_value()->mutable_data( + + T* out_data = out.mutable_value()->mutable_data( framework::make_ddim( {static_cast(merge_rows.size()), input_width}), context.GetPlace()); - - math::SetConstant constant_functor; - constant_functor(context, out.mutable_value(), 0.0); - - auto* out_data = out.mutable_value()->data(); - auto* input_data = input.value().data(); - - for (size_t i = 0; i < input_rows.size(); i++) { - size_t out_i = FindPos(merge_rows, input_rows[i]); - for (int64_t j = 0; j < input_width; j++) { - out_data[out_i * input_width + j] += input_data[i * input_width + j]; + const T* in_data = input.value().data(); + + for (auto& row_pair : merge_row_map) { + auto* out_ptr = out_data + idx * input_width; + auto& rows = row_pair.second; + merge_rows[idx] = row_pair.first; + ++idx; + // rows.size() is always larger than 0 + std::memcpy(out_ptr, in_data + rows[0] * input_width, + sizeof(T) * input_width); + + for (size_t i = 1; i < rows.size(); ++i) { + auto* in_ptr = in_data + rows[i] * input_width; + for (int64_t j = 0; j < input_width; ++j) { + out_ptr[j] += in_ptr[j]; + } } } + + 
out.set_rows(merge_rows); } }; -template struct MergeAdd; -template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index aa419f74fc..900be86f91 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -12,8 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once + +#include +#include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" #define INLINE_FOR2(sizei, sizej) \ @@ -49,6 +55,15 @@ struct SelectedRowsAddTo { const int64_t input2_offset, framework::SelectedRows* input2); }; +// input2 = [all input in input1] + input2 +template +struct SelectedRowsSumTo { + void operator()(const DeviceContext& context, + const std::vector& input1, + const std::vector& input2_offsets, + framework::SelectedRows* input2); +}; + // input2 = input1 + input2 template struct SelectedRowsAddToTensor { @@ -70,6 +85,104 @@ struct MergeAdd { framework::SelectedRows* output); }; +template <> +struct MergeAdd { + framework::SelectedRows operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input) { + framework::SelectedRows out; + (*this)(context, input, &out); + return out; + } + + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input, + framework::SelectedRows* output) { + framework::SelectedRows& out = *output; + std::vector input_rows(input.rows()); + + std::map> merge_row_map; + for (size_t i = 0; i < input_rows.size(); ++i) { + merge_row_map[input_rows[i]].push_back(i); + 
} + + std::vector merge_rows(merge_row_map.size()); + size_t idx = 0; + int64_t input_width = input.value().dims()[1]; + out.set_height(input.height()); + + auto* out_data = out.mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), input_width}), + context.GetPlace()); + auto* in_data = input.value().data(); + + auto blas = GetBlas(context); + for (auto& row_pair : merge_row_map) { + auto* out_ptr = out_data + idx * input_width; + auto& rows = row_pair.second; + merge_rows[idx] = row_pair.first; + ++idx; + // rows.size() is always larger than 0 + blas.VCOPY(input_width, in_data + rows[0] * input_width, out_ptr); + + for (size_t i = 1; i < rows.size(); ++i) { + blas.AXPY(input_width, 1., in_data + rows[i] * input_width, out_ptr); + } + } + + out.set_rows(merge_rows); + } +}; + +template <> +struct MergeAdd { + framework::SelectedRows operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input) { + framework::SelectedRows out; + (*this)(context, input, &out); + return out; + } + + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input, + framework::SelectedRows* output) { + framework::SelectedRows& out = *output; + std::vector input_rows(input.rows()); + + std::map> merge_row_map; + for (size_t i = 0; i < input_rows.size(); ++i) { + merge_row_map[input_rows[i]].push_back(i); + } + + std::vector merge_rows(merge_row_map.size()); + size_t idx = 0; + int64_t input_width = input.value().dims()[1]; + out.set_height(input.height()); + + auto* out_data = out.mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), input_width}), + context.GetPlace()); + auto* in_data = input.value().data(); + + auto blas = GetBlas(context); + for (auto& row_pair : merge_row_map) { + auto* out_ptr = out_data + idx * input_width; + auto& rows = row_pair.second; + merge_rows[idx] = row_pair.first; + ++idx; + // rows.size() is always larger than 0 + 
blas.VCOPY(input_width, in_data + rows[0] * input_width, out_ptr); + + for (size_t i = 1; i < rows.size(); ++i) { + blas.AXPY(input_width, 1., in_data + rows[i] * input_width, out_ptr); + } + } + + out.set_rows(merge_rows); + } +}; + template struct Add { framework::SelectedRows operator()(const DeviceContext& context, diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index 70bed820ee..8355893560 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -219,3 +219,174 @@ TEST(selected_rows_functor, cpu_add_to) { // row9: 2.0 + 3.0 EXPECT_EQ(tensor1_data[9 * row_numel + 6], 5.0); } + +TEST(selected_rows_functor, cpu_merge_add_float) { + paddle::platform::CPUPlace cpu_place; + paddle::platform::CPUDeviceContext ctx(cpu_place); + paddle::operators::math::SetConstant + functor; + int64_t height = 10; + int64_t row_numel = 10; + + std::vector rows{0, 4, 4, 7}; + std::unique_ptr selected_rows{ + new paddle::framework::SelectedRows(rows, height)}; + auto* in_value = selected_rows->mutable_value(); + in_value->mutable_data( + paddle::framework::make_ddim( + {static_cast(rows.size()), row_numel}), + cpu_place); + functor(ctx, in_value, 1.0); + + std::unique_ptr output{ + new paddle::framework::SelectedRows()}; + + paddle::operators::math::scatter::MergeAdd + merge_add_functor; + merge_add_functor(ctx, *selected_rows, output.get()); + + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + + auto& out_rows = output->rows(); + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + + auto* out_data = output->value().data(); + + EXPECT_EQ(out_data[0 * row_numel], 1.0); + EXPECT_EQ(out_data[1 * row_numel], 2.0); + EXPECT_EQ(out_data[2 * row_numel], 1.0); +} + +TEST(selected_rows_functor, cpu_merge_add_int) { + paddle::platform::CPUPlace cpu_place; + 
paddle::platform::CPUDeviceContext ctx(cpu_place); + paddle::operators::math::SetConstant + functor; + int64_t height = 10; + int64_t row_numel = 10; + + std::vector rows{0, 4, 4, 7}; + std::unique_ptr selected_rows{ + new paddle::framework::SelectedRows(rows, height)}; + auto* in_value = selected_rows->mutable_value(); + in_value->mutable_data( + paddle::framework::make_ddim( + {static_cast(rows.size()), row_numel}), + cpu_place); + functor(ctx, in_value, 1); + + std::unique_ptr output{ + new paddle::framework::SelectedRows()}; + + paddle::operators::math::scatter::MergeAdd + merge_add_functor; + merge_add_functor(ctx, *selected_rows, output.get()); + + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + + auto& out_rows = output->rows(); + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + + auto* out_data = output->value().data(); + + EXPECT_EQ(out_data[0 * row_numel], 1); + EXPECT_EQ(out_data[1 * row_numel], 2); + EXPECT_EQ(out_data[2 * row_numel], 1); +} +TEST(selected_rows_functor, cpu_sum_to) { + paddle::platform::CPUPlace cpu_place; + paddle::platform::CPUDeviceContext ctx(cpu_place); + paddle::operators::math::SetConstant + functor; + int64_t height = 10; + int64_t row_numel = 10; + std::vector rows1{0, 4, 7}; + std::unique_ptr selected_rows1{ + new paddle::framework::SelectedRows(rows1, height)}; + auto* in1_value = selected_rows1->mutable_value(); + in1_value->mutable_data( + paddle::framework::make_ddim( + {static_cast(rows1.size()), row_numel}), + cpu_place); + functor(ctx, in1_value, 1.0); + std::vector rows2{0, 5, 7, 9}; + std::unique_ptr selected_rows2{ + new paddle::framework::SelectedRows(rows2, height)}; + auto* in2_value = selected_rows2->mutable_value(); + in2_value->mutable_data( + paddle::framework::make_ddim( + {static_cast(rows2.size()), row_numel}), + cpu_place); + functor(ctx, in2_value, 2.0); + std::unique_ptr output{ + new paddle::framework::SelectedRows()}; + 
output->set_height(height); + auto* out_value = output->mutable_value(); + // simplely concat two SelectedRows + out_value->mutable_data(paddle::framework::make_ddim({7, 10}), + cpu_place); + paddle::operators::math::SelectedRowsSumTo + sum_to_functor; + sum_to_functor(ctx, std::vector( + {selected_rows1.get(), selected_rows2.get()}), + std::vector({0, in1_value->numel()}), output.get()); + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + auto& out_rows = output->rows(); + // input1 rows + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + // input2 rows + EXPECT_EQ(out_rows[3], 0); + EXPECT_EQ(out_rows[4], 5); + EXPECT_EQ(out_rows[5], 7); + EXPECT_EQ(out_rows[6], 9); + auto* out_data = output->value().data(); + // input1 value + EXPECT_EQ(out_data[0 * row_numel + 0], 1.0); + EXPECT_EQ(out_data[0 * row_numel + 8], 1.0); + EXPECT_EQ(out_data[1 * row_numel + 1], 1.0); + EXPECT_EQ(out_data[2 * row_numel + 6], 1.0); + // input2 value + EXPECT_EQ(out_data[3 * row_numel + 3], 2.0); + EXPECT_EQ(out_data[3 * row_numel + 8], 2.0); + EXPECT_EQ(out_data[4 * row_numel + 4], 2.0); + EXPECT_EQ(out_data[5 * row_numel + 7], 2.0); + EXPECT_EQ(out_data[6 * row_numel + 9], 2.0); + std::unique_ptr tensor1{ + new paddle::framework::Tensor()}; + tensor1->mutable_data( + paddle::framework::make_ddim({height, row_numel}), cpu_place); + functor(ctx, tensor1.get(), 3.0); + paddle::operators::math::SelectedRowsAddToTensor< + paddle::platform::CPUDeviceContext, float> + add_to_tensor_functor; + add_to_tensor_functor(ctx, *output, tensor1.get()); + auto* tensor1_data = tensor1->data(); + // row0: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor1_data[0 * row_numel + 0], 6.0); + // row1: 3.0 + EXPECT_EQ(tensor1_data[1 * row_numel + 1], 3.0); + // row4 : 1.0 + 3.0 + EXPECT_EQ(tensor1_data[4 * row_numel + 6], 4.0); + // row5: 2.0 + 3.0 + EXPECT_EQ(tensor1_data[5 * row_numel + 7], 5.0); + // row6: 3.0 + EXPECT_EQ(tensor1_data[6 * row_numel + 1], 3.0); + 
// row7: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor1_data[7 * row_numel + 3], 6.0); + // row9: 2.0 + 3.0 + EXPECT_EQ(tensor1_data[9 * row_numel + 6], 5.0); +} diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 69318a6598..235b5405fb 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/sequence_pooling.h" #include + +#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence_pooling.h" namespace paddle { namespace operators { @@ -180,6 +182,7 @@ class SequencePoolFunctor { } auto lod = input.lod()[0]; auto& place = *context.eigen_device(); + auto blas = math::GetBlas(context); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { Tensor in_t = input.Slice(static_cast(lod[i]), static_cast(lod[i + 1])); @@ -191,7 +194,14 @@ class SequencePoolFunctor { if (pooltype == "AVERAGE") { out_e.device(place) = in_e.mean(Eigen::array({{0}})); } else if (pooltype == "SUM") { - out_e.device(place) = in_e.sum(Eigen::array({{0}})); + if (h > 0) { + const T* in_data = in_t.data(); + T* out_data = out_t.mutable_data(context.GetPlace()); + blas.VCOPY(w, in_data, out_data); + for (int64_t r = 1; r != h; ++r) { + blas.AXPY(w, 1., in_data + r * w, out_data); + } + } } else if (pooltype == "SQRT") { out_e.device(place) = in_e.sum(Eigen::array({{0}})) / std::sqrt(static_cast(h)); @@ -223,6 +233,7 @@ class SequencePoolGradFunctor { } auto lod = in_grad->lod()[0]; auto& place = *context.eigen_device(); + auto blas = math::GetBlas(context); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { auto in_g_t = in_grad->Slice(static_cast(lod[i]), 
static_cast(lod[i + 1])); @@ -237,7 +248,11 @@ class SequencePoolGradFunctor { if (pooltype == "AVERAGE") { in_g_e.device(place) = (out_g_e / static_cast(h)).broadcast(bcast); } else if (pooltype == "SUM") { - in_g_e.device(place) = (out_g_e).broadcast(bcast); + const T* out_g_data = out_g_t.data(); + T* in_g_data = in_g_t.mutable_data(context.GetPlace()); + for (int r = 0; r != h; ++r) { + blas.VCOPY(w, out_g_data, in_g_data + r * w); + } } else if (pooltype == "SQRT") { in_g_e.device(place) = (out_g_e / std::sqrt(static_cast(h))).broadcast(bcast); diff --git a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc index 5f43c58108..12b916fceb 100644 --- a/paddle/fluid/operators/momentum_op.cc +++ b/paddle/fluid/operators/momentum_op.cc @@ -24,7 +24,7 @@ class MomentumOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext *ctx) const override { + void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("Param"), "Input(param) of Momentum should not be null."); PADDLE_ENFORCE(ctx->HasInput("Grad"), @@ -33,6 +33,11 @@ class MomentumOp : public framework::OperatorWithKernel { "Input(velocity) of Momentum should not be null."); PADDLE_ENFORCE(ctx->HasInput("LearningRate"), "Input(LearningRate) of Momentum should not be null."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Param").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(ParamOut) of Momentum should not be null."); @@ -40,12 +45,15 @@ class MomentumOp : public framework::OperatorWithKernel { "Output(VelocityOut) of Momentum should not be null."); auto param_dim = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ( - param_dim, ctx->GetInputDim("Grad"), - 
"Param and Grad input of MomentumOp should have the same dimension."); - PADDLE_ENFORCE_EQ( - param_dim, ctx->GetInputDim("Velocity"), - "Param and Velocity of MomentumOp should have the same dimension."); + if (ctx->GetInputsVarType("Grad")[0] == + framework::proto::VarType::LOD_TENSOR) { + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Grad"), + "Param and Grad input of MomentumOp should have the same dimension."); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Velocity"), + "Param and Velocity of MomentumOp should have the same dimension."); + } PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1, "Learning_rate should be a scalar"); @@ -53,13 +61,34 @@ class MomentumOp : public framework::OperatorWithKernel { ctx->SetOutputDim("VelocityOut", param_dim); } framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("Param")->type()); + const framework::ExecutionContext& ctx) const override { + auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param")); return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; +class MomentumOpInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto input_var = op_desc.Input("Param")[0]; + for (auto& out_var : op_desc.Output("ParamOut")) { + if (block->FindRecursiveOrCreateVar(input_var).GetType() == + framework::proto::VarType::SELECTED_ROWS) { + block->FindRecursiveOrCreateVar(out_var).SetType( + framework::proto::VarType::SELECTED_ROWS); + } else if (block->FindRecursiveOrCreateVar(input_var).GetType() == + framework::proto::VarType::LOD_TENSOR) { + block->FindRecursiveOrCreateVar(out_var).SetType( + framework::proto::VarType::LOD_TENSOR); + } else { + PADDLE_THROW( + "Only support LodTensor and SelectedRows, Unexpected Input Type."); + } + } + } +}; + class 
MomentumOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -110,6 +139,9 @@ $$ } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(momentum, ops::MomentumOp, ops::MomentumOpMaker); -REGISTER_OP_CPU_KERNEL(momentum, ops::MomentumOpKernel, - ops::MomentumOpKernel); +REGISTER_OPERATOR(momentum, ops::MomentumOp, ops::MomentumOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::MomentumOpInferVarType); +REGISTER_OP_CPU_KERNEL( + momentum, ops::MomentumOpKernel, + ops::MomentumOpKernel); diff --git a/paddle/fluid/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu index a3932db1f3..b68fec34d4 100644 --- a/paddle/fluid/operators/momentum_op.cu +++ b/paddle/fluid/operators/momentum_op.cu @@ -15,65 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/momentum_op.h" -namespace paddle { -namespace operators { - -template -__global__ void MomentumKernel(const T* p, const T* g, const T* v, - const T* learning_rate, const T mu, - const int64_t num, bool use_nesterov, T* p_out, - T* v_out) { - T lr = learning_rate[0]; - if (use_nesterov) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; - i += blockDim.x * gridDim.x) { - T g_val = g[i]; - T v_new = v[i] * mu + g_val; - v_out[i] = v_new; - p_out[i] = p[i] - (g_val + v_new * mu) * lr; - } - } else { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; - i += blockDim.x * gridDim.x) { - T v_new = v[i] * mu + g[i]; - v_out[i] = v_new; - p_out[i] = p[i] - lr * v_new; - } - } -} - -template -class MomentumOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto param_out = ctx.Output("ParamOut"); - auto velocity_out = ctx.Output("VelocityOut"); - auto param = ctx.Input("Param"); - auto velocity = ctx.Input("Velocity"); - auto grad = ctx.Input("Grad"); - auto learning_rate = 
ctx.Input("LearningRate"); - - T* p_out = param_out->mutable_data(ctx.GetPlace()); - T* v_out = velocity_out->mutable_data(ctx.GetPlace()); - - T mu = static_cast(ctx.Attr("mu")); - bool use_nesterov = ctx.Attr("use_nesterov"); - - auto* p = param->data(); - auto* v = velocity->data(); - auto* g = grad->data(); - auto* lr = learning_rate->data(); - - int block = 512; - int grid = (param->numel() + block - 1) / block; - MomentumKernel<<>>( - p, g, v, lr, mu, param->numel(), use_nesterov, p_out, v_out); - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(momentum, ops::MomentumOpCUDAKernel, - ops::MomentumOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + momentum, ops::MomentumOpKernel, + ops::MomentumOpKernel); diff --git a/paddle/fluid/operators/momentum_op.h b/paddle/fluid/operators/momentum_op.h index 264726040f..6b4d00f56c 100644 --- a/paddle/fluid/operators/momentum_op.h +++ b/paddle/fluid/operators/momentum_op.h @@ -13,29 +13,48 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/algorithm.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/for_range.h" namespace paddle { namespace operators { -template -class MomentumOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto param_out = ctx.Output("ParamOut"); - auto velocity_out = ctx.Output("VelocityOut"); - auto param = ctx.Input("Param"); - auto velocity = ctx.Input("Velocity"); - auto grad = ctx.Input("Grad"); - auto learning_rate = ctx.Input("LearningRate"); +using framework::Tensor; +using framework::SelectedRows; +struct NoNesterov; +struct UseNesterov; - param_out->mutable_data(ctx.GetPlace()); - velocity_out->mutable_data(ctx.GetPlace()); +template +class CPUDenseMomentumFunctor { + private: + const Tensor* param; + const Tensor* grad; + const Tensor* velocity; + const Tensor* learning_rate; + const T mu; + const T use_nesterov; + Tensor* param_out; + Tensor* velocity_out; - T mu = static_cast(ctx.Attr("mu")); - bool use_nesterov = ctx.Attr("use_nesterov"); + public: + CPUDenseMomentumFunctor(const Tensor* param, const Tensor* grad, + const Tensor* velocity, const Tensor* learning_rate, + const T mu, const bool use_nesterov, + Tensor* param_out, Tensor* velocity_out) + : param(param), + grad(grad), + velocity(velocity), + learning_rate(learning_rate), + mu(mu), + use_nesterov(use_nesterov), + param_out(param_out), + velocity_out(velocity_out) {} + inline void operator()() { auto p_out = framework::EigenVector::Flatten(*param_out); auto v_out = framework::EigenVector::Flatten(*velocity_out); @@ -53,5 +72,283 @@ class MomentumOpKernel : public framework::OpKernel { } }; +template +class DenseMomentumFunctor; + +// NOTE(dzh) for performance. 
+// avoid if/else in inside kernel, implement GPU UseNesterov/NoNesterov as two +// functor. +template +class DenseMomentumFunctor { + private: + const T* p_; + const T* g_; + const T* v_; + const T* lr_; + const T mu_; + const int64_t num_; + T* p_out_; + T* v_out_; + + public: + DenseMomentumFunctor(const T* p, const T* g, const T* v, + const T* learning_rate, const T mu, const int64_t num, + T* p_out, T* v_out) + : p_(p), + g_(g), + v_(v), + lr_(learning_rate), + mu_(mu), + num_(num), + p_out_(p_out), + v_out_(v_out) {} + inline HOSTDEVICE void operator()(size_t i) const { + // put memory access in register + const T p = p_[i]; + const T g = g_[i]; + const T lr = lr_[0]; + const T v = v_[i]; + T v_out = v * mu_ + g; + T p_out = p - (g + v_out * mu_) * lr; + // write reigster to memory + v_out_[i] = v_out; + p_out_[i] = p_out; + } +}; + +template +class DenseMomentumFunctor { + private: + const T* p_; + const T* g_; + const T* v_; + const T* lr_; + const T mu_; + const int64_t num_; + T* p_out_; + T* v_out_; + + public: + DenseMomentumFunctor(const T* p, const T* g, const T* v, + const T* learning_rate, const T mu, const int64_t num, + T* p_out, T* v_out) + : p_(p), + g_(g), + v_(v), + lr_(learning_rate), + mu_(mu), + num_(num), + p_out_(p_out), + v_out_(v_out) {} + inline HOSTDEVICE void operator()(size_t i) const { + // put memory access in register + const T p = p_[i]; + const T g = g_[i]; + const T lr = lr_[0]; + const T v = v_[i]; + T v_out = v * mu_ + g; + T p_out = p - lr * v_out; + // write reigster to memory + v_out_[i] = v_out; + p_out_[i] = p_out; + } +}; + +template +class SparseMomentumFunctor; + +template +class SparseMomentumFunctor { + private: + const T* p_; + const T* g_; + const T* v_; + const T* lr_; + const T mu_; + const int64_t* rows_; + const int64_t row_numel_; + const int64_t row_height_; + T* p_out_; + T* v_out_; + + public: + SparseMomentumFunctor(const T* p, const T* g, const T* v, const T* lr, + const T mu, const int64_t* rows, 
int64_t row_numel, + int64_t row_height, T* p_out, T* v_out) + : p_(p), + g_(g), + v_(v), + lr_(lr), + mu_(mu), + rows_(rows), + row_numel_(row_numel), + row_height_(row_height), + p_out_(p_out), + v_out_(v_out) {} + + inline HOSTDEVICE void operator()(size_t i) { + auto row_idx = + math::BinarySearch(rows_, row_height_, i / row_numel_); + T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0; + // put memory access in register + const T p = p_[i]; + const T lr = lr_[0]; + const T v = v_[i]; + T v_out = v * mu_ + g; + T p_out = p - (g + v_out * mu_) * lr; + // write reigster to memory + v_out_[i] = v_out; + p_out_[i] = p_out; + } +}; + +template +class SparseMomentumFunctor { + private: + const T* p_; + const T* g_; + const T* v_; + const T* lr_; + const T mu_; + const int64_t* rows_; + const int64_t row_numel_; + const int64_t row_height_; + T* p_out_; + T* v_out_; + + public: + SparseMomentumFunctor(const T* p, const T* g, const T* v, const T* lr, + const T mu, const int64_t* rows, int64_t row_numel, + int64_t row_height, T* p_out, T* v_out) + : p_(p), + g_(g), + v_(v), + lr_(lr), + mu_(mu), + rows_(rows), + row_numel_(row_numel), + row_height_(row_height), + p_out_(p_out), + v_out_(v_out) {} + + inline HOSTDEVICE void operator()(size_t i) { + auto row_idx = + math::BinarySearch(rows_, row_height_, i / row_numel_); + T g = row_idx >= 0 ? 
g_[row_idx * row_numel_ + i % row_numel_] : 0; + // put memory access in register + const T p = p_[i]; + const T lr = lr_[0]; + const T v = v_[i]; + T v_out = v * mu_ + g; + T p_out = p - v_out * lr; + // write reigster to memory + v_out_[i] = v_out; + p_out_[i] = p_out; + } +}; + +template +class MomentumOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + T mu = static_cast(ctx.Attr("mu")); + bool use_nesterov = ctx.Attr("use_nesterov"); + + auto learning_rate = ctx.Input("LearningRate"); + auto param = ctx.Input("Param"); + auto param_out = ctx.Output("ParamOut"); + auto* velocity = ctx.Input("Velocity"); + auto velocity_out = ctx.Output("VelocityOut"); + param_out->mutable_data(ctx.GetPlace()); + velocity_out->mutable_data(ctx.GetPlace()); + + auto* grad_var = ctx.InputVar("Grad"); + if (grad_var->IsType()) { + auto grad = ctx.Input("Grad"); + if (platform::is_cpu_place(ctx.GetPlace())) { + CPUDenseMomentumFunctor functor(param, grad, velocity, learning_rate, + mu, use_nesterov, param_out, + velocity_out); + functor(); + } else if (platform::is_gpu_place(ctx.GetPlace())) { + platform::ForRange for_range( + static_cast(ctx.device_context()), + param->numel()); + if (use_nesterov) { + DenseMomentumFunctor functor( + param->data(), grad->data(), velocity->data(), + learning_rate->data(), mu, param->numel(), + param_out->mutable_data(ctx.GetPlace()), + velocity_out->mutable_data(ctx.GetPlace())); + for_range(functor); + + } else { + DenseMomentumFunctor functor( + param->data(), grad->data(), velocity->data(), + learning_rate->data(), mu, param->numel(), + param_out->mutable_data(ctx.GetPlace()), + velocity_out->mutable_data(ctx.GetPlace())); + for_range(functor); + } + } + + } else if (grad_var->IsType()) { + // sparse update embedding with selectedrows + auto grad = ctx.Input("Grad"); + + // sparse update maybe empty. 
+ if (grad->rows().size() == 0) { + VLOG(3) << "Grad SelectedRows contains no data!"; + return; + } + auto* merged_grad = const_cast(ctx.scope()) + .Var() + ->GetMutable(); + math::scatter::MergeAdd merge_func; + merge_func(ctx.template device_context(), *grad, + merged_grad); + + const int64_t* rows = nullptr; +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + rows = merged_grad->rows().CUDAData(ctx.GetPlace()); + } else { +#endif + rows = merged_grad->rows().data(); +#ifdef PADDLE_WITH_CUDA + } +#endif + int64_t row_numel = + merged_grad->value().numel() / merged_grad->rows().size(); + platform::ForRange for_range( + static_cast(ctx.device_context()), + param->numel()); + if (use_nesterov) { + SparseMomentumFunctor functor( + param->data(), merged_grad->value().data(), + velocity->data(), learning_rate->data(), mu, rows, row_numel, + static_cast(merged_grad->rows().size()), + param_out->mutable_data(ctx.GetPlace()), + velocity_out->mutable_data(ctx.GetPlace())); + for_range(functor); + + } else { + SparseMomentumFunctor functor( + param->data(), merged_grad->value().data(), + velocity->data(), learning_rate->data(), mu, rows, row_numel, + static_cast(merged_grad->rows().size()), + param_out->mutable_data(ctx.GetPlace()), + velocity_out->mutable_data(ctx.GetPlace())); + for_range(functor); + } + } else { + PADDLE_THROW( + string::Sprintf("MomentumOp only supports LoDTensor or SelectedRows " + "gradient, but the received Variable Type is %s", + grad_var->Type().name())); + } + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index 97c36a83fc..ab25628d45 100644 --- a/paddle/fluid/operators/parallel_do_op.cc +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -397,6 +397,24 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase { } }; +class ParallelDoGradOpVarTypeInference : public framework::VarTypeInference { + public: + 
void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + framework::BlockDesc *sub_block = + boost::get(op_desc.GetAttr(kParallelBlock)); + for (auto &out_vars : op_desc.Outputs()) { + for (auto &out_var : out_vars.second) { + auto &var = block->FindRecursiveOrCreateVar(out_var); + auto sub_var = sub_block->FindRecursiveOrCreateVar(out_var); + if (sub_var.GetType() != var.GetType()) { + var.SetType(sub_var.GetType()); + } + } + } + } +}; + } // namespace operators } // namespace paddle @@ -404,4 +422,5 @@ REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp, paddle::operators::ParallelDoOpProtoMaker, paddle::operators::ParallelDoGradOpDescMaker); REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp, - paddle::operators::ParallelDoGradOpShapeInference); + paddle::operators::ParallelDoGradOpShapeInference, + paddle::operators::ParallelDoGradOpVarTypeInference); diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 28cc91a5ed..51b980acb5 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -31,8 +31,8 @@ class BlockingQueue { // is a workaround and a simplified version of framework::Channel as it // doesn't support GPU and it implements on buffered blocking queue. 
public: - explicit BlockingQueue(size_t capacity) - : capacity_(capacity), closed_(false) { + explicit BlockingQueue(size_t capacity, bool speed_test_mode = false) + : capacity_(capacity), speed_test_mode_(speed_test_mode), closed_(false) { PADDLE_ENFORCE_GT( capacity_, 0, "The capacity of a reader::BlockingQueue must be greater than 0."); @@ -72,7 +72,9 @@ class BlockingQueue { if (!queue_.empty()) { PADDLE_ENFORCE_NOT_NULL(elem); *elem = queue_.front(); - queue_.pop_front(); + if (LIKELY(!speed_test_mode_)) { + queue_.pop_front(); + } send_cv_.notify_one(); return true; } else { @@ -114,6 +116,7 @@ class BlockingQueue { private: size_t capacity_; + bool speed_test_mode_; bool closed_; std::deque queue_; diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index 4f7cfc24ec..3f041ff7e4 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -33,8 +33,9 @@ class LoDTensorBlockingQueue { private: LoDTensorBlockingQueue(size_t capacity, - const std::vector& dims) - : queue_(capacity), dims_(dims) {} + const std::vector& dims, + bool speed_test_mode = false) + : queue_(capacity, speed_test_mode), dims_(dims) {} public: bool Push(const std::vector& lod_tensor_vec) { @@ -69,11 +70,12 @@ class LoDTensorBlockingQueue { class LoDTensorBlockingQueueHolder { public: - void InitOnce(size_t capacity, const std::vector& dims) { + void InitOnce(size_t capacity, const std::vector& dims, + bool speed_test_mode = false) { PADDLE_ENFORCE( queue_ == nullptr, "LoDTensorBlockingQueueHolder::InitOnce() can only be called once"); - queue_.reset(new LoDTensorBlockingQueue(capacity, dims)); + queue_.reset(new LoDTensorBlockingQueue(capacity, dims, speed_test_mode)); } inline const std::shared_ptr& GetQueue() const { diff --git a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc 
b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc index 7d1b381d56..dc0940ac0b 100644 --- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc +++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc @@ -217,3 +217,27 @@ TEST(BlockingQueue, MyClassTest) { q.Receive(&b); EXPECT_EQ(a.val_, b.val_); } + +TEST(BlockingQueue, speed_test_mode) { + size_t queue_size = 10; + BlockingQueue q1(queue_size, false); + for (size_t i = 0; i < queue_size; ++i) { + q1.Send(i); + } + size_t b; + for (size_t i = 0; i < queue_size; ++i) { + q1.Receive(&b); + EXPECT_EQ(b, i); + } + EXPECT_EQ(q1.Size(), 0UL); + + BlockingQueue q2(queue_size, true); + for (size_t i = 0; i < queue_size; ++i) { + q2.Send(i); + } + for (size_t i = 0; i < queue_size; ++i) { + q2.Receive(&b); + EXPECT_EQ(b, 0UL); + } + EXPECT_EQ(q2.Size(), queue_size); +} diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index d72f85f2c4..500d86fec3 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -164,7 +164,7 @@ dimension value will be copied from Input(X) at runtime. Note that the index of [2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input. 3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while -Attr(shape) still should be set correctly to gurantee shape inference in +Attr(shape) still should be set correctly to gurantee shape inference in compile-time. 
)DOC"); @@ -259,7 +259,6 @@ class Reshape2Op : public ReshapeOp { : ReshapeOp(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { - ReshapeOp::InferShape(ctx); PADDLE_ENFORCE(ctx->HasOutput("XShape"), "Output(XShape) of ReshapeOp should not be null."); const auto &x_dims = ctx->GetInputDim("X"); @@ -270,6 +269,8 @@ class Reshape2Op : public ReshapeOp { } ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); ctx->ShareLoD("X", /*->*/ "XShape"); + + ReshapeOp::InferShape(ctx); } }; diff --git a/paddle/fluid/operators/rmsprop_op.cc b/paddle/fluid/operators/rmsprop_op.cc index 2f773f222e..f06f87e61d 100644 --- a/paddle/fluid/operators/rmsprop_op.cc +++ b/paddle/fluid/operators/rmsprop_op.cc @@ -32,6 +32,11 @@ class RmspropOp : public framework::OperatorWithKernel { "Input(Grad) of RmspropOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Moment"), "Input(Moment) of RmspropOp should not be null."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Param").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(param_out) of RmspropOp should not be null."); diff --git a/paddle/fluid/operators/rmsprop_op.h b/paddle/fluid/operators/rmsprop_op.h index 25ed32c5eb..797cd45fdc 100644 --- a/paddle/fluid/operators/rmsprop_op.h +++ b/paddle/fluid/operators/rmsprop_op.h @@ -13,66 +13,254 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/algorithm.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/for_range.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; template using EigenVector = framework::EigenVector; +template +struct DenseRmspropGradFunctor { + inline explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {} + + HOSTDEVICE inline T operator()(int64_t idx) const { return grad_[idx]; } + + const T *grad_; +}; + +template +struct SparseRmspropGradFunctor { + inline SparseRmspropGradFunctor(const T *grad, const int64_t *rows, + int64_t row_numel, int64_t row_count) + : grad_(grad), + rows_(rows), + row_numel_(row_numel), + row_count_(row_count) {} + + HOSTDEVICE inline T operator()(int64_t idx) const { + auto row_idx = math::BinarySearch(rows_, row_count_, idx / row_numel_); + return row_idx >= 0 ? 
grad_[row_idx * row_numel_ + idx % row_numel_] : 0; + } + + const T *grad_; + const int64_t *rows_; + int64_t row_numel_; + int64_t row_count_; +}; + +template +struct UncenteredRmspropFunctor { + UncenteredRmspropFunctor(T *param, T *ms, T *mom, const T *lr, T rho, + T epsilon, T momentum, + const GradFunctor &grad_functor) + : param_(param), + ms_(ms), + mom_(mom), + lr_(lr), + rho_(rho), + epsilon_(epsilon), + momentum_(momentum), + grad_functor_(grad_functor) {} + + HOSTDEVICE inline void operator()(int64_t idx) const { + T g = grad_functor_(idx); + T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g; + T mom_out = momentum_ * mom_[idx] + lr_[0] * g / sqrt(ms_out + epsilon_); + param_[idx] -= mom_out; + ms_[idx] = ms_out; + mom_[idx] = mom_out; + } + + T *param_; + T *ms_; + T *mom_; + const T *lr_; + T rho_; + T epsilon_; + T momentum_; + GradFunctor grad_functor_; +}; + +template +struct CenteredRmspropFunctor { + CenteredRmspropFunctor(T *param, T *ms, T *mom, T *mean_grad, const T *lr, + T rho, T epsilon, T momentum, + const GradFunctor &grad_functor) + : param_(param), + ms_(ms), + mom_(mom), + mean_grad_(mean_grad), + lr_(lr), + rho_(rho), + epsilon_(epsilon), + momentum_(momentum), + grad_functor_(grad_functor) {} + + HOSTDEVICE inline void operator()(int64_t idx) const { + T g = grad_functor_(idx); + T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g; + T mg_out = rho_ * mean_grad_[idx] + (1 - rho_) * g; + T mom_out = momentum_ * mom_[idx] + + lr_[0] * g / sqrt(ms_out - mg_out * mg_out + epsilon_); + param_[idx] -= mom_out; + ms_[idx] = ms_out; + mom_[idx] = mom_out; + mean_grad_[idx] = mg_out; + } + + T *param_; + T *ms_; + T *mom_; + T *mean_grad_; + const T *lr_; + T rho_; + T epsilon_; + T momentum_; + GradFunctor grad_functor_; +}; + template class RmspropOpKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* param_out = ctx.Output("ParamOut"); - auto* moment_out = 
ctx.Output("MomentOut"); - auto* mean_square_out = ctx.Output("MeanSquareOut"); + void Compute(const framework::ExecutionContext &ctx) const override { + using LoDTensor = framework::LoDTensor; + auto *grad_var = ctx.InputVar("Grad"); + auto *param_out = ctx.Output("ParamOut"); + auto *moment_out = ctx.Output("MomentOut"); + auto *mean_square_out = ctx.Output("MeanSquareOut"); - auto grad = ctx.Input("Grad"); + auto epsilon = static_cast(ctx.Attr("epsilon")); + auto rho = static_cast(ctx.Attr("decay")); + auto momentum = static_cast(ctx.Attr("momentum")); + bool centered = ctx.Attr("centered"); - param_out->mutable_data(ctx.GetPlace()); - moment_out->mutable_data(ctx.GetPlace()); - mean_square_out->mutable_data(ctx.GetPlace()); + auto &p_tensor = *ctx.Input("Param"); + auto &ms_tensor = *ctx.Input("MeanSquare"); + auto &lr_tensor = *ctx.Input("LearningRate"); + auto &mom_tensor = *ctx.Input("Moment"); - float epsilon = ctx.Attr("epsilon"); - float rho = ctx.Attr("decay"); - float momentum = ctx.Attr("momentum"); - bool centered = ctx.Attr("centered"); + PADDLE_ENFORCE_EQ(&p_tensor, param_out, + "Param and ParamOut must be the same Tensor"); + PADDLE_ENFORCE_EQ(&mom_tensor, moment_out, + "Moment and MomentOut must be the same Tensor"); + PADDLE_ENFORCE_EQ(&ms_tensor, mean_square_out, + "MeanSquare and MeanSquareOut must be the same Tensor"); + + auto &dev_ctx = ctx.template device_context(); + size_t limit = static_cast(ms_tensor.numel()); + + if (grad_var->IsType()) { + auto &grad_tensor = grad_var->Get(); + + if (std::is_same::value) { + auto &place = + *ctx.template device_context().eigen_device(); + auto lr_value = lr_tensor.data()[0]; + + auto p = EigenVector::Flatten(p_tensor); + auto ms = EigenVector::Flatten(ms_tensor); + auto g = EigenVector::Flatten(grad_tensor); + auto mom = EigenVector::Flatten(mom_tensor); + + auto p_out = EigenVector::Flatten(*param_out); + auto mom_out = EigenVector::Flatten(*moment_out); + auto ms_out = 
EigenVector::Flatten(*mean_square_out); + + ms_out.device(place) = rho * ms + (1 - rho) * g * g; + if (centered) { + auto &mg_tensor = *ctx.Input("MeanGrad"); + auto mg = EigenVector::Flatten(mg_tensor); + auto *mean_grad_out = ctx.Output("MeanGradOut"); + PADDLE_ENFORCE(&mg_tensor, mean_grad_out, + "MeanGrad and MeanGradOut must be the same Tensor"); + auto mg_out = EigenVector::Flatten(*mean_grad_out); + + mg_out.device(place) = rho * mg + (1 - rho) * g; + mom_out.device(place) = + momentum * mom + + lr_value * g / (ms_out - mg_out.square() + epsilon).sqrt(); + } else { + mom_out.device(place) = + momentum * mom + lr_value * g / (ms_out + epsilon).sqrt(); + } + p_out.device(place) = p - mom_out; + } else { + DenseRmspropGradFunctor grad_func(grad_tensor.data()); + platform::ForRange for_range(dev_ctx, limit); + if (centered) { + auto &mg_tensor = *ctx.Input("MeanGrad"); + auto *mean_grad_out = ctx.Output("MeanGradOut"); + PADDLE_ENFORCE(&mg_tensor, mean_grad_out, + "MeanGrad and MeanGradOut must be the same Tensor"); + for_range(CenteredRmspropFunctor>( + param_out->mutable_data(ctx.GetPlace()), + mean_square_out->mutable_data(ctx.GetPlace()), + moment_out->mutable_data(ctx.GetPlace()), + mean_grad_out->mutable_data(ctx.GetPlace()), + lr_tensor.data(), rho, epsilon, momentum, grad_func)); + } else { + for_range(UncenteredRmspropFunctor>( + param_out->mutable_data(ctx.GetPlace()), + mean_square_out->mutable_data(ctx.GetPlace()), + moment_out->mutable_data(ctx.GetPlace()), lr_tensor.data(), + rho, epsilon, momentum, grad_func)); + } + } + } else if (grad_var->IsType()) { + auto &grad = grad_var->Get(); + auto *merged_grad = const_cast(ctx.scope()) + .Var() + ->GetMutable(); + + math::scatter::MergeAdd merge_func; + merge_func(dev_ctx, grad, merged_grad); + + platform::ForRange for_range(dev_ctx, limit); + const int64_t *rows; +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + rows = merged_grad->rows().CUDAData(ctx.GetPlace()); + } else { 
+#endif + rows = merged_grad->rows().data(); +#ifdef PADDLE_WITH_CUDA + } +#endif + auto &merged_tensor = merged_grad->value(); + int64_t row_count = merged_grad->rows().size(); + int64_t row_numel = merged_tensor.numel() / row_count; + SparseRmspropGradFunctor grad_func(merged_tensor.data(), rows, + row_numel, row_count); - auto p = EigenVector::Flatten(*ctx.Input("Param")); - auto ms = EigenVector::Flatten(*ctx.Input("MeanSquare")); - auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); - auto g = EigenVector::Flatten(*grad); - auto mom = EigenVector::Flatten(*ctx.Input("Moment")); - - auto p_out = EigenVector::Flatten(*param_out); - auto mom_out = EigenVector::Flatten(*moment_out); - auto ms_out = EigenVector::Flatten(*mean_square_out); - auto& place = *ctx.template device_context().eigen_device(); - - Eigen::DSizes grad_dsize(static_cast(grad->numel())); - - ms_out.device(place) = rho * ms + (1 - rho) * g * g; - if (centered) { - auto mg = EigenVector::Flatten(*ctx.Input("MeanGrad")); - auto* mean_grad_out = ctx.Output("MeanGradOut"); - mean_grad_out->mutable_data(ctx.GetPlace()); - auto mg_out = EigenVector::Flatten(*mean_grad_out); - - mg_out.device(place) = rho * mg + (1 - rho) * g; - mom_out.device(place) = momentum * mom + - lr.broadcast(grad_dsize) * g / - (ms_out - mg_out.square() + epsilon).sqrt(); + if (centered) { + auto &mg_tensor = *ctx.Input("MeanGrad"); + auto *mean_grad_out = ctx.Output("MeanGradOut"); + PADDLE_ENFORCE(&mg_tensor, mean_grad_out, + "MeanGrad and MeanGradOut must be the same Tensor"); + for_range(CenteredRmspropFunctor>( + param_out->mutable_data(ctx.GetPlace()), + mean_square_out->mutable_data(ctx.GetPlace()), + moment_out->mutable_data(ctx.GetPlace()), + mean_grad_out->mutable_data(ctx.GetPlace()), lr_tensor.data(), + rho, epsilon, momentum, grad_func)); + } else { + for_range(UncenteredRmspropFunctor>( + param_out->mutable_data(ctx.GetPlace()), + mean_square_out->mutable_data(ctx.GetPlace()), + 
moment_out->mutable_data(ctx.GetPlace()), lr_tensor.data(), + rho, epsilon, momentum, grad_func)); + } } else { - mom_out.device(place) = - momentum * mom + - lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt(); + PADDLE_THROW("RMSProp only supports LoDTensor or SelectedRows gradient"); } - p_out.device(place) = p - mom_out; } }; diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc new file mode 100644 index 0000000000..c57a34c3a7 --- /dev/null +++ b/paddle/fluid/operators/roi_align_op.cc @@ -0,0 +1,166 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/roi_align_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +class ROIAlignOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ROIAlignOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ROIs"), + "Input(ROIs) of ROIAlignOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ROIAlignOp should not be null."); + auto input_dims = ctx->GetInputDim("X"); + auto rois_dims = ctx->GetInputDim("ROIs"); + + PADDLE_ENFORCE(input_dims.size() == 4, + "The format of input tensor is NCHW."); + PADDLE_ENFORCE(rois_dims.size() == 2, + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" + "given as [[x1, y1, x2, y2], …]."); + PADDLE_ENFORCE(rois_dims[1] == 4, + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" + "given as [[x1, y1, x2, y2], …]."); + int pooled_height = ctx->Attrs().Get("pooled_height"); + int pooled_width = ctx->Attrs().Get("pooled_width"); + float spatial_scale = ctx->Attrs().Get("spatial_scale"); + + PADDLE_ENFORCE_GT(pooled_height, 0, + "The pooled output height must greater than 0"); + PADDLE_ENFORCE_GT(pooled_width, 0, + "The pooled output width must greater than 0"); + PADDLE_ENFORCE_GT(spatial_scale, 0.0f, + "The spatial scale must greater than 0"); + + auto out_dims = input_dims; + out_dims[0] = rois_dims[0]; + out_dims[1] = input_dims[1]; + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + + ctx->SetOutputDim("Out", out_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class ROIAlignGradOp : public 
framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The GRAD@Out of ROIAlignGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")), + "The GRAD@X of ROIAlignGradOp should not be null."); + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor), " + "The input of ROIAlignOp. " + "The format of input tensor is NCHW. Where N is batch size, " + "C is the number of input channels, " + "H is the height of the feature, and " + "W is the width of the feature."); + AddInput("ROIs", + "(LoDTensor), " + "ROIs (Regions of Interest) to pool over. " + "should be a 2-D LoDTensor of shape (num_rois, 4)" + "given as [[x1, y1, x2, y2], …]. 
" + "(x1, y1) is the top left coordinates, and " + "(x2, y2) is the bottom right coordinates."); + AddOutput("Out", + "(Tensor), " + "The output of ROIAlignOp is a 4-D tensor with shape " + "(num_rois, channels, pooled_h, pooled_w)."); + AddAttr("spatial_scale", + "(float, default 1.0), " + "Multiplicative spatial scale factor " + "to translate ROI coords from their input scale " + "to the scale used when pooling.") + .SetDefault(1.0); + AddAttr("pooled_height", + "(int, default 1), " + "The pooled output height.") + .SetDefault(1); + AddAttr("pooled_width", + "(int, default 1), " + "The pooled output width.") + .SetDefault(1); + AddAttr("sampling_ratio", + "(int,default -1)," + "number of sampling points in the interpolation grid" + "If <=0, then grid points are adaptive to roi_width " + "and pooled_w, likewise for height") + .SetDefault(-1); + AddComment(R"DOC( +**RoIAlign Operator** + +Region of interest align (also known as RoI align) is to perform +bilinear interpolation on inputs of nonuniform sizes to obtain +fixed-size feature maps (e.g. 7*7) + +Dividing each region proposal into equal-sized sections with +the pooled_width and pooled_height. Location remains the origin +result. + +In each ROI bin, the value of the four regularly sampled locations +are computed directly through bilinear interpolation. The output is +the mean of four locations. +Thus avoid the misaligned problem. 
+ )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp); +REGISTER_OP_CPU_KERNEL( + roi_align, + ops::CPUROIAlignOpKernel, + ops::CPUROIAlignOpKernel); +REGISTER_OP_CPU_KERNEL( + roi_align_grad, + ops::CPUROIAlignGradOpKernel, + ops::CPUROIAlignGradOpKernel); diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu new file mode 100644 index 0000000000..bcec6f3563 --- /dev/null +++ b/paddle/fluid/operators/roi_align_op.cu @@ -0,0 +1,353 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/roi_align_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__device__ T BilinearInterpolate(const T* input_data, const int height, + const int width, T y, T x) { + if (y < -1.0 || y > height || x < -1.0 || x > width) { + return 0; + } + y = y <= 0 ? 0 : y; + x = x <= 0 ? 0 : x; + int y_low = static_cast(y); + int x_low = static_cast(x); + int y_high; + int x_high; + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + T ly = y - y_low, lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + + T v1 = input_data[y_low * width + x_low]; + T v2 = input_data[y_low * width + x_high]; + T v3 = input_data[y_high * width + x_low]; + T v4 = input_data[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ void BilinearInterpolateGradient(const int height, const int width, + T y, T x, T* w1, T* w2, T* w3, + T* w4, int* x_low, int* x_high, + int* y_low, int* y_high) { + if (y < -1.0 || y > height || x < -1.0 || x > width) { + return; + } + + y = y <= 0 ? 0 : y; + x = x <= 0 ? 
0 : x; + *y_low = static_cast(y); + *x_low = static_cast(x); + if (*y_low >= height - 1) { + *y_high = *y_low = height - 1; + y = static_cast(*y_low); + } else { + *y_high = *y_low + 1; + } + if (*x_low >= width - 1) { + *x_high = *x_low = width - 1; + x = static_cast(*x_low); + } else { + *x_high = *x_low + 1; + } + T ly = y - *y_low, lx = x - *x_low; + T hy = 1. - ly, hx = 1. - lx; + *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx; + + return; +} + +template +__global__ void GPUROIAlignForward( + const int nthreads, const T* input_data, const T* input_rois, + const float spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int sampling_ratio, int* roi_batch_id_data, T* output_data) { + CUDA_1D_KERNEL_LOOP(i, nthreads) { + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % channels; + int n = i / pooled_width / pooled_height / channels; + + const T* offset_input_rois = input_rois + n * kROISize; + int roi_batch_ind = roi_batch_id_data[n]; + + T roi_xmin = offset_input_rois[0] * spatial_scale; + T roi_ymin = offset_input_rois[1] * spatial_scale; + T roi_xmax = offset_input_rois[2] * spatial_scale; + T roi_ymax = offset_input_rois[3] * spatial_scale; + + T roi_width = max(roi_xmax - roi_xmin, static_cast(1.)); + T roi_height = max(roi_ymax - roi_ymin, static_cast(1.)); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + const T* offset_input_data = + input_data + (roi_batch_ind * channels + c) * height * width; + + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + int roi_bin_grid_w = + (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_width / pooled_width); + const T count = roi_bin_grid_h * roi_bin_grid_w; + T output_val = 0; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + T val = BilinearInterpolate(offset_input_data, height, width, y, x); + output_val += val; + } + } + output_val /= count; + output_data[i] = output_val; + } +} + +template +__global__ void GPUROIAlignBackward(const int nthreads, const T* input_rois, + const T* out_grad, const int num_rois, + const float spatial_scale, + const int channels, const int height, + const int width, const int pooled_height, + const int pooled_width, + const int sampling_ratio, + int* roi_batch_id_data, T* input_grad) { + CUDA_1D_KERNEL_LOOP(i, nthreads) { + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % channels; + int n = i / pooled_width / pooled_height / channels; + const T* offset_input_rois = input_rois + n * kROISize; + int roi_batch_ind = roi_batch_id_data[n]; + + T roi_xmin = offset_input_rois[0] * spatial_scale; + T roi_ymin = offset_input_rois[1] * spatial_scale; + T roi_xmax = offset_input_rois[2] * spatial_scale; + T roi_ymax = offset_input_rois[3] * spatial_scale; + + T roi_width = max(roi_xmax - roi_xmin, static_cast(1.)); + T roi_height = max(roi_ymax - roi_ymin, static_cast(1.)); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T* offset_input_grad = + input_grad + (roi_batch_ind * channels + c) * height * width; + + const T* offset_out_grad = + out_grad + (n * channels + c) * pooled_height * pooled_width; + const T out_grad_this_bin = offset_out_grad[ph * pooled_width + 
pw]; + + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + const T count = roi_bin_grid_h * roi_bin_grid_w; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + T w1 = 0, w2 = 0, w3 = 0, w4 = 0; + int x_low = -1, x_high = -1, y_low = -1, y_high = -1; + BilinearInterpolateGradient(height, width, y, x, &w1, &w2, &w3, &w4, + &x_low, &x_high, &y_low, &y_high); + T diff1 = out_grad_this_bin * w1 / count; + T diff2 = out_grad_this_bin * w2 / count; + T diff3 = out_grad_this_bin * w3 / count; + T diff4 = out_grad_this_bin * w4 / count; + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_low, + diff1); + platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_high, + diff2); + platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_low, + diff3); + platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_high, + diff4); + } + } + } + } +} + +template +class GPUROIAlignOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + auto sampling_ratio = ctx.Attr("sampling_ratio"); + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + + int rois_num = rois->dims()[0]; + + if 
(rois_num == 0) return; + + int output_size = out->numel(); + int blocks = NumBlocks(output_size); + int threads = kNumCUDAThreads; + + Tensor roi_batch_id_list; + roi_batch_id_list.Resize({rois_num}); + int* roi_batch_id_data = + roi_batch_id_list.mutable_data(platform::CPUPlace()); + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + "The rois_batch_size and imgs batch_size must be the same."); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, + "The rois_num from input and lod must be the same."); + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + Tensor roi_batch_id_list_gpu; + framework::TensorCopySync(roi_batch_id_list, ctx.GetPlace(), + &roi_batch_id_list_gpu); + GPUROIAlignForward< + T><<>>( + output_size, in->data(), rois->data(), spatial_scale, channels, + height, width, pooled_height, pooled_width, sampling_ratio, + roi_batch_id_list_gpu.data(), + out->mutable_data(ctx.GetPlace())); + } +}; + +template +class GPUROIAlignGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + + auto* out_grad = ctx.Input(framework::GradVarName("Out")); + auto* in_grad = ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + auto sampling_ratio = ctx.Attr("sampling_ratio"); + + int rois_num = rois->dims()[0]; + int channels = in->dims()[1]; + int height = in->dims()[2]; + int width = in->dims()[3]; + + if (!in_grad) { + return; + } + Tensor roi_batch_id_list; + roi_batch_id_list.Resize({rois_num}); + int* roi_batch_id_data = + roi_batch_id_list.mutable_data(platform::CPUPlace()); + 
auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + Tensor roi_batch_id_list_gpu; + framework::TensorCopySync(roi_batch_id_list, ctx.GetPlace(), + &roi_batch_id_list_gpu); + + in_grad->mutable_data(ctx.GetPlace()); + math::SetConstant set_zero; + set_zero(ctx.cuda_device_context(), in_grad, static_cast(0)); + + int output_grad_size = out_grad->numel(); + int blocks = NumBlocks(output_grad_size); + int threads = kNumCUDAThreads; + + if (output_grad_size > 0) { + GPUROIAlignBackward< + T><<>>( + output_grad_size, rois->data(), out_grad->data(), rois_num, + spatial_scale, channels, height, width, pooled_height, pooled_width, + sampling_ratio, roi_batch_id_list_gpu.data(), + in_grad->mutable_data(ctx.GetPlace())); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + roi_align, + ops::GPUROIAlignOpKernel, + ops::GPUROIAlignOpKernel); +REGISTER_OP_CUDA_KERNEL( + roi_align_grad, + ops::GPUROIAlignGradOpKernel, + ops::GPUROIAlignGradOpKernel); diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h new file mode 100644 index 0000000000..a18aee1b86 --- /dev/null +++ b/paddle/fluid/operators/roi_align_op.h @@ -0,0 +1,332 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +static constexpr int kROISize = 4; + +template +void PreCalcForBilinearInterpolate( + const platform::DeviceContext& ctx, const int height, const int width, + const int pooled_height, const int pooled_width, const int iy_upper, + const int ix_upper, T roi_ymin, T roi_xmin, T bin_size_h, T bin_size_w, + int roi_bin_grid_h, int roi_bin_grid_w, Tensor* pre_pos, Tensor* pre_w) { + int pre_calc_index = 0; + int* pre_pos_data = pre_pos->mutable_data(ctx.GetPlace()); + T* pre_w_data = pre_w->mutable_data(ctx.GetPlace()); + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < iy_upper; iy++) { + // calculate y of sample points + T y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + // calculate x of samle points + for (int ix = 0; ix < ix_upper; ix++) { + T x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + // deal with elements out of map + if (y < -1.0 || y > height || x < -1.0 || x > width) { + for (int i = 0; i < kROISize; ++i) { + pre_pos_data[i + pre_calc_index * kROISize] = 0; + pre_w_data[i + pre_calc_index * kROISize] = 0; + } + pre_calc_index += 1; + continue; + } + y = y <= 0 ? 0 : y; + x = x <= 0 ? 
0 : x; + + int y_low = static_cast(y); + int x_low = static_cast(x); + int y_high; + int x_high; + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + T ly = y - y_low, lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + pre_pos_data[pre_calc_index * kROISize] = y_low * width + x_low; + pre_pos_data[pre_calc_index * kROISize + 1] = y_low * width + x_high; + pre_pos_data[pre_calc_index * kROISize + 2] = y_high * width + x_low; + pre_pos_data[pre_calc_index * kROISize + 3] = y_high * width + x_high; + pre_w_data[pre_calc_index * kROISize] = hy * hx; + pre_w_data[pre_calc_index * kROISize + 1] = hy * lx; + pre_w_data[pre_calc_index * kROISize + 2] = ly * hx; + pre_w_data[pre_calc_index * kROISize + 3] = ly * lx; + pre_calc_index += 1; + } + } + } + } +} + +template +void bilinear_interpolate_gradient(const int height, const int width, T y, T x, + const T out_grad_this_bin, const T count, + T* batch_grad_data) { + int x_low, y_low, x_high, y_high; + T w1, w2, w3, w4; + if (y < -1.0 || y > height || x < -1.0 || x > width) { + w1 = w2 = w3 = w4 = 0; + x_low = x_high = y_low = y_high = -1; + return; + } + y = y <= 0 ? 0 : y; + x = x <= 0 ? 0 : x; + y_low = static_cast(y); + x_low = static_cast(x); + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + + T ly = y - y_low, lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + T diff1 = out_grad_this_bin * w1 / count; + T diff2 = out_grad_this_bin * w2 / count; + T diff3 = out_grad_this_bin * w3 / count; + T diff4 = out_grad_this_bin * w4 / count; + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + *(batch_grad_data + y_low * width + x_low) += diff1; + *(batch_grad_data + y_low * width + x_high) += diff2; + *(batch_grad_data + y_high * width + x_low) += diff3; + *(batch_grad_data + y_high * width + x_high) += diff4; + } +} + +template +class CPUROIAlignOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + auto sampling_ratio = ctx.Attr("sampling_ratio"); + + auto& dev_ctx = ctx.template device_context(); + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + auto in_stride = framework::stride(in_dims); + auto roi_stride = framework::stride(rois->dims()); + auto out_stride = framework::stride(out->dims()); + + const T* input_data = in->data(); + framework::Tensor roi_batch_id_list; + roi_batch_id_list.Resize({rois_num}); + int* roi_batch_id_data = + roi_batch_id_list.mutable_data(ctx.GetPlace()); + + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + "The rois_batch_size and imgs batch_size must be the same."); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, + "The rois_num from input and lod must be the same."); + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < 
rois_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + T* output_data = out->mutable_data(ctx.GetPlace()); + const T* rois_data = rois->data(); + for (int n = 0; n < rois_num; ++n) { + int roi_batch_id = roi_batch_id_data[n]; + T roi_xmin = rois_data[0] * spatial_scale; + T roi_ymin = rois_data[1] * spatial_scale; + T roi_xmax = rois_data[2] * spatial_scale; + T roi_ymax = rois_data[3] * spatial_scale; + + T roi_width = std::max(roi_xmax - roi_xmin, static_cast(1.)); + T roi_height = std::max(roi_ymax - roi_ymin, static_cast(1.)); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + const T* batch_data = input_data + roi_batch_id * in_stride[0]; + + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + int roi_bin_grid_w = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_width / pooled_width); + const T count = roi_bin_grid_h * roi_bin_grid_w; + Tensor pre_pos; + Tensor pre_w; + int pre_size = count * out_stride[1]; + pre_pos.Resize({pre_size, kROISize}); + pre_w.Resize({pre_size, kROISize}); + + PreCalcForBilinearInterpolate( + dev_ctx, height, width, pooled_height, pooled_width, roi_bin_grid_h, + roi_bin_grid_w, roi_ymin, roi_xmin, bin_size_h, bin_size_w, + roi_bin_grid_h, roi_bin_grid_w, &pre_pos, &pre_w); + const int* pre_pos_data = pre_pos.data(); + const T* pre_w_data = pre_w.data(); + for (int c = 0; c < channels; c++) { + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + const int pool_index = ph * pooled_width + pw; + T output_val = 0; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + for (int i = 0; i < kROISize; i++) { + int pos = pre_pos_data[pre_calc_index * kROISize + i]; + T w = pre_w_data[pre_calc_index * kROISize + i]; + output_val += w * batch_data[pos]; + } + pre_calc_index += 1; + } + } 
+ output_val /= count; + output_data[pool_index] = output_val; + } + } + batch_data += in_stride[1]; + output_data += out_stride[1]; + } + rois_data += roi_stride[0]; + } + } +}; + +template +class CPUROIAlignGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* in_grad = ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + auto sampling_ratio = ctx.Attr("sampling_ratio"); + auto in_dims = in->dims(); + if (!in_grad) { + return; + } + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + Tensor roi_batch_id_list; + roi_batch_id_list.Resize({rois_num}); + int* roi_batch_id_data = + roi_batch_id_list.mutable_data(ctx.GetPlace()); + + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + + const T* rois_data = rois->data(); + const T* out_grad_data = out_grad->data(); + T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); + + auto in_stride = framework::stride(in->dims()); + auto roi_stride = framework::stride(rois->dims()); + auto out_stride = framework::stride(out_grad->dims()); + + for (int n = 0; n < rois_num; ++n) { + int roi_batch_idx = roi_batch_id_data[n]; + T roi_xmin = rois_data[0] * spatial_scale; + T roi_ymin = rois_data[1] * spatial_scale; + T roi_xmax = rois_data[2] * spatial_scale; + T roi_ymax = rois_data[3] * spatial_scale; + T roi_width = std::max(roi_xmax - roi_xmin, static_cast(1.)); + T roi_height = std::max(roi_ymax - roi_ymin, static_cast(1.)); + T bin_size_h = 
static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + for (int c = 0; c < channels; ++c) { + T* batch_grad_data = + in_grad_data + roi_batch_idx * in_stride[0] + c * in_stride[1]; + const T* batch_out_grad_data = + out_grad_data + n * out_stride[0] + c * out_stride[1]; + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int pool_index = ph * pooled_width + pw; + T out_grad_this_bin = batch_out_grad_data[pool_index]; + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + int roi_bin_grid_w = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_width / pooled_width); + T count = roi_bin_grid_h * roi_bin_grid_w; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + bilinear_interpolate_gradient(height, width, y, x, + out_grad_this_bin, count, + batch_grad_data); + } + } + } + } + } + rois_data += roi_stride[0]; + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index d6d209d5de..8e29761ec2 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -174,4 +174,4 @@ REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL( roi_pool_grad, ops::CPUROIPoolGradOpKernel, - ops::CPUROIPoolOpKernel); + ops::CPUROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index 46e20285db..75c3dd6bc4 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -249,4 +249,4 @@ REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL( roi_pool_grad, 
ops::GPUROIPoolGradOpKernel, - ops::GPUROIPoolOpKernel); + ops::GPUROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc index 397a318295..3234b60861 100644 --- a/paddle/fluid/operators/sequence_concat_op.cc +++ b/paddle/fluid/operators/sequence_concat_op.cc @@ -90,11 +90,13 @@ REGISTER_OPERATOR(sequence_concat, paddle::framework::OperatorWithKernel, paddle::framework::DefaultGradOpDescMaker); template using Kernel = op::SeqConcatKernel; -REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel, Kernel); +REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel, Kernel, + Kernel); + REGISTER_OPERATOR(sequence_concat_grad, paddle::framework::OperatorWithKernel, op::SeqConcatGradShapeInferer); template using GradKernel = op::SeqConcatGradKernel; REGISTER_OP_CPU_KERNEL(sequence_concat_grad, GradKernel, - GradKernel); + GradKernel, GradKernel); diff --git a/paddle/fluid/operators/sequence_concat_op.h b/paddle/fluid/operators/sequence_concat_op.h index 33e9babff2..ff035f421c 100644 --- a/paddle/fluid/operators/sequence_concat_op.h +++ b/paddle/fluid/operators/sequence_concat_op.h @@ -17,7 +17,7 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" -#include "paddle/fluid/operators/math/concat.h" +#include "paddle/fluid/operators/math/concat_and_split.h" namespace paddle { namespace operators { @@ -106,7 +106,7 @@ class SeqConcatGradKernel : public framework::OpKernel { } } - math::ConcatGradFunctor functor; + math::SplitFunctor functor; std::vector sliced_x_ptr; std::vector sliced_dx_ptr; for (auto &x : sliced_x) { diff --git a/paddle/fluid/operators/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_unpad_op.cc new file mode 100644 index 0000000000..e633e378a2 --- /dev/null +++ b/paddle/fluid/operators/sequence_unpad_op.cc @@ -0,0 +1,153 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_unpad_op.h" + +namespace paddle { +namespace operators { + +class SequenceUnpadOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceUnpadOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Length"), + "Input(Length) of SequenceUnpadOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceUnpadOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "The rank of Input(X) can't be less than 2."); + + auto len_dims = ctx->GetInputDim("Length"); + PADDLE_ENFORCE(len_dims.size() == 2 && len_dims[1] == 1, + "The shape of Input(Length) should be [batch_size, 1]."); + PADDLE_ENFORCE( + len_dims[0] == x_dims[0], + "Input(X) and Input(Length) should have the same first dimension."); + + int64_t out_dim_0 = -1; + if (ctx->IsRuntime()) { + out_dim_0 = x_dims[0] * x_dims[1]; + } + + std::vector out_dims_vec{out_dim_0}; + if (x_dims.size() == 2) { + out_dims_vec.push_back(1); + } else { + for (int i = 2; i < x_dims.size(); ++i) { + out_dims_vec.push_back(x_dims[i]); + } + } + ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const 
framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class SequenceUnpadOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(LoDTensor, default LoDTensor) Input tensor which " + "contains the padded sequences with equal length."); + AddInput("Length", + "(LoDTensor) The input tensor which specifies the actual ength of " + "sequences after unpadding."); + AddOutput( + "Out", + "(LoDTensor) The output tensor which contains unpadded sequences."); + AddComment(R"DOC( + Sequence Unpad Operator + + This operator removes the padding data in the input sequences and convert + them into sequences with actual length as output, identitied by lod + information. + + Example: + + Given input tensor Input(X): + X.data = [[ 1.0, 2.0, 3.0, 4.0, 5.0], + [ 6.0, 7.0, 8.0, 9.0, 10.0], + [11.0, 12.0, 13.0, 14.0, 15.0]], +` + in which there are 3 sequences padded to length 5, and the acutal length + specified by Input(Length): + + Length.data = [[2], [3], [4]], + + after unpadding, Output(Out) will be: + + Out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]] + Out.lod = [[0, 2, 5, 9]] + + )DOC"); + } +}; + +class SequenceUnpadGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceUnpadGradOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequenceUnpadGradOp should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + } + + protected: + framework::OpKernelType 
GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_unpad, ops::SequenceUnpadOp, + ops::SequenceUnpadOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(sequence_unpad_grad, ops::SequenceUnpadGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_unpad, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel); +REGISTER_OP_CPU_KERNEL( + sequence_unpad_grad, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel); diff --git a/paddle/fluid/operators/sequence_unpad_op.cu b/paddle/fluid/operators/sequence_unpad_op.cu new file mode 100644 index 0000000000..7524837223 --- /dev/null +++ b/paddle/fluid/operators/sequence_unpad_op.cu @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_unpad_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_unpad, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_unpad_grad, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel); diff --git a/paddle/fluid/operators/sequence_unpad_op.h b/paddle/fluid/operators/sequence_unpad_op.h new file mode 100644 index 0000000000..07df3dca83 --- /dev/null +++ b/paddle/fluid/operators/sequence_unpad_op.h @@ -0,0 +1,104 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence_padding.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +template +class SequenceUnpadOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x_t = ctx.Input("X"); + auto* len_t = ctx.Input("Length"); + auto* out_t = ctx.Output("Out"); + out_t->mutable_data(ctx.GetPlace()); + + const int64_t* seq_len_ptr = nullptr; + if (platform::is_gpu_place(ctx.GetPlace())) { + LoDTensor seq_len_cpu; + seq_len_cpu.Resize(len_t->dims()); + seq_len_ptr = seq_len_cpu.mutable_data(platform::CPUPlace()); + framework::TensorCopy(*len_t, platform::CPUPlace(), + ctx.template device_context(), + &seq_len_cpu); + } else { + seq_len_ptr = len_t->data(); + } + + size_t batch_size = x_t->dims()[0]; + std::vector out_lod0(batch_size + 1, 0); + for (size_t i = 0; i < batch_size; ++i) { + out_lod0[i + 1] = out_lod0[i] + seq_len_ptr[i]; + } + + framework::LoD out_lod; + out_lod.push_back(out_lod0); + out_t->set_lod(out_lod); + + std::vector out_dims_vec{static_cast(out_lod0.back())}; + if (x_t->dims().size() == 2) { + out_dims_vec.push_back(1); + } else { + for (int i = 2; i < x_t->dims().size(); ++i) { + out_dims_vec.push_back(x_t->dims()[i]); + } + } + out_t->Resize(framework::make_ddim(out_dims_vec)); + + int64_t padded_length = x_t->dims()[1]; + math::UnpaddingLoDTensorFunctor()( + ctx.template device_context(), *x_t, out_t, + padded_length, 0, false, math::kBatchLengthWidth); + } +}; + +template +class SequenceUnpadGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* d_x = ctx.Output(framework::GradVarName("X")); + if (d_x) { + const auto* d_out 
= ctx.Input(framework::GradVarName("Out")); + const auto* x_t = ctx.Input("X"); + d_x->mutable_data(ctx.GetPlace()); + + int padded_length = x_t->dims()[1]; + + LoDTensor zero_pads; + zero_pads.Resize({1, 1}); + zero_pads.mutable_data(ctx.GetPlace()); + math::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); + set_zero(dev_ctx, &zero_pads, static_cast(0)); + + math::PaddingLoDTensorFunctor()( + ctx.template device_context(), *d_out, d_x, zero_pads, + padded_length, 0, false, math::kBatchLengthWidth); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc index fef230e42d..411a126bc8 100644 --- a/paddle/fluid/operators/sgd_op.cc +++ b/paddle/fluid/operators/sgd_op.cc @@ -21,7 +21,7 @@ class SGDOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("Param"), "Input(Param) of SGDOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Grad"), @@ -42,7 +42,7 @@ class SGDOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param")); return framework::OpKernelType(data_type, ctx.device_context()); } @@ -50,17 +50,20 @@ class SGDOp : public framework::OperatorWithKernel { class SGDOpInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - auto input_var = op_desc.Input("Param")[0]; - for (auto& out_var : op_desc.Output("ParamOut")) { - if (block->FindRecursiveOrCreateVar(input_var).GetType() == - 
framework::proto::VarType::SELECTED_ROWS) { - block->FindRecursiveOrCreateVar(out_var).SetType( - framework::proto::VarType::SELECTED_ROWS); - } else { - block->FindRecursiveOrCreateVar(out_var).SetType( - framework::proto::VarType::LOD_TENSOR); + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto input_var_n = op_desc.Input("Param")[0]; + auto in_var_type = block->FindRecursiveOrCreateVar(input_var_n).GetType(); + PADDLE_ENFORCE(in_var_type == framework::proto::VarType::SELECTED_ROWS || + in_var_type == framework::proto::VarType::LOD_TENSOR, + "The input Var's type should be LoDtensor or SelectedRows," + " but the received var(%s)'s type is %s", + input_var_n, in_var_type); + + for (auto &out_var_n : op_desc.Output("ParamOut")) { + auto &out_var = block->FindRecursiveOrCreateVar(out_var_n); + if (out_var.GetType() != in_var_type) { + out_var.SetType(in_var_type); } } } diff --git a/paddle/fluid/operators/sgd_op.cu b/paddle/fluid/operators/sgd_op.cu index 2436090757..d3f4eba3b2 100644 --- a/paddle/fluid/operators/sgd_op.cu +++ b/paddle/fluid/operators/sgd_op.cu @@ -56,6 +56,12 @@ template class SGDOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + auto* param = ctx.Input("Param"); auto* param_out = ctx.Output("ParamOut"); auto* learning_rate = ctx.Input("LearningRate"); diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index d661b276bc..a05582ae09 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -111,11 +111,10 @@ Example: } // namespace paddle namespace ops = paddle::operators; -USE_CPU_ONLY_OP(concat); REGISTER_OPERATOR(split, ops::SplitOp, 
ops::SplitOpMaker, ops::SplitGradMaker); -REGISTER_OP_CPU_KERNEL(split, - ops::SplitOpKernel, - ops::SplitOpKernel, - ops::SplitOpKernel, - ops::SplitOpKernel); +REGISTER_OP_CPU_KERNEL( + split, ops::SplitOpKernel, + ops::SplitOpKernel, + ops::SplitOpKernel, + ops::SplitOpKernel); diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h index f0c417c705..6f4a25ab5e 100644 --- a/paddle/fluid/operators/split_op.h +++ b/paddle/fluid/operators/split_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include // NOLINT #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" namespace paddle { @@ -28,18 +29,22 @@ class SplitOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* in = ctx.Input("X"); auto outs = ctx.MultiOutput("Out"); - auto in_stride = framework::stride_numel(in->dims()); - int64_t axis = static_cast(ctx.Attr("axis")); + int axis = ctx.Attr("axis"); auto place = ctx.GetPlace(); - size_t input_offset = 0; - for (auto& out : outs) { - out->mutable_data(ctx.GetPlace()); - auto out_stride = framework::stride_numel(out->dims()); - StridedNumelCopyWithAxis(ctx.device_context(), axis, out->data(), - out_stride, in->data() + input_offset, - in_stride, out_stride[axis]); - input_offset += out_stride[axis]; + std::vector shape_refer; + for (size_t j = 0; j < outs.size(); ++j) { + outs[j]->mutable_data(ctx.GetPlace()); + shape_refer.emplace_back(outs[j]); + } + + auto& dev_ctx = ctx.template device_context(); + // Sometimes direct copies will be faster, this maybe need deeply analysis. 
+ if (axis == 0 && outs.size() < 10) { + StridedMemcpyWithAxis0(dev_ctx, *in, shape_refer, &outs); + } else { + math::SplitFunctor functor; + functor(dev_ctx, *in, shape_refer, axis, &outs); } } }; diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index 7a10218e15..c3d83a06f2 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -13,8 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/detail/strided_memcpy.h" - namespace paddle { namespace operators { @@ -98,5 +99,26 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, } } +template +inline void StridedMemcpyWithAxis0( + const platform::DeviceContext& dev_ctx, const framework::Tensor& input, + const std::vector& shape_refer, + std::vector* outputs) { + const framework::DDim in_stride = stride_numel(input.dims()); + const int axis = 0; + size_t input_offset = 0; + + for (size_t i = 0; i < outputs->size(); ++i) { + auto out_stride = stride_numel(shape_refer[i]->dims()); + auto out = outputs->at(i); + if (out != nullptr) { + StridedNumelCopyWithAxis(dev_ctx, axis, out->data(), out_stride, + input.data() + input_offset, in_stride, + out_stride[axis]); + } + input_offset += out_stride[axis]; + } +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 34403c7a7a..11987c61ae 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -43,17 +43,31 @@ class SumKernel : public framework::OpKernel { out->mutable_data(context.GetPlace()); } auto result = EigenVector::Flatten(*out); + auto &place = + *context.template device_context().eigen_device(); + int start = in_place ? 
1 : 0; if (!in_place) { - math::SetConstant constant_functor; - constant_functor(context.template device_context(), out, - 0.0); + if ((in_num >= 2) && in_vars[0]->IsType() && + in_vars[1]->IsType()) { + auto &in_0 = in_vars[0]->Get(); + auto &in_1 = in_vars[1]->Get(); + if (in_0.numel() && in_1.numel()) { + auto in_0_e = EigenVector::Flatten(in_0); + auto in_1_e = EigenVector::Flatten(in_1); + result.device(place) = in_0_e + in_1_e; + start = 2; + } + } + if (start != 2) { + math::SetConstant constant_functor; + constant_functor(context.template device_context(), + out, 0.0); + } } math::SelectedRowsAddToTensor functor; - auto &place = - *context.template device_context().eigen_device(); // If in_place, just skip the first tensor - for (size_t i = in_place ? 1 : 0; i < in_num; i++) { + for (size_t i = start; i < in_num; i++) { if (in_vars[i]->IsType()) { auto &in_t = in_vars[i]->Get(); if (in_t.numel() == 0) { diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc index d854e28039..1e8708f264 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc @@ -148,7 +148,7 @@ struct TruncatedNormal { T operator()(T value) const { auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; - return (std::sqrt(2.0) * Erfinv(2 * p - 1) + mean) * std; + return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean; } }; diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cu b/paddle/fluid/operators/truncated_gaussian_random_op.cu index ad2a9021bf..5a3510babe 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cu +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cu @@ -42,7 +42,7 @@ struct TruncatedNormal { rng.discard(n); T value = dist(rng); auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; - return (std::sqrt(2.0) * erfinvf(2 * p - 1) + mean) * std; + return std::sqrt(2.0) * erfinvf(2 * p - 
1) * std + mean; } }; @@ -52,6 +52,7 @@ class GPUTruncatedGaussianRandomKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); + unsigned int seed = static_cast(context.Attr("seed")); if (seed == 0) { std::random_device rd; diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 763bb40358..aa907595cb 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -23,14 +23,14 @@ namespace operators { template class CPUUniformRandomKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - framework::Tensor* tensor = nullptr; + void Compute(const framework::ExecutionContext &ctx) const override { + framework::Tensor *tensor = nullptr; auto out_var = ctx.OutputVar("Out"); if (out_var->IsType()) { tensor = out_var->GetMutable(); } else if (out_var->IsType()) { auto shape = ctx.Attr>("shape"); - auto* selected_rows = out_var->GetMutable(); + auto *selected_rows = out_var->GetMutable(); tensor = selected_rows->mutable_value(); tensor->Resize(framework::make_ddim(shape)); selected_rows->mutable_rows()->reserve(shape[0]); @@ -39,7 +39,7 @@ class CPUUniformRandomKernel : public framework::OpKernel { "uniform_random_op's output only" "supports SelectedRows and LoDTensor"); } - T* data = tensor->mutable_data(ctx.GetPlace()); + T *data = tensor->mutable_data(ctx.GetPlace()); unsigned int seed = static_cast(ctx.Attr("seed")); std::minstd_rand engine; if (seed == 0) { @@ -60,14 +60,14 @@ class UniformRandomOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { 
PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of UniformRandomOp should not be null."); PADDLE_ENFORCE( ctx->Attrs().Get("min") < ctx->Attrs().Get("max"), "uniform_random's min must less then max"); - auto& shape = ctx->Attrs().Get>("shape"); + auto &shape = ctx->Attrs().Get>("shape"); std::vector temp; temp.reserve(shape.size()); for (auto dim : shape) { @@ -78,7 +78,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( static_cast(ctx.Attr("dtype")), ctx.GetPlace()); @@ -112,17 +112,17 @@ uniform distribution. The random result is in set [min, max]. class UniformRandomOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { auto out_var_name = op_desc.Output("Out").front(); - if (block->FindRecursiveOrCreateVar(out_var_name).GetType() == - framework::proto::VarType::SELECTED_ROWS) { - block->FindRecursiveOrCreateVar(out_var_name) - .SetType(framework::proto::VarType::SELECTED_ROWS); - } else { - block->FindRecursiveOrCreateVar(out_var_name) - .SetType(framework::proto::VarType::LOD_TENSOR); + auto var_data_type = static_cast( + boost::get(op_desc.GetAttr("dtype"))); + + auto out_var = block->FindRecursiveOrCreateVar(out_var_name); + if (out_var.GetType() != framework::proto::VarType::SELECTED_ROWS) { + out_var.SetType(framework::proto::VarType::LOD_TENSOR); } + out_var.SetDataType(var_data_type); } }; diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 2880c09263..b5f472d20f 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -128,7 +128,7 @@ bool MayIUse(const 
cpu_isa_t cpu_isa) { return cpu.has(Cpu::tAVX); case avx2: return cpu.has(Cpu::tAVX2); - case avx512_common: + case avx512f: return cpu.has(Cpu::tAVX512F); case avx512_core: return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) && diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 30c8fbcfce..6810a1651a 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -43,7 +43,7 @@ typedef enum { sse42, avx, avx2, - avx512_common, + avx512f, avx512_core, avx512_core_vnni, avx512_mic, diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index dfc079e986..7d1cf57253 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -35,6 +35,16 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { return it->second.get(); } +const std::vector +DeviceContextPool::GetAllDeviceContexts() const { + std::vector all_device_ctx; + all_device_ctx.reserve(device_contexts_.size()); + for (auto& dev_ctx : device_contexts_) { + all_device_ctx.emplace_back(dev_ctx.second.get()); + } + return all_device_ctx; +} + DeviceContextPool::DeviceContextPool( const std::vector& places) { PADDLE_ENFORCE_GT(places.size(), 0); @@ -198,9 +208,9 @@ class CudnnHolder { CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place), cudnn_holder_(nullptr) { SetDeviceId(place_.device); - compute_capability = GetCUDAComputeCapability(place_.device); - multi_process = GetCUDAMultiProcessors(place_.device); - max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device); + compute_capability_ = GetCUDAComputeCapability(place_.device); + multi_process_ = GetCUDAMultiProcessors(place_.device); + max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device); PADDLE_ENFORCE(cudaStreamCreate(&stream_)); eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_->Reinitialize(&stream_, place); @@ -211,6 
+221,16 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) cudnn_holder_.reset(new CudnnHolder(&stream_, place)); } + driver_version_ = GetCUDADriverVersion(place_.device); + runtime_version_ = GetCUDARuntimeVersion(place_.device); + + LOG(INFO) << "device: " << place_.device + << ", CUDA Capability: " << compute_capability_ + << ", Driver Version: " << driver_version_ / 1000 << "." + << (driver_version_ % 100) / 10 + << ", Runtime Version: " << runtime_version_ / 1000 << "." + << (runtime_version_ % 100) / 10; + callback_manager_.reset(new StreamCallbackManager(stream_)); } @@ -232,11 +252,11 @@ void CUDADeviceContext::Wait() const { } int CUDADeviceContext::GetComputeCapability() const { - return compute_capability; + return compute_capability_; } int CUDADeviceContext::GetMaxPhysicalThreadCount() const { - return multi_process * max_threads_per_mp; + return multi_process_ * max_threads_per_mp_; } Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 7953919515..999bbe00f1 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -135,9 +135,11 @@ class CUDADeviceContext : public DeviceContext { cudaStream_t stream_; cublasHandle_t cublas_handle_; - int compute_capability; - int multi_process; - int max_threads_per_mp; + int compute_capability_; + int runtime_version_; + int driver_version_; + int multi_process_; + int max_threads_per_mp_; mutable std::mutex mtx_; @@ -215,6 +217,9 @@ class DeviceContextPool { /*! \brief Return handle of single device context. */ platform::DeviceContext* Get(const platform::Place& place); + /*! \brief Return all the device contexts. 
*/ + const std::vector GetAllDeviceContexts() const; + template const typename DefaultDeviceContextType::TYPE* GetByPlace( const Place& place) { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index f04395a8ac..a251bfcd99 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -130,6 +130,13 @@ struct EOFException : public std::exception { #define UNLIKELY(condition) (condition == 0) #endif +#if !defined(_WIN32) +#define LIKELY(condition) __builtin_expect(static_cast(condition), 1) +#else +// there is no equivalent intrinsics in msvc. +#define LIKELY(condition) (condition != 0) +#endif + template inline typename std::enable_if::type throw_on_error( bool stat, const Args&... args) { diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index f599e7fbc8..8fff9844db 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -46,6 +46,24 @@ int GetCUDAComputeCapability(int id) { return device_prop.major * 10 + device_prop.minor; } +int GetCUDARuntimeVersion(int id) { + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + int runtime_version = 0; + PADDLE_ENFORCE(cudaRuntimeGetVersion(&runtime_version), + "cudaRuntimeGetVersion failed in " + "paddle::platform::cudaRuntimeGetVersion"); + return runtime_version; +} + +int GetCUDADriverVersion(int id) { + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + int driver_version = 0; + PADDLE_ENFORCE(cudaDriverGetVersion(&driver_version), + "cudaDriverGetVersion failed in " + "paddle::platform::GetCUDADriverVersion"); + return driver_version; +} + int GetCUDAMultiProcessors(int id) { PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int count; diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index f4640d3eaa..be44158431 100644 --- a/paddle/fluid/platform/gpu_info.h +++ 
b/paddle/fluid/platform/gpu_info.h @@ -29,6 +29,12 @@ int GetCUDADeviceCount(); //! Get the compute capability of the ith GPU (format: major * 10 + minor) int GetCUDAComputeCapability(int i); +//! Get the runtime version of the ith GPU +int GetCUDARuntimeVersion(int id); + +//! Get the driver version of the ith GPU +int GetCUDADriverVersion(int id); + //! Get the MultiProcessors of the ith GPU. int GetCUDAMultiProcessors(int i); diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 4c99f4be32..ab91ca5345 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -116,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector devices) { platform::SetNumThreads(FLAGS_paddle_num_threads); #endif - if (platform::jit::MayIUse(platform::jit::avx512_common)) { + if (platform::jit::MayIUse(platform::jit::avx512f)) { #ifndef __AVX512F__ LOG(WARNING) << "AVX512F is available, Please re-compile on local machine"; #endif diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 652a6ec7a4..da46a1abe1 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -30,6 +30,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/string/printf.h" +DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); + namespace paddle { namespace platform { @@ -193,6 +195,13 @@ RecordEvent::~RecordEvent() { PopEvent(name_, dev_ctx_); } +RecordRPCEvent::RecordRPCEvent(const std::string& name, + const DeviceContext* dev_ctx) { + if (FLAGS_enable_rpc_profiler) { + event_.reset(new platform::RecordEvent(name, dev_ctx)); + } +} + RecordBlock::RecordBlock(int block_id) : is_enabled_(false), start_ns_(PosixInNsec()) { std::lock_guard l(profiler_mu); @@ -276,7 +285,7 @@ struct EventItem { // Print results void PrintProfiler(const std::vector>& events_table, const std::string& sorted_domain, const size_t name_width, - const size_t data_width, double total) { + const size_t data_width, bool merge_thread) { // Output header information std::cout << "\n------------------------->" << " Profiling Report " @@ -292,6 +301,10 @@ void PrintProfiler(const std::vector>& events_table, PADDLE_THROW("Invalid profiler state", g_state); } + if (merge_thread) { + std::cout << "Note! This Report merge all thread info into one." 
+ << std::endl; + } std::cout << "Place: " << place << std::endl; std::cout << "Time unit: ms" << std::endl; std::cout << "Sorted by " << sorted_domain @@ -312,8 +325,7 @@ void PrintProfiler(const std::vector>& events_table, << std::setw(data_width) << event_item.min_time << std::setw(data_width) << event_item.max_time << std::setw(data_width) << event_item.ave_time - << std::setw(data_width) << event_item.total_time / total - << std::endl; + << std::setw(data_width) << event_item.ratio << std::endl; } } std::cout << std::endl; @@ -321,8 +333,10 @@ void PrintProfiler(const std::vector>& events_table, // Parse the event list and output the profiling report void ParseEvents(const std::vector>& events, + bool merge_thread, EventSortingKey sorted_by = EventSortingKey::kDefault) { if (g_state == ProfilerState::kDisabled) return; + if (merge_thread && events.size() < 2) return; std::string sorted_domain; std::function sorted_func; @@ -361,34 +375,55 @@ void ParseEvents(const std::vector>& events, sorted_domain = "event first end time"; } + const std::vector>* analyze_events; + std::vector> merged_events_list; + if (merge_thread) { + std::vector merged_events; + for (size_t i = 0; i < events.size(); ++i) { + for (size_t j = 0; j < events[i].size(); ++j) { + merged_events.push_back(events[i][j]); + } + } + merged_events_list.push_back(merged_events); + analyze_events = &merged_events_list; + } else { + analyze_events = &events; + } + std::vector> events_table; size_t max_name_width = 0; - double total = 0.; // the total time - for (size_t i = 0; i < events.size(); i++) { + for (size_t i = 0; i < (*analyze_events).size(); i++) { + double total = 0.; // the total time in one thread std::list pushed_events; std::vector event_items; std::unordered_map event_idx; - for (size_t j = 0; j < events[i].size(); j++) { - if (events[i][j].type() == EventType::kPushRange) { - pushed_events.push_back(events[i][j]); - } else if (events[i][j].type() == EventType::kPopRange) { + for (size_t 
j = 0; j < (*analyze_events)[i].size(); j++) { + if ((*analyze_events)[i][j].type() == EventType::kPushRange) { + pushed_events.push_back((*analyze_events)[i][j]); + } else if ((*analyze_events)[i][j].type() == EventType::kPopRange) { std::list::reverse_iterator rit = pushed_events.rbegin(); while (rit != pushed_events.rend() && - rit->name() != events[i][j].name()) { + rit->name() != (*analyze_events)[i][j].name()) { ++rit; } if (rit != pushed_events.rend()) { double event_time = (g_state == ProfilerState::kCUDA || g_state == ProfilerState::kAll) - ? rit->CudaElapsedMs(events[i][j]) - : rit->CpuElapsedMs(events[i][j]); + ? rit->CudaElapsedMs((*analyze_events)[i][j]) + : rit->CpuElapsedMs((*analyze_events)[i][j]); total += event_time; - std::string event_name = - "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); - max_name_width = std::max(max_name_width, event_name.size()); + std::string event_name; + if (merge_thread) { + event_name = rit->name(); + max_name_width = std::max(max_name_width, event_name.size()); + } else { + event_name = "thread" + std::to_string(rit->thread_id()) + "::" + + rit->name(); + max_name_width = std::max(max_name_width, event_name.size()); + } if (event_idx.find(event_name) == event_idx.end()) { event_idx[event_name] = event_items.size(); @@ -413,7 +448,7 @@ void ParseEvents(const std::vector>& events, pushed_events.erase((++rit).base()); } else { LOG(WARNING) << "Cannot find the push marker of event \'" - << events[i][j].name() + << (*analyze_events)[i][j].name() << "\', which will be ignored in profiling report."; } } @@ -421,6 +456,7 @@ void ParseEvents(const std::vector>& events, // average time for (auto& item : event_items) { item.ave_time = item.total_time / item.calls; + item.ratio = item.total_time / total; } // sort if (sorted_by != EventSortingKey::kDefault) { @@ -438,7 +474,8 @@ void ParseEvents(const std::vector>& events, } // Print report - PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12, 
total); + PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12, + merge_thread); } void DisableProfiler(EventSortingKey sorted_key, @@ -449,7 +486,8 @@ void DisableProfiler(EventSortingKey sorted_key, Mark("_stop_profiler_", nullptr); std::vector> all_events = GetAllEvents(); - ParseEvents(all_events, sorted_key); + ParseEvents(all_events, true, sorted_key); + ParseEvents(all_events, false, sorted_key); ResetProfiler(); DeviceTracer* tracer = GetDeviceTracer(); if (tracer->IsEnabled()) { diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 38630686f7..e8eae874af 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -71,6 +71,7 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx); #if !defined(_WIN32) struct RecordEvent { + // dev_ctx can be set to nullptr if device is cpu. RecordEvent(const std::string& name, const DeviceContext* dev_ctx); ~RecordEvent(); @@ -86,6 +87,16 @@ struct RecordEvent { std::string full_name_; }; +class RecordRPCEvent { + public: + // dev_ctx can be set to nullptr if device is cpu. + RecordRPCEvent(const std::string& name, const DeviceContext* dev_ctx); + ~RecordRPCEvent() {} + + private: + std::unique_ptr event_; +}; + struct RecordBlock { explicit RecordBlock(int block_id); ~RecordBlock(); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 311cd94460..339a7c98c6 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -57,6 +57,10 @@ limitations under the License. 
*/ #include "pybind11/stl.h" +DEFINE_bool(reader_queue_speed_test_mode, false, + "If set true, the queue.pop will only get data from queue but not " + "remove the data from queue for speed testing"); + // disable auto conversion to list in Python PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); @@ -157,7 +161,50 @@ PYBIND11_PLUGIN(core) { .def("_get_double_element", TensorGetElement) .def("_dtype", [](Tensor &self) { return ToDataType(self.type()); }); - py::class_(m, "LoDTensor") + py::class_(m, "LoDTensor", R"DOC( + LoDTensor is a Tensor with optional LoD information. + + np.array(lod_tensor) can convert LoDTensor to numpy array. + lod_tensor.lod() can retrieve the LoD information. + + LoD is short for Level of Details and is usually used for varied sequence + length. You can skip the following comment if you don't need optional LoD. + + For example: + A LoDTensor X can look like the example below. It contains 2 sequences. + The first has length 2 and the second has length 3, as described by x.lod. + + The first tensor dimension 5=2+3 is calculated from LoD if it's available. + It means the total number of sequence element. In X, each element has 2 + columns, hence [5, 2]. + + x.lod = [[2, 3]] + x.data = [[1, 2], [3, 4], + [5, 6], [7, 8], [9, 10]] + x.shape = [5, 2] + + LoD can have multiple levels (for example, a paragraph can have multiple + sentences and a sentence can have multiple words). In the following + LodTensor Y, the lod_level is 2. It means there are 2 sequence, the + first sequence length is 2 (has 2 sub-sequences), the second one's + length is 1. The first sequence's 2 sub-sequences have length 2 and 2, + respectively. And the second sequence's 1 sub-sequence has length 3. + + y.lod = [[2 1], [2 2 3]] + y.shape = [2+2+3, ...] + + Note: + In above description, LoD is length-based. In Paddle internal + implementation, lod is offset-based. 
Hence, internally, + y.lod is represented as [[0, 2, 3], [0, 2, 4, 7]] (length-based + equivlent would be [[2-0, 3-2], [2-0, 4-2, 7-4]]). + + Sometimes LoD is called recursive_sequence_length to be more + self-explanatory. In this case, it must be length-based. Due to history + reasons. when LoD is called lod in public API, it might be offset-based. + Users should be careful about it. + + )DOC") .def_buffer( [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) .def("__init__", @@ -337,7 +384,8 @@ All parameter, weight, gradient are variables in Paddle. return make_ddim(shape); }); auto *holder = var.GetMutable(); - holder->InitOnce(capacity, dims); + holder->InitOnce(capacity, dims, + FLAGS_reader_queue_speed_test_mode); return holder->GetQueue(); }, py::return_value_policy::copy); @@ -624,16 +672,17 @@ All parameter, weight, gradient are variables in Paddle. ExecutionStrategy allows the user to more preciously control how to run the program in ParallelExecutor by setting the property. - The available properties include: - use_cuda (bool): Whether to use CUDA or not. Default True. - num_threads (int): The number of threads that used to run the - operators in ParallelExecutor. If it is not set, it will be - set in ParallelExecutor according to the device count. - Default 0. - allow_op_delay (bool): Whether to delay the communication operators - to run. Default False. - num_iteration_per_drop_scope (int): how many iterations between - the two dropping local scopes. Default 100. + Examples: + .. code-block:: python + + exec_strategy = fluid.ExecutionStrategy() + exec_strategy.num_threads = 4 + + train_exe = fluid.ParallelExecutor(use_cuda=True, + loss_name=loss.name, + exec_strategy=exec_strategy) + + train_loss, = train_exe.run([loss.name], feed=feed_dict) )DOC"); @@ -643,19 +692,34 @@ All parameter, weight, gradient are variables in Paddle. 
[](const ExecutionStrategy &self) { return self.num_threads_; }, [](ExecutionStrategy &self, size_t num_threads) { self.num_threads_ = num_threads; - }) + }, + R"DOC(The type is INT, num_threads represents the size of thread pool that + used to run the operators of the current program in ParallelExecutor. + If :math:`num\_threads=1`, all the operators will execute one by one, + but the order maybe difference between iterations. + If it is not set, it will be set in ParallelExecutor according to the + device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU, + :math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor. + if it is not set, ParallelExecutor will get the cpu count by calling + `multiprocessing.cpu_count()`. Default 0.)DOC") .def_property( "use_cuda", [](const ExecutionStrategy &self) { return self.use_cuda_; }, [](ExecutionStrategy &self, bool use_cuda) { self.use_cuda_ = use_cuda; - }) + }) // FIXME(chengduo): Doesn't add doc for 'use_cuda', use_cuda may + // make user confuse, because ParallelExecutor has a parameter named + // 'use_cuda' too, in current implementation, ParallelExecutor's + // 'use_cuda' will rewrite ExecutionStrategy's 'use_cuda'. .def_property( "allow_op_delay", [](const ExecutionStrategy &self) { return self.allow_op_delay_; }, [](ExecutionStrategy &self, bool allow_op_delay) { self.allow_op_delay_ = allow_op_delay; - }) + }, + R"DOC(The type is BOOL, allow_op_delay represents whether to delay the + communication operators to run, it may make the execution faster. + Note that in some models, allow_op_delay may cause program hang. Default False.)DOC") .def_property( "num_iteration_per_drop_scope", [](const ExecutionStrategy &self) { @@ -663,7 +727,19 @@ All parameter, weight, gradient are variables in Paddle. 
}, [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) { self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope; - }); + }, + R"DOC(The type is INT, num_iteration_per_drop_scope indicates how + many iterations to clean up the temp variables which + is generated during execution. It may make the execution faster, + because the temp variable's shape maybe the same between two iterations. Default 100. + + NOTES: + 1. If you fetch data when calling the 'run', the ParallelExecutor + will clean up the temp variables at the end of the current iteration. + 2. In some NLP model, it may cause the GPU memory is insufficient, + in this case, you should reduce `num_iteration_per_drop_scope`. + )DOC"); + exec_strategy.def_property( "use_experimental_executor", [](const ExecutionStrategy &self) { @@ -678,20 +754,17 @@ All parameter, weight, gradient are variables in Paddle. BuildStrategy allows the user to more preciously control how to build the SSA Graph in ParallelExecutor by setting the property. - The available properties include: - reduce_strategy (str): There are two reduce strategies, 'AllReduce' - and 'Reduce'. If you want that all parameters will be optimized - on all devices, you can choose 'AllReduce'; if you choose - 'Reduce', all parameters will be evenly allocated to different - devices for optimization, and then broadcast the optimized - parameter to other devices. Default 'AllReduce'. - gradient_scale_strategy (str): There are two ways of defining loss@grad, - 'CoeffNumDevice' and 'Customized'. By default, ParallelExecutor - sets the loss@grad according to the number of devices. If you want - to customize loss@grad, you can choose 'Customized'. - Default 'CoeffNumDevice'. - debug_graphviz_path (str): Whether to write the SSA Graph to file in the - form of graphviz. It is useful for debugging. Default "". + Examples: + .. 
code-block:: python + + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce + + train_exe = fluid.ParallelExecutor(use_cuda=True, + loss_name=loss.name, + build_strategy=build_strategy) + + train_loss, = train_exe.run([loss.name], feed=feed_dict) )DOC"); py::enum_(build_strategy, "ReduceStrategy") @@ -710,31 +783,51 @@ All parameter, weight, gradient are variables in Paddle. [](const BuildStrategy &self) { return self.reduce_; }, [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) { self.reduce_ = strategy; - }) + }, + R"DOC(The type is STR, there are two reduce strategies in ParallelExecutor, + 'AllReduce' and 'Reduce'. If you want that all the parameters' + optimization are done on all devices independently, you should choose 'AllReduce'; + if you choose 'Reduce', all the parameters' optimization will be evenly distributed + to different devices, and then broadcast the optimized parameter to other devices. + In some models, `Reduce` is faster. Default 'AllReduce'. )DOC") .def_property( "gradient_scale_strategy", [](const BuildStrategy &self) { return self.gradient_scale_; }, [](BuildStrategy &self, BuildStrategy::GradientScaleStrategy strategy) { self.gradient_scale_ = strategy; - }) + }, + R"DOC(The type is STR, there are three ways of defining :math:`loss@grad` in + ParallelExecutor, 'CoeffNumDevice', 'One' and 'Customized'. By default, + ParallelExecutor sets the :math:`loss@grad` according to the number of devices. + If you want to customize :math:`loss@grad`, you can choose 'Customized'. + Default 'CoeffNumDevice'.)DOC") .def_property( "debug_graphviz_path", [](const BuildStrategy &self) { return self.debug_graphviz_path_; }, [](BuildStrategy &self, const std::string &path) { self.debug_graphviz_path_ = path; - }) + }, + R"DOC(The type is STR, debug_graphviz_path indicate the path that + writing the SSA Graph to file in the form of graphviz, you. + It is useful for debugging. 
Default "")DOC") .def_property( "enable_data_balance", [](const BuildStrategy &self) { return self.enable_data_balance_; }, - [](BuildStrategy &self, bool b) { self.enable_data_balance_ = b; }) - .def_property("fuse_elewise_add_act_ops", - [](const BuildStrategy &self) { - return self.fuse_elewise_add_act_ops_; - }, - [](BuildStrategy &self, bool b) { - self.fuse_elewise_add_act_ops_ = b; - }) + [](BuildStrategy &self, bool b) { + self.enable_data_balance_ = b; + }) // FIXME(chengudo): enable_data_balance seems not important + .def_property( + "fuse_elewise_add_act_ops", + [](const BuildStrategy &self) { + return self.fuse_elewise_add_act_ops_; + }, + [](BuildStrategy &self, bool b) { + self.fuse_elewise_add_act_ops_ = b; + }, + R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether + to fuse elementwise_add_op and activation_op, + it may make the execution faster. Default False)DOC") .def("_create_passes_from_strategy", [](BuildStrategy &self) -> std::shared_ptr { return self.CreatePassesFromStrategy(); diff --git a/paddle/fluid/train/demo/README.md b/paddle/fluid/train/demo/README.md index 41b01d3382..191da20669 100644 --- a/paddle/fluid/train/demo/README.md +++ b/paddle/fluid/train/demo/README.md @@ -15,7 +15,7 @@ cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \ -DWITH_MKL=OFF \ -DWITH_MKLDNN=OFF make -j8 -make -j8 inference_lib_dist +make -j8 fluid_lib_dist ``` ### step 2. generate program desc diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index e133323ae4..85493c1054 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -390,7 +390,9 @@ function run_mac_test() { Running unit tests ... 
======================================== EOF - + #remove proxy here to fix dist error on mac + export http_proxy= + export https_proxy= # TODO: jiabin need to refine this part when these tests fixed on mac ctest --output-on-failure -j $1 # make install should also be test when unittest @@ -648,42 +650,47 @@ function gen_capi_package() { fi } -function gen_fluid_inference_lib() { +function gen_fluid_lib() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then cat <>> with program.lr_schedule_guard(): >>> lr = lr * decay """ + + tmp_role = self._current_role + tmp_var = self._op_role_var + OpRole = core.op_proto_and_checker_maker.OpRole self._current_role = OpRole.LRSched # TODO(typhoonzero): how to set target learning rate var self._op_role_var = [] yield - self._op_role_var = [] - self._current_role = OpRole.Forward + self._op_role_var = tmp_var + self._current_role = tmp_role def __str__(self): """ diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index bd9727b6ac..dc317de9ab 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -324,10 +324,19 @@ class LayerHelper(object): raise ValueError("no Parameter name %s found" % name) return param - def create_tmp_variable(self, dtype, stop_gradient=False): + def create_variable_for_type_inference(self, dtype, stop_gradient=False): + """Create a temporary variable that should be type inferred layer. + + Note: + The default type will be set to LOD_TENSOR. However, when + the var is used as operator output, its type will be updated + based on operator's `VarTypeInference` implementation in + infer_var_type. 
+ """ return self.main_program.current_block().create_var( name=unique_name.generate(".".join([self.name, 'tmp'])), dtype=dtype, + type=core.VarDesc.VarType.LOD_TENSOR, persistable=False, stop_gradient=stop_gradient) @@ -388,7 +397,7 @@ class LayerHelper(object): b = self.create_parameter( attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True) - tmp = self.create_tmp_variable(dtype=input_var.dtype) + tmp = self.create_variable_for_type_inference(dtype=input_var.dtype) self.append_op( type='elementwise_add', inputs={'X': [input_var], @@ -414,7 +423,7 @@ class LayerHelper(object): tmp = input_var # NOTE(dzhwinter): some activation support inplace compution. if not core.IsInplace(act_type): - tmp = self.create_tmp_variable(dtype=input_var.dtype) + tmp = self.create_variable_for_type_inference(dtype=input_var.dtype) self.append_op( type=act_type, inputs={"X": [input_var]}, diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 4af97e8632..459be4339b 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -80,8 +80,8 @@ def split_lod_tensor(input, mask, level=0): """ helper = LayerHelper('split_lod_tensor', **locals()) - out_true = helper.create_tmp_variable(dtype=input.dtype) - out_false = helper.create_tmp_variable(dtype=input.dtype) + out_true = helper.create_variable_for_type_inference(dtype=input.dtype) + out_false = helper.create_variable_for_type_inference(dtype=input.dtype) helper.append_op( type='split_lod_tensor', inputs={ @@ -131,7 +131,7 @@ def merge_lod_tensor(in_true, in_false, x, mask, level=0): in_true=out_true, in_false=out_false, mask=y, x=x, level=level) """ helper = LayerHelper('merge_lod_tensor', **locals()) - out = helper.create_tmp_variable(dtype=in_true.dtype) + out = helper.create_variable_for_type_inference(dtype=in_true.dtype) helper.append_op( type='merge_lod_tensor', inputs={'X': x, @@ -524,7 +524,7 @@ class StaticRNN(object): if 
not isinstance(o, Variable): raise TypeError("step output takes a Variable") - tmp_o = self.helper.create_tmp_variable(dtype=o.dtype) + tmp_o = self.helper.create_variable_for_type_inference(dtype=o.dtype) self.helper.append_op( type='rnn_memory_helper', inputs={'X': [o]}, @@ -606,7 +606,8 @@ class StaticRNN(object): pre_memories.append(mem.pre_mem.name) mem_var = rnn_block.var(mem.mem.name) assert isinstance(mem_var, Variable) - new_mem = self.helper.create_tmp_variable(dtype=mem_var.dtype) + new_mem = self.helper.create_variable_for_type_inference( + dtype=mem_var.dtype) rnn_block.append_op( type='rnn_memory_helper', @@ -813,7 +814,7 @@ def max_sequence_len(rank_table): ${out_comment}. """ helper = LayerHelper("max_seqence_len", **locals()) - res = helper.create_tmp_variable(dtype="int64") + res = helper.create_variable_for_type_inference(dtype="int64") helper.append_op( type="max_sequence_len", inputs={"RankTable": rank_table}, @@ -884,7 +885,7 @@ def array_to_lod_tensor(x, table): lod_tensor = fluid.layers.array_to_lod_tensor(array, table) """ helper = LayerHelper("array_to_lod_tensor", **locals()) - tmp = helper.create_tmp_variable(dtype=x.dtype) + tmp = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type="array_to_lod_tensor", inputs={'X': x, @@ -915,7 +916,7 @@ def increment(x, value=1.0, in_place=True): """ helper = LayerHelper("increment", **locals()) if not in_place: - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) else: out = x helper.append_op( @@ -1012,7 +1013,7 @@ def less_than(x, y, force_cpu=None, cond=None, **ignored): """ helper = LayerHelper("less_than", **locals()) if cond is None: - cond = helper.create_tmp_variable(dtype='bool') + cond = helper.create_variable_for_type_inference(dtype='bool') cond.stop_gradient = True attrs = dict() @@ -1051,7 +1052,7 @@ def equal(x, y, cond=None, **ignored): """ helper = LayerHelper("equal", **locals()) if cond is 
None: - cond = helper.create_tmp_variable(dtype='bool') + cond = helper.create_variable_for_type_inference(dtype='bool') cond.stop_gradient = True helper.append_op( @@ -1098,7 +1099,7 @@ def array_read(array, i): array, Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY: raise TypeError("array should be tensor array vairable") - out = helper.create_tmp_variable(dtype=array.dtype) + out = helper.create_variable_for_type_inference(dtype=array.dtype) helper.append_op( type='read_from_array', inputs={'X': [array], @@ -1133,7 +1134,7 @@ def shrink_memory(x, i, table): usage. """ helper = LayerHelper('shrink_memory', **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type='shrink_rnn_memory', inputs={'X': [x], @@ -1170,7 +1171,7 @@ def array_length(array): """ helper = LayerHelper('array_length', **locals()) - tmp = helper.create_tmp_variable(dtype='int64') + tmp = helper.create_variable_for_type_inference(dtype='int64') tmp.stop_gradient = True helper.append_op( type='lod_array_length', inputs={'X': [array]}, outputs={'Out': [tmp]}) @@ -1590,7 +1591,7 @@ class DynamicRNN(object): self.mem_dict = dict() self.output_array = [] self.outputs = [] - self.cond = self.helper.create_tmp_variable(dtype='bool') + self.cond = self.helper.create_variable_for_type_inference(dtype='bool') self.cond.stop_gradient = False self.while_op = While(self.cond) self.input_array = [] @@ -1924,7 +1925,7 @@ def reorder_lod_tensor_by_rank(x, rank_table): helper.is_instance('x', Variable) helper.is_instance('rank_table', Variable) - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type='reorder_lod_tensor_by_rank', inputs={'X': [x], @@ -1958,7 +1959,7 @@ def is_empty(x, cond=None, **ignored): """ helper = LayerHelper("is_empty", **locals()) if cond is None: - cond = helper.create_tmp_variable(dtype='bool') + 
cond = helper.create_variable_for_type_inference(dtype='bool') cond.stop_gradient = True elif not isinstance(cond, Variable): raise TypeError("cond takes a variable") diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 1cfcbbb9c1..b94b59631a 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -147,10 +147,11 @@ def rpn_target_assign(bbox_pred, helper = LayerHelper('rpn_target_assign', **locals()) # Assign target label to anchors - loc_index = helper.create_tmp_variable(dtype='int32') - score_index = helper.create_tmp_variable(dtype='int32') - target_label = helper.create_tmp_variable(dtype='int32') - target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype) + loc_index = helper.create_variable_for_type_inference(dtype='int32') + score_index = helper.create_variable_for_type_inference(dtype='int32') + target_label = helper.create_variable_for_type_inference(dtype='int32') + target_bbox = helper.create_variable_for_type_inference( + dtype=anchor_box.dtype) helper.append_op( type="rpn_target_assign", inputs={ @@ -282,7 +283,8 @@ def detection_output(loc, scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape) scores = nn.transpose(scores, perm=[0, 2, 1]) scores.stop_gradient = True - nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) + nmsed_outs = helper.create_variable_for_type_inference( + dtype=decoded_box.dtype) helper.append_op( type="multiclass_nms", inputs={'Scores': scores, @@ -314,7 +316,7 @@ def iou_similarity(x, y, name=None): """ helper = LayerHelper("iou_similarity", **locals()) if name is None: - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) else: out = helper.create_variable( name=name, dtype=x.dtype, persistable=False) @@ -351,7 +353,8 @@ def box_coder(prior_box, helper = LayerHelper("box_coder", **locals()) if name is None: - output_box = 
helper.create_tmp_variable(dtype=prior_box.dtype) + output_box = helper.create_variable_for_type_inference( + dtype=prior_box.dtype) else: output_box = helper.create_variable( name=name, dtype=prior_box.dtype, persistable=False) @@ -382,7 +385,7 @@ def polygon_box_transform(input, name=None): """ helper = LayerHelper("polygon_box_transform", **locals()) if name is None: - output = helper.create_tmp_variable(dtype=input.dtype) + output = helper.create_variable_for_type_inference(dtype=input.dtype) else: output = helper.create_variable( name=name, dtype=prior_box.input, persistable=False) @@ -450,7 +453,7 @@ def detection_map(detect_res, helper = LayerHelper("detection_map", **locals()) def __create_var(type): - return helper.create_tmp_variable(dtype=type) + return helper.create_variable_for_type_inference(dtype=type) map_out = __create_var('float32') accum_pos_count_out = out_states[0] if out_states else __create_var('int32') @@ -557,8 +560,9 @@ def bipartite_match(dist_matrix, >>> matched_indices, matched_dist = fluid.layers.bipartite_match(iou) """ helper = LayerHelper('bipartite_match', **locals()) - match_indices = helper.create_tmp_variable(dtype='int32') - match_distance = helper.create_tmp_variable(dtype=dist_matrix.dtype) + match_indices = helper.create_variable_for_type_inference(dtype='int32') + match_distance = helper.create_variable_for_type_inference( + dtype=dist_matrix.dtype) helper.append_op( type='bipartite_match', inputs={'DistMat': dist_matrix}, @@ -644,8 +648,8 @@ def target_assign(input, gt, matched_indices, mismatch_value=0) """ helper = LayerHelper('target_assign', **locals()) - out = helper.create_tmp_variable(dtype=input.dtype) - out_weight = helper.create_tmp_variable(dtype='float32') + out = helper.create_variable_for_type_inference(dtype=input.dtype) + out_weight = helper.create_variable_for_type_inference(dtype='float32') helper.append_op( type='target_assign', inputs={ @@ -816,9 +820,10 @@ def ssd_loss(location, conf_loss = nn.reshape( 
x=conf_loss, shape=(num, num_prior), actual_shape=actual_shape) conf_loss.stop_gradient = True - neg_indices = helper.create_tmp_variable(dtype='int32') + neg_indices = helper.create_variable_for_type_inference(dtype='int32') dtype = matched_indices.dtype - updated_matched_indices = helper.create_tmp_variable(dtype=dtype) + updated_matched_indices = helper.create_variable_for_type_inference( + dtype=dtype) helper.append_op( type='mine_hard_examples', inputs={ @@ -998,8 +1003,8 @@ def prior_box(input, max_sizes = [max_sizes] attrs['max_sizes'] = max_sizes - box = helper.create_tmp_variable(dtype) - var = helper.create_tmp_variable(dtype) + box = helper.create_variable_for_type_inference(dtype) + var = helper.create_variable_for_type_inference(dtype) helper.append_op( type="prior_box", inputs={"Input": input, @@ -1337,8 +1342,8 @@ def anchor_generator(input, 'offset': offset } - anchor = helper.create_tmp_variable(dtype) - var = helper.create_tmp_variable(dtype) + anchor = helper.create_variable_for_type_inference(dtype) + var = helper.create_variable_for_type_inference(dtype) helper.append_op( type="anchor_generator", inputs={"Input": input}, @@ -1384,7 +1389,7 @@ def roi_perspective_transform(input, """ helper = LayerHelper('roi_perspective_transform', **locals()) dtype = helper.input_dtype() - out = helper.create_tmp_variable(dtype) + out = helper.create_variable_for_type_inference(dtype) helper.append_op( type="roi_perspective_transform", inputs={"X": input, @@ -1418,11 +1423,15 @@ def generate_proposal_labels(rpn_rois, helper = LayerHelper('generate_proposal_labels', **locals()) - rois = helper.create_tmp_variable(dtype=rpn_rois.dtype) - labels_int32 = helper.create_tmp_variable(dtype=gt_classes.dtype) - bbox_targets = helper.create_tmp_variable(dtype=rpn_rois.dtype) - bbox_inside_weights = helper.create_tmp_variable(dtype=rpn_rois.dtype) - bbox_outside_weights = helper.create_tmp_variable(dtype=rpn_rois.dtype) + rois = 
helper.create_variable_for_type_inference(dtype=rpn_rois.dtype) + labels_int32 = helper.create_variable_for_type_inference( + dtype=gt_classes.dtype) + bbox_targets = helper.create_variable_for_type_inference( + dtype=rpn_rois.dtype) + bbox_inside_weights = helper.create_variable_for_type_inference( + dtype=rpn_rois.dtype) + bbox_outside_weights = helper.create_variable_for_type_inference( + dtype=rpn_rois.dtype) helper.append_op( type="generate_proposal_labels", @@ -1504,8 +1513,10 @@ def generate_proposals(scores, """ helper = LayerHelper('generate_proposals', **locals()) - rpn_rois = helper.create_tmp_variable(dtype=bbox_deltas.dtype) - rpn_roi_probs = helper.create_tmp_variable(dtype=scores.dtype) + rpn_rois = helper.create_variable_for_type_inference( + dtype=bbox_deltas.dtype) + rpn_roi_probs = helper.create_variable_for_type_inference( + dtype=scores.dtype) helper.append_op( type="generate_proposals", inputs={ diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 81c78cba21..95e13669ad 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -55,7 +55,11 @@ def data(name, Args: name(str): The name/alias of the function shape(list): Tuple declaring the shape. - append_batch_size(bool): Whether or not to append the data as a batch. + append_batch_size(bool): + 1. If true, it prepends -1 to the shape. + For example if shape=[1], the resulting shape is [-1, 1]. + 2. If shape contains -1, such as shape=[1, -1], + append_batch_size will be enforced to be False (ineffective). dtype(int|float): The type of data : float32, float_16, int etc type(VarType): The output type. By default it is LOD_TENSOR. lod_level(int): The LoD Level. 0 means the input data is not a sequence. 
@@ -950,7 +954,7 @@ def read_file(reader): """ helper = LayerHelper('read_file') out = [ - helper.create_tmp_variable( + helper.create_variable_for_type_inference( stop_gradient=True, dtype='float32') for _ in range(len(reader.desc.shapes())) ] diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 8c11921d9b..eea0a362a0 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -202,10 +202,12 @@ def generate_layer_fn(op_type): out_var = out[0] if (isinstance(out, list) or isinstance(out, tuple)) else out else: - out_var = helper.create_tmp_variable(dtype=dtype) + out_var = helper.create_variable_for_type_inference(dtype=dtype) outputs[o_name] = [out_var] for name in intermediate_output_names: - outputs[name] = [helper.create_tmp_variable(dtype=dtype)] + outputs[name] = [ + helper.create_variable_for_type_inference(dtype=dtype) + ] helper.append_op( type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs) return helper.append_activation(out_var) @@ -229,7 +231,7 @@ def generate_layer_fn_noattr(op_type): def func(x, name=None): helper = LayerHelper(op_type, **locals()) - output = helper.create_tmp_variable(dtype=x.dtype) + output = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op(type=op_type, inputs={"X": x}, outputs={"Out": output}) return output diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py index a3064b565d..b2d2c93ead 100644 --- a/python/paddle/fluid/layers/metric_op.py +++ b/python/paddle/fluid/layers/metric_op.py @@ -58,11 +58,11 @@ def accuracy(input, label, k=1, correct=None, total=None): """ helper = LayerHelper("accuracy", **locals()) topk_out, topk_indices = nn.topk(input, k=k) - acc_out = helper.create_tmp_variable(dtype="float32") + acc_out = helper.create_variable_for_type_inference(dtype="float32") if correct is None: - correct 
= helper.create_tmp_variable(dtype="int64") + correct = helper.create_variable_for_type_inference(dtype="int64") if total is None: - total = helper.create_tmp_variable(dtype="int64") + total = helper.create_variable_for_type_inference(dtype="int64") helper.append_op( type="accuracy", inputs={ @@ -124,8 +124,8 @@ def auc(input, auc_out=fluid.layers.auc(input=prediction, label=label) """ helper = LayerHelper("auc", **locals()) - auc_out = helper.create_tmp_variable(dtype="float64") - batch_auc_out = helper.create_tmp_variable(dtype="float64") + auc_out = helper.create_variable_for_type_inference(dtype="float64") + batch_auc_out = helper.create_variable_for_type_inference(dtype="float64") # make tp, tn, fp, fn persistable, so that can accumulate all batches. # for batch auc diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 8c0ef7a824..cca618b9ad 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -56,6 +56,7 @@ __all__ = [ 'sequence_expand', 'sequence_expand_as', 'sequence_pad', + 'sequence_unpad', 'lstm_unit', 'reduce_sum', 'reduce_mean', @@ -64,6 +65,7 @@ __all__ = [ 'reduce_prod', 'sequence_first_step', 'sequence_last_step', + 'sequence_slice', 'dropout', 'split', 'ctc_greedy_decoder', @@ -94,6 +96,7 @@ __all__ = [ 'pad_constant_like', 'label_smooth', 'roi_pool', + 'roi_align', 'dice_loss', 'image_resize', 'image_resize_short', @@ -107,6 +110,7 @@ __all__ = [ 'log', 'crop', 'rank_loss', + 'margin_rank_loss', 'elu', 'relu6', 'pow', @@ -150,6 +154,7 @@ __all__ = [ 'mul', 'sigmoid_cross_entropy_with_logits', 'maxout', + 'affine_channel', ] @@ -237,7 +242,7 @@ def fc(input, w = helper.create_parameter( attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False) - tmp = helper.create_tmp_variable(dtype) + tmp = helper.create_variable_for_type_inference(dtype) helper.append_op( type="mul", inputs={"X": input_var, @@ -250,7 +255,7 @@ def fc(input, if len(mul_results) == 1: pre_bias = 
mul_results[0] else: - pre_bias = helper.create_tmp_variable(dtype) + pre_bias = helper.create_variable_for_type_inference(dtype) helper.append_op( type="sum", inputs={"X": mul_results}, @@ -309,7 +314,7 @@ def embedding(input, helper = LayerHelper('embedding', **locals()) w = helper.create_parameter( attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False) - tmp = helper.create_tmp_variable(dtype) + tmp = helper.create_variable_for_type_inference(dtype) padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else ( size[0] + padding_idx) helper.append_op( @@ -351,7 +356,6 @@ def dynamic_lstm(input, c_0(Variable): The initial cell state is an optional input, default is zero. This is a tensor with shape (N x D), where N is the batch size. `h_0` and `c_0` can be NULL but only at the same time. - param_attr(ParamAttr|None): The parameter attribute for the learnable hidden-hidden weights. @@ -359,6 +363,11 @@ def dynamic_lstm(input, W_{fh}, W_{oh}`} - The shape is (D x 4D), where D is the hidden size. + + If it is set to None or one attribute of ParamAttr, + dynamic_lstm will create ParamAttr as param_attr. + If the Initializer of the param_attr is not set, the + parameter is initialized with Xavier. Default: None. bias_attr (ParamAttr|None): The bias attribute for the learnable bias weights, which contains two parts, input-hidden bias weights and peephole connections weights if @@ -371,6 +380,11 @@ def dynamic_lstm(input, - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ W_{fc}, W_{oc}`}. - The shape is (1 x 7D). + + If it is set to None or one attribute of ParamAttr, + dynamic_lstm will create ParamAttr as bias_attr. + If the Initializer of the bias_attr is not set, + the bias is initialized zero. Default: None. 
use_peepholes (bool): ${use_peepholes_comment} is_reverse (bool): ${is_reverse_comment} gate_activation (str): ${gate_activation_comment} @@ -389,11 +403,11 @@ def dynamic_lstm(input, hidden_dim = 512 forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, - act=None, bias_attr=None) + bias_attr=False) forward, _ = fluid.layers.dynamic_lstm( input=forward_proj, size=hidden_dim * 4, use_peepholes=False) """ - + assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp." helper = LayerHelper('lstm', **locals()) size = size // 4 weight = helper.create_parameter( @@ -404,10 +418,10 @@ def dynamic_lstm(input, bias = helper.create_parameter( attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) - hidden = helper.create_tmp_variable(dtype) - cell = helper.create_tmp_variable(dtype) - batch_gate = helper.create_tmp_variable(dtype) - batch_cell_pre_act = helper.create_tmp_variable(dtype) + hidden = helper.create_variable_for_type_inference(dtype) + cell = helper.create_variable_for_type_inference(dtype) + batch_gate = helper.create_variable_for_type_inference(dtype) + batch_cell_pre_act = helper.create_variable_for_type_inference(dtype) inputs = {'Input': input, 'Weight': weight, 'Bias': bias} batch_size = input.shape[0] if h_0: @@ -528,6 +542,11 @@ def dynamic_lstmp(input, size. - Projection weight = {:math:`W_{rh}`}. - The shape of projection weight is (D x P). + + If it is set to None or one attribute of ParamAttr, + dynamic_lstmp will create ParamAttr as param_attr. + If the Initializer of the param_attr is not set, the + parameter is initialized with Xavier. Default: None. bias_attr(ParamAttr|None): The bias attribute for the learnable bias weights, which contains two parts, input-hidden bias weights and peephole connections weights if @@ -540,6 +559,11 @@ def dynamic_lstmp(input, - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ W_{fc}, W_{oc}`}. - The shape is (1 x 7D). 
+ + If it is set to None or one attribute of ParamAttr, + dynamic_lstmp will create ParamAttr as bias_attr. + If the Initializer of the bias_attr is not set, + the bias is initialized zero. Default: None. use_peepholes(bool): Whether to enable diagonal/peephole connections, default `True`. is_reverse(bool): Whether to compute reversed LSTM, default `False`. @@ -584,6 +608,7 @@ def dynamic_lstmp(input, proj_activation="tanh") """ + assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp." helper = LayerHelper('lstmp', **locals()) size = size // 4 weight = helper.create_parameter( @@ -596,12 +621,12 @@ def dynamic_lstmp(input, bias = helper.create_parameter( attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) - projection = helper.create_tmp_variable(dtype) - cell = helper.create_tmp_variable(dtype) - ordered_proj0 = helper.create_tmp_variable(dtype) - batch_hidden = helper.create_tmp_variable(dtype) - batch_gate = helper.create_tmp_variable(dtype) - batch_cell_pre_act = helper.create_tmp_variable(dtype) + projection = helper.create_variable_for_type_inference(dtype) + cell = helper.create_variable_for_type_inference(dtype) + ordered_proj0 = helper.create_variable_for_type_inference(dtype) + batch_hidden = helper.create_variable_for_type_inference(dtype) + batch_gate = helper.create_variable_for_type_inference(dtype) + batch_cell_pre_act = helper.create_variable_for_type_inference(dtype) helper.append_op( type='lstmp', @@ -726,10 +751,10 @@ def dynamic_gru(input, ), 'The shape of h0 should be(batch_size, %d)' % size inputs['H0'] = h_0 - hidden = helper.create_tmp_variable(dtype) - batch_gate = helper.create_tmp_variable(dtype) - batch_reset_hidden_prev = helper.create_tmp_variable(dtype) - batch_hidden = helper.create_tmp_variable(dtype) + hidden = helper.create_variable_for_type_inference(dtype) + batch_gate = helper.create_variable_for_type_inference(dtype) + batch_reset_hidden_prev = 
helper.create_variable_for_type_inference(dtype) + batch_hidden = helper.create_variable_for_type_inference(dtype) helper.append_op( type='gru', @@ -819,9 +844,9 @@ def gru_unit(input, weight = helper.create_parameter( attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype) - gate = helper.create_tmp_variable(dtype) - reset_hidden_pre = helper.create_tmp_variable(dtype) - updated_hidden = helper.create_tmp_variable(dtype) + gate = helper.create_variable_for_type_inference(dtype) + reset_hidden_pre = helper.create_variable_for_type_inference(dtype) + updated_hidden = helper.create_variable_for_type_inference(dtype) inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': weight} # create bias if helper.bias_attr: @@ -871,10 +896,14 @@ def linear_chain_crf(input, label, param_attr=None): attr=helper.param_attr, shape=[size + 2, size], dtype=helper.input_dtype()) - alpha = helper.create_tmp_variable(dtype=helper.input_dtype()) - emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype()) - transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype()) - log_likelihood = helper.create_tmp_variable(dtype=helper.input_dtype()) + alpha = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + emission_exps = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + transition_exps = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + log_likelihood = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) helper.append_op( type='linear_chain_crf', inputs={"Emission": [input], @@ -913,7 +942,8 @@ def crf_decoding(input, param_attr, label=None): """ helper = LayerHelper('crf_decoding', **locals()) transition = helper.get_parameter(param_attr.name) - viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype()) + viterbi_path = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) helper.append_op( type='crf_decoding', inputs={"Emission": [input], 
@@ -937,9 +967,9 @@ def cos_sim(X, Y): Variable: the output of cosine(X, Y). """ helper = LayerHelper('cos_sim', **locals()) - out = helper.create_tmp_variable(dtype=X.dtype) - xnorm = helper.create_tmp_variable(dtype=X.dtype) - ynorm = helper.create_tmp_variable(dtype=X.dtype) + out = helper.create_variable_for_type_inference(dtype=X.dtype) + xnorm = helper.create_variable_for_type_inference(dtype=X.dtype) + ynorm = helper.create_variable_for_type_inference(dtype=X.dtype) helper.append_op( type='cos_sim', inputs={'X': [X], @@ -983,8 +1013,9 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None): """ helper = LayerHelper('dropout', **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) - mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + mask = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) if (seed is None or seed == 0) and helper.main_program.random_seed != 0: seed = helper.main_program.random_seed @@ -1069,7 +1100,7 @@ def cross_entropy(input, label, soft_label=False, ignore_index=-100): cost = fluid.layers.cross_entropy(input=predict, label=label) """ helper = LayerHelper('cross_entropy', **locals()) - out = helper.create_tmp_variable(dtype=input.dtype) + out = helper.create_variable_for_type_inference(dtype=input.dtype) helper.append_op( type='cross_entropy', inputs={'X': [input], @@ -1116,14 +1147,14 @@ def square_error_cost(input, label): """ helper = LayerHelper('square_error_cost', **locals()) - minus_out = helper.create_tmp_variable(dtype=input.dtype) + minus_out = helper.create_variable_for_type_inference(dtype=input.dtype) helper.append_op( type='elementwise_sub', inputs={'X': [input], 'Y': [label]}, outputs={'Out': [minus_out]}) - square_out = helper.create_tmp_variable(dtype=input.dtype) + square_out = helper.create_variable_for_type_inference(dtype=input.dtype) helper.append_op( type='square', inputs={'X': 
[minus_out]}, outputs={'Out': [square_out]}) @@ -1229,12 +1260,13 @@ def chunk_eval(input, helper = LayerHelper("chunk_eval", **locals()) # prepare output - precision = helper.create_tmp_variable(dtype="float32") - recall = helper.create_tmp_variable(dtype="float32") - f1_score = helper.create_tmp_variable(dtype="float32") - num_infer_chunks = helper.create_tmp_variable(dtype="int64") - num_label_chunks = helper.create_tmp_variable(dtype="int64") - num_correct_chunks = helper.create_tmp_variable(dtype="int64") + precision = helper.create_variable_for_type_inference(dtype="float32") + recall = helper.create_variable_for_type_inference(dtype="float32") + f1_score = helper.create_variable_for_type_inference(dtype="float32") + num_infer_chunks = helper.create_variable_for_type_inference(dtype="int64") + num_label_chunks = helper.create_variable_for_type_inference(dtype="int64") + num_correct_chunks = helper.create_variable_for_type_inference( + dtype="int64") helper.append_op( type="chunk_eval", @@ -1265,7 +1297,8 @@ def sequence_conv(input, padding=None, bias_attr=None, param_attr=None, - act=None): + act=None, + name=None): """ This function creates the op for sequence_conv, using the inputs and other convolutional configurations for the filters and stride as given @@ -1277,9 +1310,19 @@ def sequence_conv(input, filter_size (int): the filter size (H and W). filter_stride (int): stride of the filter. padding (bool): if True, add paddings. - bias_attr (ParamAttr|None): attributes for bias - param_attr (ParamAttr|None): attributes for parameter - act (str): the activation type + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, sequence_conv + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. 
+ param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights + of sequence_conv. If it is set to None or one attribute of ParamAttr, sequence_conv + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + act (str): Activation type, if it is set to None, activation is not appended. + Default: None. + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Default: None. Returns: Variable: output of sequence_conv @@ -1290,7 +1333,7 @@ def sequence_conv(input, filter_shape = [filter_size * input.shape[1], num_filters] filter_param = helper.create_parameter( attr=helper.param_attr, shape=filter_shape, dtype=dtype) - pre_bias = helper.create_tmp_variable(dtype) + pre_bias = helper.create_variable_for_type_inference(dtype) helper.append_op( type='sequence_conv', @@ -1308,7 +1351,7 @@ def sequence_conv(input, return helper.append_activation(pre_act) -def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=False): +def sequence_softmax(input, use_cudnn=False, name=None): """ This function computes the softmax activation among all time-steps for each sequence. The dimension of each time-step should be 1. Thus, the shape of @@ -1328,10 +1371,10 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=False): Args: input (Variable): The input variable which is a LoDTensor. - bias_attr (ParamAttr|None): attributes for bias - param_attr (ParamAttr|None): attributes for parameter use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \ - library is installed. Default: False + library is installed. Default: False. + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Default: None. 
Returns: Variable: output of sequence_softmax @@ -1346,7 +1389,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=False): """ helper = LayerHelper('sequence_softmax', **locals()) dtype = helper.input_dtype() - softmax_out = helper.create_tmp_variable(dtype) + softmax_out = helper.create_variable_for_type_inference(dtype) helper.append_op( type="sequence_softmax", inputs={"X": input}, @@ -1355,7 +1398,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=False): return softmax_out -def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None): +def softmax(input, use_cudnn=True, name=None): """ The input of the softmax operator is a tensor of any rank. The output tensor has the same shape as the input. @@ -1382,10 +1425,10 @@ def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None): Args: input (Variable): The input variable. - bias_attr (ParamAttr): attributes for bias - param_attr (ParamAttr): attributes for parameter use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \ - library is installed. + library is installed. + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Default: None. Returns: Variable: output of softmax @@ -1400,7 +1443,7 @@ def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None): """ helper = LayerHelper('softmax', **locals()) dtype = helper.input_dtype() - softmax_out = helper.create_tmp_variable(dtype) + softmax_out = helper.create_variable_for_type_inference(dtype) helper.append_op( type="softmax", inputs={"X": input}, @@ -1491,14 +1534,23 @@ def conv2d(input, convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only - connected to the second half of the input channels. 
Default: groups=1 - param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None - bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None + connected to the second half of the input channels. Default: groups=1. + param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights + of conv2d. If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with :math:`Normal(0.0, std)`, + and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn library is installed. Default: True - act (str): Activation type. Default: None + act (str): Activation type, if it is set to None, activation is not appended. + Default: None name (str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + will be named automatically. Default: None Returns: Variable: The tensor variable storing the convolution and \ @@ -1516,7 +1568,7 @@ def conv2d(input, """ num_channels = input.shape[1] - + assert param_attr is not False, "param_attr should not be False here." 
l_type = 'conv2d' if (num_channels == groups and num_filters % num_channels == 0 and not use_cudnn): @@ -1544,7 +1596,8 @@ def conv2d(input, filter_shape = [num_filters, int(num_filter_channels)] + filter_size def _get_default_param_initializer(): - std = (2.0 / (filter_size[0]**2 * num_channels))**0.5 + filter_elem_num = filter_size[0] * filter_size[1] * num_channels + std = (2.0 / filter_elem_num)**0.5 return Normal(0.0, std, 0) filter_param = helper.create_parameter( @@ -1553,7 +1606,7 @@ def conv2d(input, dtype=dtype, default_initializer=_get_default_param_initializer()) - pre_bias = helper.create_tmp_variable(dtype) + pre_bias = helper.create_variable_for_type_inference(dtype) helper.append_op( type=l_type, @@ -1655,13 +1708,22 @@ def conv3d(input, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: groups=1 - param_attr (ParamAttr): The parameters to the Conv3d Layer. Default: None - bias_attr (ParamAttr): Bias parameter for the Conv3d layer. Default: None + param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights + of conv3d. If it is set to None or one attribute of ParamAttr, conv3d + will create ParamAttr as param_attr. If it is set to None, the parameter + is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is + :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv3d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn library is installed. Default: True - act (str): Activation type. 
Default: None + act (str): Activation type, if it is set to None, activation is not appended. + Default: None. name (str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + will be named automatically. Default: None. Returns: Variable: The tensor variable storing the convolution and \ @@ -1679,7 +1741,7 @@ def conv3d(input, """ l_type = 'conv3d' - + assert param_attr is not False, "param_attr should not be False here." helper = LayerHelper(l_type, **locals()) dtype = helper.input_dtype() @@ -1704,7 +1766,9 @@ def conv3d(input, filter_shape = [num_filters, num_filter_channels] + filter_size def _get_default_param_initializer(): - std = (2.0 / (filter_size[0]**3 * num_channels))**0.5 + filter_elem_num = filter_size[0] * filter_size[1] * filter_size[ + 2] * num_channels + std = (2.0 / filter_elem_num)**0.5 return Normal(0.0, std, 0) filter_param = helper.create_parameter( @@ -1713,7 +1777,7 @@ def conv3d(input, dtype=dtype, default_initializer=_get_default_param_initializer()) - pre_bias = helper.create_tmp_variable(dtype) + pre_bias = helper.create_variable_for_type_inference(dtype) helper.append_op( type=l_type, @@ -1792,8 +1856,8 @@ def sequence_pool(input, pool_type): """ helper = LayerHelper('sequence_pool', **locals()) dtype = helper.input_dtype() - pool_out = helper.create_tmp_variable(dtype) - max_index = helper.create_tmp_variable(dtype) + pool_out = helper.create_variable_for_type_inference(dtype) + max_index = helper.create_variable_for_type_inference(dtype) helper.append_op( type="sequence_pool", @@ -1829,7 +1893,7 @@ def sequence_concat(input, name=None): out = fluid.layers.sequence_concat(input=[seq1, seq2, seq3]) """ helper = LayerHelper('sequence_concat', **locals()) - out = helper.create_tmp_variable(dtype=helper.input_dtype()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) helper.append_op( type='sequence_concat', inputs={'X': input}, outputs={'Out': [out]}) return out @@ 
-1901,6 +1965,76 @@ def sequence_last_step(input): return sequence_pool(input=input, pool_type="last") +def sequence_slice(input, offset, length, name=None): + """ + **Sequence Slice Layer** + + The layer crops a subsequence from given sequence with given start + offset and subsequence length. + + It only supports sequence data (LoDTensor with lod_level equal to 1). + + .. code-block:: text + + - Case: + + Given the input Variable **input**: + + input.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]], + input.lod = [[3, 2]], + input.dims = (5, 2), + + with offset.data = [[0], [1]] and length.data = [[2], [1]], + + the output Variable will be + + out.data = [[a1, a2], [b1, b2], [e1, e2]], + out.lod = [[2, 1]], + out.dims = (3, 2). + + NOTE: The first dimension size of **input**, **offset** and **length** + should be equal. The **offset** should start from 0. + + Args: + input(Variable): The input Variable which consists of the complete + sequences. + offset(Variable): The offset to slice each sequence. + length(Variable): The length of each subsequence. + name(str|None): A name for this layer(optional). If set None, the + layer will be named automatically. + + Returns: + Variable: The output subsequences. + + Examples: + + .. 
code-block:: python + + import numpy as np + seqs = fluid.layers.data(name='x', shape=[10, 5], + dtype='float32', lod_level=1) + offset = fluid.layers.assign(input=np.array([[0, 1]]).astype("int32")) + length = fluid.layers.assign(input=np.array([[2, 1]]).astype("int32")) + subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset, + length=length) + """ + helper = LayerHelper("sequence_slice", **locals()) + dtype = helper.input_dtype() + out = helper.create_variable_for_type_inference(dtype) + + offset.stop_gradient = True + length.stop_gradient = True + + helper.append_op( + type="sequence_slice", + inputs={"X": input, + "Offset": offset, + "Length": length}, + outputs={"Out": out}) + + return out + + @templatedoc() def pool2d(input, pool_size=-1, @@ -1972,7 +2106,7 @@ def pool2d(input, helper = LayerHelper(l_type, **locals()) dtype = helper.input_dtype() - pool_out = helper.create_tmp_variable(dtype) + pool_out = helper.create_variable_for_type_inference(dtype) helper.append_op( type=l_type, @@ -2040,7 +2174,7 @@ def pool3d(input, l_type = "pool3d" helper = LayerHelper(l_type, **locals()) dtype = helper.input_dtype() - pool_out = helper.create_tmp_variable(dtype) + pool_out = helper.create_variable_for_type_inference(dtype) helper.append_op( type=l_type, @@ -2106,8 +2240,14 @@ def batch_norm(input, is_test(bool, Default False): Used for training or training. momentum(float, Default 0.9): epsilon(float, Default 1e-05): - param_attr(ParamAttr): The parameter attribute for Parameter `scale`. - bias_attr(ParamAttr): The parameter attribute for Parameter `bias`. + param_attr(ParamAttr|None): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr|None): The parameter attribute for the bias of batch_norm. 
+ If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. data_layout(string, default NCHW): NCHW|NHWC in_place(bool, Default False): Make the input and output of batch norm reuse memory. name(string, Default None): A name for this layer(optional). If set None, the layer @@ -2127,6 +2267,7 @@ def batch_norm(input, hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w') hidden2 = fluid.layers.batch_norm(input=hidden1) """ + assert bias_attr is not False, "bias_attr should not be False in batch_norm." helper = LayerHelper('batch_norm', **locals()) dtype = helper.input_dtype() @@ -2176,10 +2317,13 @@ def batch_norm(input, mean_out = mean # variance and variance out share the same memory variance_out = variance - saved_mean = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) - saved_variance = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) + saved_mean = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + saved_variance = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) - batch_norm_out = input if in_place else helper.create_tmp_variable(dtype) + batch_norm_out = input if in_place else helper.create_variable_for_type_inference( + dtype) helper.append_op( type="batch_norm", @@ -2243,19 +2387,28 @@ def layer_norm(input, Args: input(Variable): The input tensor variable. scale(bool): Whether to learn the adaptive gain :math:`g` after - normalization. + normalization. Default True. shift(bool): Whether to learn the adaptive bias :math:`b` after - normalization. - begin_norm_axis(bool): The normalization will be performed along + normalization. Default True. + begin_norm_axis(int): The normalization will be performed along dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`. + Default 1. 
epsilon(float): The small value added to the variance to prevent - division by zero. + division by zero. Default 1e-05. param_attr(ParamAttr|None): The parameter attribute for the learnable - gain :math:`g`. + gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is + omitted. If :attr:`scale` is True and :attr:`param_attr` is None, + a default :code:`ParamAttr` would be added as scale. The + :attr:`param_attr` is initialized as 1 if it is added. Default None. bias_attr(ParamAttr|None): The parameter attribute for the learnable - bias :math:`b`. + bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is + omitted. If :attr:`shift` is True and :attr:`bias_attr` is None, + a default :code:`ParamAttr` would be added as bias. The + :attr:`bias_attr` is initialized as 0 if it is added. Default None. act(str): Activation to be applied to the output of layer normalizaiton. - name (str): The name of this layer. It is optional. + Default None. + name(str): The name of this layer. It is optional. Default None, and a + unique name would be generated automatically. Returns: ${y_comment} @@ -2287,9 +2440,11 @@ def layer_norm(input, inputs['Bias'] = bias # create output - mean_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) - variance_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) - layer_norm_out = helper.create_tmp_variable(dtype) + mean_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + variance_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + layer_norm_out = helper.create_variable_for_type_inference(dtype) helper.append_op( type="layer_norm", @@ -2396,15 +2551,22 @@ def conv2d_transpose(input, when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels.
- Default: groups=1 - param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer. - Default: None - bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None + Default: groups = 1. + param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights + of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d_transpose. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv2d_transpose + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: True - act(str): Activation type. Default: None + library is installed. Default: True. + act (str): Activation type, if it is set to None, activation is not appended. + Default: None. name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + will be named automatically. Default: None. Returns: Variable: The tensor variable storing the convolution transpose result. @@ -2419,7 +2581,7 @@ def conv2d_transpose(input, data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3) """ - + assert param_attr is not False, "param_attr should not be False in conv2d_transpose."
input_channel = input.shape[1] op_type = 'conv2d_transpose' @@ -2455,6 +2617,7 @@ def conv2d_transpose(input, else: filter_size = utils.convert_to_list(filter_size, 2, 'conv2d_transpose.filter_size') + if output_size is None: output_size = [] elif isinstance(output_size, list) or isinstance(output_size, int): @@ -2464,10 +2627,11 @@ def conv2d_transpose(input, padding = utils.convert_to_list(padding, 2, 'padding') groups = 1 if groups is None else groups filter_shape = [input_channel, num_filters // groups] + filter_size + img_filter = helper.create_parameter( dtype=input.dtype, shape=filter_shape, attr=helper.param_attr) - pre_bias = helper.create_tmp_variable(dtype=input.dtype) + pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype) helper.append_op( type=op_type, inputs={'Input': [input], @@ -2576,12 +2740,19 @@ def conv3d_transpose(input, first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: groups=1 - param_attr(ParamAttr): The parameters to the Conv3d_transpose Layer. - Default: None - bias_attr(ParamAttr): Bias parameter for the Conv3d layer. Default: None + param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights + of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d_transpose. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv3d_transpose + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn library is installed. 
Default: True - act(str): Activation type. Default: None + act (str): Activation type, if it is set to None, activation is not appended. + Default: None. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -2598,6 +2769,7 @@ def conv3d_transpose(input, data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32') conv3d_transpose = fluid.layers.conv3d_transpose(input=data, num_filters=2, filter_size=3) """ + assert param_attr is not False, "param_attr should not be False in conv3d_transpose." l_type = "conv3d_transpose" helper = LayerHelper(l_type, **locals()) if not isinstance(input, Variable): @@ -2637,7 +2809,7 @@ def conv3d_transpose(input, img_filter = helper.create_parameter( dtype=input.dtype, shape=filter_shape, attr=helper.param_attr) - pre_bias = helper.create_tmp_variable(dtype=input.dtype) + pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype) helper.append_op( type=l_type, inputs={'Input': [input], @@ -2716,7 +2888,7 @@ def sequence_expand(x, y, ref_level=-1, name=None): """ helper = LayerHelper('sequence_expand', input=x, **locals()) dtype = helper.input_dtype() - tmp = helper.create_tmp_variable(dtype) + tmp = helper.create_variable_for_type_inference(dtype) helper.append_op( type='sequence_expand', inputs={'X': x, @@ -2782,7 +2954,7 @@ def sequence_expand_as(x, y, name=None): """ helper = LayerHelper('sequence_expand_as', input=x, **locals()) dtype = helper.input_dtype() - tmp = helper.create_tmp_variable(dtype) + tmp = helper.create_variable_for_type_inference(dtype) helper.append_op( type='sequence_expand_as', inputs={'X': x, @@ -2792,7 +2964,7 @@ def sequence_expand_as(x, y, name=None): @templatedoc() -def sequence_pad(x, pad_value, maxlen=None): +def sequence_pad(x, pad_value, maxlen=None, name=None): """ ${comment} @@ -2806,7 +2978,9 @@ def sequence_pad(x, pad_value, maxlen=None): None or any positive int. 
When it is None, all sequences will be padded up to the length of the longest one among them; when it a certain positive value, it must be greater than the length of the - longest original sequence." + longest original sequence. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Returns: Variable: The padded sequence batch and the original lengths before @@ -2825,8 +2999,8 @@ def sequence_pad(x, pad_value, maxlen=None): helper = LayerHelper('sequence_pad', input=x, **locals()) dtype = helper.input_dtype() - out = helper.create_tmp_variable(dtype) - length = helper.create_tmp_variable(dtype) + out = helper.create_variable_for_type_inference(dtype) + length = helper.create_variable_for_type_inference(dtype) pad_value.stop_gradient = True length.stop_gradient = True @@ -2843,6 +3017,66 @@ def sequence_pad(x, pad_value, maxlen=None): return out, length +def sequence_unpad(x, length, name=None): + """ + **Sequence Unpad Layer** + + This layer removes the padding data in the input sequences and converts + them into sequences with actual length as output, identified by lod + information. + + .. code-block:: text + + Example: + + Given input Variable **x**: + x.data = [[ 1.0, 2.0, 3.0, 4.0, 5.0], + [ 6.0, 7.0, 8.0, 9.0, 10.0], + [11.0, 12.0, 13.0, 14.0, 15.0]], + + in which there are 3 sequences padded to length 5, and the actual length + specified by input Variable **length**: + + length.data = [[2], [3], [4]], + + after unpadding, the output Variable will be: + + out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]] + out.lod = [[2, 3, 4]] + + Args: + x(Variable): Input Variable which contains the padded sequences with + equal length. + length(Variable): The Variable that specifies the actual length of + sequences after unpadding. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The Variable contains the unpadded sequences.
+ + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[10, 5], dtype='float32') + len = fluid.layers.data(name='length', shape=[1], dtype='int64') + out = fluid.layers.sequence_unpad(x=x, length=len) + """ + + helper = LayerHelper('sequence_unpad', input=x, **locals()) + dtype = helper.input_dtype() + out = helper.create_variable_for_type_inference(dtype) + + length.stop_gradient = True + + helper.append_op( + type='sequence_unpad', + inputs={'X': x, + 'Length': length}, + outputs={'Out': out}) + return out + + def beam_search(pre_ids, pre_scores, ids, @@ -2930,8 +3164,9 @@ def beam_search(pre_ids, score_type = scores.dtype id_type = ids.dtype - selected_scores = helper.create_tmp_variable(dtype=score_type) - selected_ids = helper.create_tmp_variable(dtype=id_type) + selected_scores = helper.create_variable_for_type_inference( + dtype=score_type) + selected_ids = helper.create_variable_for_type_inference(dtype=id_type) helper.append_op( type='beam_search', @@ -2988,8 +3223,8 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None): ids, scores, beam_size=5, end_id=0) """ helper = LayerHelper('beam_search_decode', **locals()) - sentence_ids = helper.create_tmp_variable(dtype=ids.dtype) - sentence_scores = helper.create_tmp_variable(dtype=ids.dtype) + sentence_ids = helper.create_variable_for_type_inference(dtype=ids.dtype) + sentence_scores = helper.create_variable_for_type_inference(dtype=ids.dtype) helper.append_op( type="beam_search_decode", @@ -3054,10 +3289,18 @@ def lstm_unit(x_t, cell_t_prev (Variable): The cell value of lstm unit, a 2-D tensor with shape M x S, M for batch size and S for size of lstm unit. forget_bias (float): The forget bias of lstm unit. - param_attr (ParamAttr): The attributes of parameter weights, used to set - initializer, name etc. - bias_attr (ParamAttr): The attributes of bias weights, if not False, - bias weights will be created and be set to default value. 
+ param_attr(ParamAttr|None): The parameter attribute for the learnable + hidden-hidden weights. + If it is set to None or one attribute of ParamAttr, + lstm_unit will create ParamAttr as param_attr. + If the Initializer of the param_attr is not set, the + parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|None): The bias attribute for the learnable bias + weights. If it is set to False, no bias will be added + to the output units. If it is set to None or one attribute of ParamAttr, + lstm_unit will create ParamAttr as bias_attr. + If the Initializer of the bias_attr is not set, + the bias is initialized zero. Default: None. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -3111,8 +3354,8 @@ def lstm_unit(x_t, param_attr=param_attr, bias_attr=bias_attr) dtype = x_t.dtype - c = helper.create_tmp_variable(dtype) - h = helper.create_tmp_variable(dtype) + c = helper.create_variable_for_type_inference(dtype) + h = helper.create_variable_for_type_inference(dtype) helper.append_op( type='lstm_unit', @@ -3166,7 +3409,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None): """ helper = LayerHelper('reduce_sum', **locals()) - out = helper.create_tmp_variable(dtype=helper.input_dtype()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) if dim is not None and not isinstance(dim, list): dim = [dim] helper.append_op( @@ -3223,7 +3466,7 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None): fluid.layers.reduce_mean(x, dim=[0, 1]) # [4.0, 5.0] """ helper = LayerHelper('reduce_mean', **locals()) - out = helper.create_tmp_variable(dtype=helper.input_dtype()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) if dim is not None and not isinstance(dim, list): dim = [dim] helper.append_op( @@ -3278,7 +3521,7 @@ def reduce_max(input, dim=None, keep_dim=False, name=None): fluid.layers.reduce_max(x, dim=[0, 1]) # [7.0, 8.0] """ helper = 
LayerHelper('reduce_max', **locals()) - out = helper.create_tmp_variable(dtype=helper.input_dtype()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) if dim is not None and not isinstance(dim, list): dim = [dim] helper.append_op( @@ -3333,7 +3576,7 @@ def reduce_min(input, dim=None, keep_dim=False, name=None): fluid.layers.reduce_min(x, dim=[0, 1]) # [1.0, 2.0] """ helper = LayerHelper('reduce_min', **locals()) - out = helper.create_tmp_variable(dtype=helper.input_dtype()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) if dim is not None and not isinstance(dim, list): dim = [dim] helper.append_op( @@ -3389,7 +3632,7 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): fluid.layers.reduce_prod(x, dim=[0, 1]) # [105.0, 384.0] """ helper = LayerHelper('reduce_prod', **locals()) - out = helper.create_tmp_variable(dtype=helper.input_dtype()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) if dim is not None and not isinstance(dim, list): dim = [dim] helper.append_op( @@ -3449,7 +3692,7 @@ def split(input, num_or_sections, dim=-1, name=None): dim], 'len(num_or_sections) must not be more than input.shape[dim].' 
num = len(num_or_sections) outs = [ - helper.create_tmp_variable(dtype=helper.input_dtype()) + helper.create_variable_for_type_inference(dtype=helper.input_dtype()) for i in range(num) ] helper.append_op( @@ -3506,8 +3749,8 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): axis = 0 helper = LayerHelper("l2_normalize", **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) - norm = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + norm = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type="norm", inputs={"X": x}, @@ -3616,7 +3859,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): __check_input(x, y) helper = LayerHelper('matmul', **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type='matmul', inputs={'X': x, @@ -3687,8 +3930,8 @@ def topk(input, k, name=None): top5_values, top5_indices = layers.topk(input, k=5) """ helper = LayerHelper("top_k", **locals()) - values = helper.create_tmp_variable(dtype=input.dtype) - indices = helper.create_tmp_variable(dtype="int64") + values = helper.create_variable_for_type_inference(dtype=input.dtype) + indices = helper.create_variable_for_type_inference(dtype="int64") helper.append_op( type="top_k", inputs={"X": [input]}, @@ -3746,8 +3989,8 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None): # remove some tokens from input and labels if ignored_tokens is not None and len(ignored_tokens) > 0: - erased_input = helper.create_tmp_variable(dtype="int64") - erased_label = helper.create_tmp_variable(dtype="int64") + erased_input = helper.create_variable_for_type_inference(dtype="int64") + erased_label = helper.create_variable_for_type_inference(dtype="int64") helper.append_op( type="sequence_erase", @@ -3764,8 +4007,8 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None): 
label = erased_label # edit distance op - edit_distance_out = helper.create_tmp_variable(dtype="int64") - sequence_num = helper.create_tmp_variable(dtype="int64") + edit_distance_out = helper.create_variable_for_type_inference(dtype="int64") + sequence_num = helper.create_variable_for_type_inference(dtype="int64") helper.append_op( type="edit_distance", inputs={"Hyps": [input], @@ -3840,7 +4083,7 @@ def ctc_greedy_decoder(input, blank, name=None): _, topk_indices = topk(input, k=1) # ctc align op - ctc_out = helper.create_tmp_variable(dtype="int64") + ctc_out = helper.create_variable_for_type_inference(dtype="int64") helper.append_op( type="ctc_align", inputs={"Input": [topk_indices]}, @@ -3890,8 +4133,8 @@ def warpctc(input, label, blank=0, norm_by_times=False): """ helper = LayerHelper('warpctc', **locals()) - loss_out = helper.create_tmp_variable(dtype=input.dtype) - grad_out = helper.create_tmp_variable(dtype=input.dtype) + loss_out = helper.create_variable_for_type_inference(dtype=input.dtype) + grad_out = helper.create_variable_for_type_inference(dtype=input.dtype) helper.append_op( type='warpctc', inputs={'Logits': [input], @@ -3952,7 +4195,7 @@ def sequence_reshape(input, new_dim): x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=10) """ helper = LayerHelper('sequence_reshape', **locals()) - out = helper.create_tmp_variable(helper.input_dtype()) + out = helper.create_variable_for_type_inference(helper.input_dtype()) helper.append_op( type='sequence_reshape', inputs={'X': [input]}, @@ -3971,7 +4214,8 @@ def nce(input, sample_weight=None, param_attr=None, bias_attr=None, - num_neg_samples=None): + num_neg_samples=None, + name=None): """ ${comment} @@ -3982,9 +4226,18 @@ def nce(input, sample_weight (Variable|None): A Variable of shape [batch_size, 1] storing a weight for each sample. The default weight for each sample is 1.0. 
- param_attr (ParamAttr|None): attributes for parameter - bias_attr (ParamAttr|None): attributes for bias + param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights + of nce. If it is set to None or one attribute of ParamAttr, nce + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of nce. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, nce + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. num_neg_samples (int): ${num_neg_samples_comment} + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Default: None. Returns: Variable: The output nce loss. @@ -4017,22 +4270,31 @@ def nce(input, """ helper = LayerHelper('nce', **locals()) assert isinstance(input, Variable) - dim = input.shape[1] assert isinstance(label, Variable) + + dim = input.shape[1] num_true_class = label.shape[1] w = helper.create_parameter( attr=helper.param_attr, shape=[num_total_classes, dim], is_bias=False, dtype=input.dtype) - b = helper.create_parameter( - attr=helper.bias_attr, - shape=[num_total_classes, 1], - is_bias=True, - dtype=input.dtype) - cost = helper.create_tmp_variable(dtype=input.dtype) - sample_logits = helper.create_tmp_variable(dtype=input.dtype) - sample_labels = helper.create_tmp_variable(dtype=label.dtype) + inputs = { + 'Input': input, + 'Label': label, + 'Weight': w, + 'SampleWeight': sample_weight if sample_weight is not None else [] + } + if helper.bias_attr: + b = helper.create_parameter( + attr=helper.bias_attr, + shape=[num_total_classes, 1], + is_bias=True, + dtype=input.dtype) + inputs['Bias'] = b + cost = helper.create_variable_for_type_inference(dtype=input.dtype) + 
sample_logits = helper.create_variable_for_type_inference(dtype=input.dtype) + sample_labels = helper.create_variable_for_type_inference(dtype=label.dtype) if num_neg_samples is None: num_neg_samples = 10 @@ -4046,13 +4308,7 @@ def nce(input, helper.append_op( type='nce', - inputs={ - 'Input': input, - 'Label': label, - 'Weight': w, - 'Bias': b, - 'SampleWeight': sample_weight if sample_weight is not None else [] - }, + inputs=inputs, outputs={ 'Cost': cost, 'SampleLogits': sample_logits, @@ -4062,7 +4318,12 @@ def nce(input, return cost / (num_neg_samples + 1) -def hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None): +def hsigmoid(input, + label, + num_classes, + param_attr=None, + bias_attr=None, + name=None): """ The hierarchical sigmoid operator is used to accelerate the training process of language model. This operator organizes the classes into a @@ -4083,11 +4344,17 @@ def hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None): label (Variable): The tensor variable contains labels of training data. It's a tensor with shape is :math:`[N \\times 1]`. num_classes: (int), The number of classes, must not be less than 2. - param_attr (ParamAttr|list of ParamAttr, default None): The parameter - attribute for learnable parameters/weights of this layer. - bias_attr (ParamAttr|list of ParamAttr, default None): The parameter - attribute for the bias of this layer. If it is set to False, no - bias will be applied. + param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights + of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of hsigmoid. + If it is set to False, no bias will be added to the output units. 
+ If it is set to None or one attribute of ParamAttr, hsigmoid + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Default: None. Returns: Out: (Tensor) The cost of hierarchical sigmoid operator. the shape is [N, 1] @@ -4103,8 +4370,8 @@ def hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None): helper = LayerHelper('hierarchical_sigmoid', **locals()) dtype = helper.input_dtype() - out = helper.create_tmp_variable(dtype) - pre_out = helper.create_tmp_variable(dtype) + out = helper.create_variable_for_type_inference(dtype) + pre_out = helper.create_variable_for_type_inference(dtype) dim = input.shape[1] if num_classes < 2: raise ValueError("num_classes must not be less than 2.") @@ -4164,8 +4431,8 @@ def transpose(x, perm, name=None): (idx, perm[idx], len(x.shape))) helper = LayerHelper('transpose', **locals()) - out = helper.create_tmp_variable(x.dtype) - x_shape = helper.create_tmp_variable(x.dtype) + out = helper.create_variable_for_type_inference(x.dtype) + x_shape = helper.create_variable_for_type_inference(x.dtype) helper.append_op( type='transpose2', inputs={'X': [x]}, @@ -4307,7 +4574,7 @@ def im2sequence(input, inputs["Y"] = input_image_size attrs["out_stride"] = out_stride helper = LayerHelper('im2sequence', **locals()) - out = helper.create_tmp_variable(dtype=helper.input_dtype()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) helper.append_op( type='im2sequence', inputs=inputs, outputs={'Out': out}, attrs=attrs) return out @@ -4340,7 +4607,7 @@ def row_conv(input, future_context_size, param_attr=None, act=None): filter_shape = [future_context_size + 1, input.shape[1]] filter_param = helper.create_parameter( attr=helper.param_attr, shape=filter_shape, dtype=dtype) - out = helper.create_tmp_variable(dtype) + out = 
helper.create_variable_for_type_inference(dtype) helper.append_op( type='row_conv', inputs={'X': [input], @@ -4373,7 +4640,7 @@ def multiplex(inputs, index): raise ValueError("inputs should be a list object and contains at least " "2 elements.") - out = helper.create_tmp_variable(inputs[0].dtype) + out = helper.create_variable_for_type_inference(inputs[0].dtype) helper.append_op( type='multiplex', inputs={'X': inputs, @@ -4444,8 +4711,8 @@ def softmax_with_cross_entropy(logits, logits=fc, label=label) """ helper = LayerHelper('softmax_with_cross_entropy', **locals()) - softmax = helper.create_tmp_variable(dtype=logits.dtype) - loss = helper.create_tmp_variable(dtype=logits.dtype) + softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) + loss = helper.create_variable_for_type_inference(dtype=logits.dtype) helper.append_op( type='softmax_with_cross_entropy', inputs={'Logits': logits, @@ -4495,8 +4762,8 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): """ helper = LayerHelper('smooth_l1_loss', **locals()) - diff = helper.create_tmp_variable(dtype=x.dtype) - loss = helper.create_tmp_variable(dtype=x.dtype) + diff = helper.create_variable_for_type_inference(dtype=x.dtype) + loss = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type='smooth_l1_loss', inputs={ @@ -4529,7 +4796,7 @@ def one_hot(input, depth): one_hot_label = layers.one_hot(input=label, depth=10) """ helper = LayerHelper("one_hot", **locals()) - one_hot_out = helper.create_tmp_variable(dtype='float32') + one_hot_out = helper.create_variable_for_type_inference(dtype='float32') helper.append_op( type="one_hot", inputs={'X': input}, @@ -4671,8 +4938,8 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): "except one unknown dimension.") helper = LayerHelper("reshape2", **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) - x_shape = helper.create_tmp_variable(dtype=x.dtype) + out = 
helper.create_variable_for_type_inference(dtype=x.dtype) + x_shape = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type="reshape2", inputs=inputs, @@ -4721,8 +4988,8 @@ def squeeze(input, axes, name=None): y = layers.sequeeze(input=x, axes=[1]) """ helper = LayerHelper("squeeze", **locals()) - out = helper.create_tmp_variable(dtype=input.dtype) - x_shape = helper.create_tmp_variable(dtype=input.dtype) + out = helper.create_variable_for_type_inference(dtype=input.dtype) + x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) helper.append_op( type="squeeze2", inputs={"X": input}, @@ -4758,8 +5025,8 @@ def unsqueeze(input, axes, name=None): y = layers.unsequeeze(input=x, axes=[1]) """ helper = LayerHelper("unsqueeze", **locals()) - out = helper.create_tmp_variable(dtype=input.dtype) - x_shape = helper.create_tmp_variable(dtype=input.dtype) + out = helper.create_variable_for_type_inference(dtype=input.dtype) + x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) helper.append_op( type="unsqueeze2", inputs={"X": input}, @@ -4849,7 +5116,7 @@ def lod_reset(x, y=None, target_lod=None): out = layers.lod_reset(x=x, y=y) """ helper = LayerHelper("lod_reset", **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) if y is not None: helper.append_op( type="lod_reset", inputs={'X': x, @@ -4918,8 +5185,9 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None): "dims of input must be 4(not %d), and it's order must be NCHW" % (dims)) - mid_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) - lrn_out = helper.create_tmp_variable(dtype) + mid_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + lrn_out = helper.create_variable_for_type_inference(dtype) helper.append_op( type="lrn", inputs={"X": input}, @@ -4984,7 +5252,7 @@ def pad(x, paddings, pad_value=0., name=None): """ helper = 
LayerHelper('pad', input=x, **locals()) dtype = helper.input_dtype() - out = helper.create_tmp_variable(dtype) + out = helper.create_variable_for_type_inference(dtype) helper.append_op( type='pad', inputs={'X': x}, @@ -5064,7 +5332,7 @@ def pad_constant_like(x, y, pad_value=0., name=None): """ helper = LayerHelper('pad_constant_like', input=x, **locals()) dtype = helper.input_dtype() - out = helper.create_tmp_variable(dtype) + out = helper.create_variable_for_type_inference(dtype) helper.append_op( type='pad_constant_like', inputs={'X': x, @@ -5129,7 +5397,7 @@ def label_smooth(label, raise ValueError("The value of epsilon must be between 0 and 1.") helper = LayerHelper("label_smooth", **locals()) label.stop_gradient = True - smooth_label = helper.create_tmp_variable(dtype) + smooth_label = helper.create_variable_for_type_inference(dtype) helper.append_op( type="label_smooth", inputs={"X": label, @@ -5161,8 +5429,8 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): """ helper = LayerHelper('roi_pool', **locals()) dtype = helper.input_dtype() - pool_out = helper.create_tmp_variable(dtype) - argmaxes = helper.create_tmp_variable(dtype='int32') + pool_out = helper.create_variable_for_type_inference(dtype) + argmaxes = helper.create_variable_for_type_inference(dtype='int32') helper.append_op( type="roi_pool", inputs={"X": input, @@ -5177,6 +5445,54 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): return pool_out +@templatedoc() +def roi_align(input, + rois, + pooled_height=1, + pooled_width=1, + spatial_scale=1.0, + sampling_ratio=-1, + name=None): + """ + ${comment} + + Args: + input (Variable): ${x_comment} + rois (Variable): ROIs (Regions of Interest) to pool over. 
+ pooled_height (integer): ${pooled_height_comment} Default: 1 + pooled_width (integer): ${pooled_width_comment} Default: 1 + spatial_scale (float): ${spatial_scale_comment} Default: 1.0 + sampling_ratio (integer): ${sampling_ratio_comment} Default: -1 + + Returns: + Variable: ${out_comment}. + Examples: + .. code-block:: python + + align_out = fluid.layers.roi_align(input=x, + rois=rois, + pooled_height=7, + pooled_width=7, + spatial_scale=0.5, + sampling_ratio=-1) + """ + helper = LayerHelper('roi_align', **locals()) + dtype = helper.input_dtype() + align_out = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type="roi_align", + inputs={"X": input, + "ROIs": rois}, + outputs={"Out": align_out}, + attrs={ + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "spatial_scale": spatial_scale, + "sampling_ratio": sampling_ratio + }) + return align_out + + def dice_loss(input, label, epsilon=0.00001): """ Dice loss for comparing the similarity of two batch of data, @@ -5287,7 +5603,7 @@ def image_resize(input, out_h = int(input.shape[2] * scale) out_w = int(input.shape[3] * scale) - out = helper.create_tmp_variable(dtype) + out = helper.create_variable_for_type_inference(dtype) helper.append_op( type=resample_methods[resample], inputs=inputs, @@ -5396,7 +5712,7 @@ def gather(input, index): """ helper = LayerHelper('gather', **locals()) dtype = helper.input_dtype() - out = helper.create_tmp_variable(dtype) + out = helper.create_variable_for_type_inference(dtype) helper.append_op( type="gather", inputs={"X": input, @@ -5436,7 +5752,7 @@ def scatter(input, index, updates, name=None): """ helper = LayerHelper('scatter', **locals()) dtype = helper.input_dtype() - out = helper.create_tmp_variable(dtype) + out = helper.create_variable_for_type_inference(dtype) helper.append_op( type="scatter", inputs={"X": input, @@ -5496,7 +5812,7 @@ def sequence_scatter(input, index, updates, name=None): """ helper = LayerHelper('sequence_scatter', 
**locals()) dtype = helper.input_dtype() - out = helper.create_tmp_variable(dtype) + out = helper.create_variable_for_type_inference(dtype) helper.append_op( type="sequence_scatter", inputs={"X": input, @@ -5526,7 +5842,7 @@ def random_crop(x, shape, seed=None): """ helper = LayerHelper("random_crop", **locals()) dtype = x.dtype - out = helper.create_tmp_variable(dtype) + out = helper.create_variable_for_type_inference(dtype) if seed is None: seed = np.random.randint(-65536, 65536) op_attrs = {"shape": shape} @@ -5572,7 +5888,7 @@ def log(x, name=None): """ helper = LayerHelper('log', **locals()) dtype = helper.input_dtype(input_param_name='x') - out = helper.create_tmp_variable(dtype) + out = helper.create_variable_for_type_inference(dtype) helper.append_op(type="log", inputs={"X": x}, outputs={"Out": out}) return out @@ -5603,7 +5919,7 @@ def relu(x, name=None): """ helper = LayerHelper('relu', **locals()) dtype = helper.input_dtype(input_param_name='x') - out = helper.create_tmp_variable(dtype) + out = helper.create_variable_for_type_inference(dtype) helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out}) return out @@ -5642,9 +5958,9 @@ def mean_iou(input, label, num_classes): """ helper = LayerHelper('mean_iou', **locals()) dtype = helper.input_dtype() - out_mean_iou = helper.create_tmp_variable(dtype='float32') - out_wrong = helper.create_tmp_variable(dtype='int32') - out_correct = helper.create_tmp_variable(dtype='int32') + out_mean_iou = helper.create_variable_for_type_inference(dtype='float32') + out_wrong = helper.create_variable_for_type_inference(dtype='int32') + out_correct = helper.create_variable_for_type_inference(dtype='int32') helper.append_op( type="mean_iou", inputs={"Predictions": input, @@ -5736,7 +6052,7 @@ def crop(x, shape=None, offsets=None, name=None): if offsets is None: offsets = [0] * len(x.shape) - out = helper.create_tmp_variable(x.dtype) + out = helper.create_variable_for_type_inference(x.dtype) ipts = {'X': x} attrs = 
{} if isinstance(shape, Variable): @@ -5816,7 +6132,7 @@ def rank_loss(label, left, right, name=None): if not (isinstance(right, Variable)): raise ValueError("The right should be a Variable") - out = helper.create_tmp_variable("float32") + out = helper.create_variable_for_type_inference("float32") helper.append_op( type='rank_loss', @@ -5827,6 +6143,54 @@ def rank_loss(label, left, right, name=None): return out +def margin_rank_loss(label, left, right, margin=0.1, name=None): + """ + Margin Ranking Loss Layer for ranking problem, + which compares left score and right score passed in. + The ranking loss can be defined as following equation: + + .. math:: + + rank\_loss &= max(0, -label * (left - right) + margin) + + Args: + label (Variable): Indicates whether the left is ranked higher than the right or not. + left (Variable): Ranking score for left. + right (Variable): Ranking score for right. + margin (float): Indicates the given margin. + name (str|None): A name for this layer (optional). If set None, the layer + will be named automatically. + Returns: + Variable: The ranking loss. + Raises: + ValueError: Any of label, left, and right is not a Variable. + Examples: + .. 
code-block:: python + label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32") + left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32") + right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32") + out = fluid.layers.margin_rank_loss(label, left, right) + """ + helper = LayerHelper('margin_rank_loss', **locals()) + if not isinstance(label, Variable): + raise ValueError("The label should be a Variable.") + if not isinstance(left, Variable): + raise ValueError("The left should be a Variable.") + if not isinstance(right, Variable): + raise ValueError("The right should be a Variable.") + out = helper.create_variable_for_type_inference(left.dtype) + act = helper.create_variable_for_type_inference(left.dtype) + helper.append_op( + type='margin_rank_loss', + inputs={"Label": label, + "X1": left, + "X2": right}, + outputs={'Out': out, + 'Activated': act}, + attrs={'margin': margin}) + return out + + def pad2d(input, paddings=[0, 0, 0, 0], mode='constant', @@ -5900,7 +6264,7 @@ def pad2d(input, helper = LayerHelper('pad2d', **locals()) dtype = helper.input_dtype(input_param_name='input') - out = helper.create_tmp_variable(dtype) + out = helper.create_variable_for_type_inference(dtype) helper.append_op( type='pad2d', inputs={'X': input}, @@ -5929,7 +6293,7 @@ def elu(x, alpha=1.0, name=None): output(${out_type}): ${out_comment} """ helper = LayerHelper('elu', **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type='elu', inputs={'X': x}, @@ -5952,7 +6316,7 @@ def relu6(x, threshold=6.0, name=None): output(${out_type}): ${out_comment} """ helper = LayerHelper('relu6', **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type='relu6', inputs={'X': x}, @@ -5975,7 +6339,7 @@ def pow(x, factor=1.0, name=None): output(${out_type}): ${out_comment} """ helper 
= LayerHelper('pow', **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type='pow', inputs={'X': x}, @@ -5999,7 +6363,7 @@ def stanh(x, scale_a=2.0 / 3.0, scale_b=1.7159, name=None): output(${out_type}): ${out_comment} """ helper = LayerHelper('stanh', **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type='stanh', inputs={'X': x}, @@ -6024,7 +6388,7 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, name=None): output(${out_type}): ${out_comment} """ helper = LayerHelper('hard_sigmoid', **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type='hard_sigmoid', inputs={'X': x}, @@ -6048,7 +6412,7 @@ def swish(x, beta=1.0, name=None): output(${out_type}): ${out_comment} """ helper = LayerHelper('swish', **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type='swish', inputs={'X': x}, @@ -6100,7 +6464,7 @@ def prelu(x, mode, param_attr=None, name=None): dtype='float32', is_bias=False, default_initializer=Constant(1.0)) - out = helper.create_tmp_variable(dtype) + out = helper.create_variable_for_type_inference(dtype) helper.append_op( type="prelu", inputs={"X": x, @@ -6124,7 +6488,7 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None): output(${out_type}): ${out_comment} """ helper = LayerHelper('brelu', **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type='brelu', inputs={'X': x}, @@ -6147,7 +6511,7 @@ def leaky_relu(x, alpha=0.02, name=None): output(${out_type}): ${out_comment} """ helper = LayerHelper('leaky_relu', **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) + out = 
helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type='leaky_relu', inputs={'X': x}, @@ -6169,7 +6533,7 @@ def soft_relu(x, threshold=40.0, name=None): output(${out_type}): ${out_comment} """ helper = LayerHelper('soft_relu', **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type='soft_relu', inputs={'X': x}, @@ -6236,8 +6600,8 @@ def flatten(x, axis=1, name=None): if not (isinstance(axis, int)) or axis > len(x.shape) or axis < 0: raise ValueError("The axis should be a int, and in range [0, rank(x)]") - out = helper.create_tmp_variable(x.dtype) - x_shape = helper.create_tmp_variable(x.dtype) + out = helper.create_variable_for_type_inference(x.dtype) + x_shape = helper.create_variable_for_type_inference(x.dtype) helper.append_op( type='flatten2', inputs={"X": x}, @@ -6283,13 +6647,15 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None): out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0) """ helper = LayerHelper('sequence_enumerate', **locals()) - out = helper.create_tmp_variable(helper.input_dtype(), stop_gradient=True) + out = helper.create_variable_for_type_inference( + helper.input_dtype(), stop_gradient=True) helper.append_op( type='sequence_enumerate', inputs={'X': input}, outputs={'Out': out}, attrs={'win_size': win_size, 'pad_value': pad_value}) + return out def sequence_mask(x, maxlen=None, dtype='int64', name=None): @@ -6322,9 +6688,9 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): helper = LayerHelper('sequence_mask', **locals()) if name is None: - out = helper.create_tmp_variable(dtype=dtype) + out = helper.create_variable_for_type_inference(dtype=dtype) else: - out = helper.create_tmp_variable(dtype=dtype, name=name) + out = helper.create_variable_for_type_inference(dtype=dtype, name=name) helper.append_op( type='sequence_mask', @@ -6367,7 +6733,7 @@ def stack(x, axis=0): if not 
isinstance(x, list) and not isinstance(x, tuple): x = [x] - out = helper.create_tmp_variable(x[0].dtype) + out = helper.create_variable_for_type_inference(x[0].dtype) helper.append_op( type='stack', inputs={'X': x}, outputs={'Y': out}, attrs={'axis': axis}) @@ -6405,7 +6771,7 @@ def unstack(x, axis=0, num=None): outs = [] for _ in num: - outs.append(helper.create_tmp_variable(x.dtype)) + outs.append(helper.create_variable_for_type_inference(x.dtype)) helper.append_op( type='unstack', @@ -6457,7 +6823,7 @@ def expand(x, expand_times, name=None): """ helper = LayerHelper('expand', input=x, **locals()) dtype = helper.input_dtype(input_param_name='x') - out = helper.create_tmp_variable(dtype) + out = helper.create_variable_for_type_inference(dtype) helper.append_op( type='expand', inputs={'X': x}, @@ -6496,7 +6862,7 @@ def uniform_random_batch_size_like(input, """ helper = LayerHelper('uniform_random_batch_size_like', **locals()) - out = helper.create_tmp_variable(dtype) + out = helper.create_variable_for_type_inference(dtype) c_dtype = convert_np_dtype_to_dtype_(dtype) helper.append_op( type='uniform_random_batch_size_like', @@ -6533,7 +6899,7 @@ def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'): """ helper = LayerHelper('gaussian_random', **locals()) - out = helper.create_tmp_variable(dtype) + out = helper.create_variable_for_type_inference(dtype) c_dtype = convert_np_dtype_to_dtype_(dtype) helper.append_op( type='gaussian_random', @@ -6568,7 +6934,7 @@ def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'): """ helper = LayerHelper('sampling_id', **locals()) - out = helper.create_tmp_variable(dtype) + out = helper.create_variable_for_type_inference(dtype) helper.append_op( type='sampling_id', inputs={'X': x}, @@ -6607,7 +6973,7 @@ def gaussian_random_batch_size_like(input, """ helper = LayerHelper('gaussian_random_batch_size_like', **locals()) - out = helper.create_tmp_variable(dtype) + out = 
helper.create_variable_for_type_inference(dtype) c_dtype = convert_np_dtype_to_dtype_(dtype) helper.append_op( type='gaussian_random_batch_size_like', @@ -6639,7 +7005,8 @@ def sum(x): """ helper = LayerHelper('sum', **locals()) - out = helper.create_tmp_variable(dtype=helper.input_dtype('x')) + out = helper.create_variable_for_type_inference( + dtype=helper.input_dtype('x')) helper.append_op( type='sum', inputs={'X': x}, @@ -6666,7 +7033,8 @@ def slice(input, axes, starts, ends): """ helper = LayerHelper('slice', **locals()) - out = helper.create_tmp_variable(dtype=helper.input_dtype('input')) + out = helper.create_variable_for_type_inference( + dtype=helper.input_dtype('input')) helper.append_op( type='slice', inputs={'Input': input}, @@ -6692,7 +7060,8 @@ def shape(input): """ helper = LayerHelper('shape', **locals()) - out = helper.create_tmp_variable(dtype=helper.input_dtype('input')) + out = helper.create_variable_for_type_inference( + dtype=helper.input_dtype('input')) helper.append_op( type='shape', inputs={'Input': input}, outputs={'Out': out}) @@ -6709,7 +7078,7 @@ def _elementwise_op(helper): use_mkldnn = helper.kwargs.get('use_mkldnn', False) name = helper.kwargs.get('name', None) if name is None: - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) else: out = helper.create_variable( name=name, dtype=x.dtype, persistable=False) @@ -6743,7 +7112,7 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): helper = LayerHelper('scale', **locals()) if name is None: - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) else: out = helper.create_variable( name=name, dtype=x.dtype, persistable=False) @@ -6809,7 +7178,7 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): if out is None: if name is None: - out = helper.create_tmp_variable(dtype=x.dtype) + out = 
helper.create_variable_for_type_inference(dtype=x.dtype) else: out = helper.create_variable( name=name, dtype=x.dtype, persistable=False) @@ -6917,7 +7286,7 @@ def clip(x, min, max, name=None): helper = LayerHelper("clip", **locals()) if name is None: - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) else: out = helper.create_variable( name=name, dtype=x.dtype, persistable=False) @@ -6949,7 +7318,7 @@ def clip_by_norm(x, max_norm, name=None): helper = LayerHelper("clip_by_norm", **locals()) if name is None: - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) else: out = helper.create_variable( name=name, dtype=x.dtype, persistable=False) @@ -6979,7 +7348,7 @@ def mean(x, name=None): helper = LayerHelper("mean", **locals()) if name is None: - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) else: out = helper.create_variable( name=name, dtype=x.dtype, persistable=False) @@ -7009,7 +7378,7 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): helper = LayerHelper("mul", **locals()) if name is None: - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) else: out = helper.create_variable( name=name, dtype=x.dtype, persistable=False) @@ -7043,7 +7412,7 @@ def sigmoid_cross_entropy_with_logits(x, label, name=None): helper = LayerHelper("sigmoid_cross_entropy_with_logits", **locals()) if name is None: - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) else: out = helper.create_variable( name=name, dtype=x.dtype, persistable=False) @@ -7073,7 +7442,7 @@ def maxout(x, groups, name=None): helper = LayerHelper("maxout", **locals()) if name is None: - out = helper.create_tmp_variable(dtype=x.dtype) + out = 
helper.create_variable_for_type_inference(dtype=x.dtype) else: out = helper.create_variable( name=name, dtype=x.dtype, persistable=False) @@ -7084,3 +7453,44 @@ def maxout(x, groups, name=None): attrs={"groups": groups}, outputs={"Out": out}) return out + + +def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None): + """ + Applies a separate affine transformation to each channel of the input. + Useful for replacing spatial batch norm with its equivalent fixed + transformation. The input can also be a 2D tensor, in which case the affine + transformation is applied in the second dimension. + + Args: + x (Variable): Feature map input can be a 4D tensor with order NCHW + or NHWC. It also can be a 2D tensor and the affine transformation + is applied in the second dimension. + scale (Variable): 1D input of shape (C), the c-th element is the scale + factor of the affine transformation for the c-th channel of + the input. + bias (Variable): 1D input of shape (C), the c-th element is the bias + of the affine transformation for the c-th channel of the input. + data_layout (string, default NCHW): NCHW or NHWC. If input is 2D + tensor, you can ignore data_layout. + name (str, default None): The name of this layer. + + Returns: + out (Variable): A tensor of the same shape and data layout as x. 
+ """ + helper = LayerHelper("affine_channel", **locals()) + + if name is None: + out = helper.create_variable_for_type_inference(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + helper.append_op( + type="affine_channel", + inputs={"X": x, + 'Scale': scale, + 'Bias': bias}, + attrs={"data_layout": data_layout}, + outputs={"Out": out}) + return out diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 9a8300524d..1ff40a26f2 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -14,6 +14,8 @@ from __future__ import print_function from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr +from .. import core +from ..framework import convert_np_dtype_to_dtype_ __activations_noattr__ = [ 'sigmoid', @@ -58,8 +60,11 @@ _uniform_random_ = generate_layer_fn('uniform_random') def uniform_random(shape, dtype=None, min=None, max=None, seed=None): + locals_var = locals().keys() + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) kwargs = dict() - for name in locals(): + for name in locals_var: val = locals()[name] if val is not None: kwargs[name] = val @@ -78,8 +83,9 @@ _hard_shrink_ = generate_layer_fn('hard_shrink') def hard_shrink(x, threshold=None): + locals_var = locals().keys() kwargs = dict() - for name in locals(): + for name in locals_var: val = locals()[name] if val is not None: kwargs[name] = val @@ -99,12 +105,12 @@ _cum_sum_ = generate_layer_fn('cumsum') def cumsum(x, axis=None, exclusive=None, reverse=None): + locals_var = locals().keys() kwargs = dict() - for name in locals(): + for name in locals_var: val = locals()[name] if val is not None: kwargs[name] = val - return _cum_sum_(**kwargs) @@ -121,8 +127,9 @@ _thresholded_relu_ = generate_layer_fn('thresholded_relu') def thresholded_relu(x, threshold=None): + locals_var = locals().keys() kwargs = dict() - for name in 
locals(): + for name in locals_var: val = locals()[name] if val is not None: kwargs[name] = val diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 44b92af7ac..09a7cb8dc9 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -100,7 +100,7 @@ def create_global_var(shape, force_cpu=False, name=None): """ - Create a new variable in the global block(block 0). + Create a new tensor variable with value in the global block(block 0). Args: shape(list[int]): shape of the variable @@ -152,7 +152,7 @@ def cast(x, dtype): result = fluid.layers.cast(x=data, dtype='float64') """ helper = LayerHelper('cast', **locals()) - out = helper.create_tmp_variable(dtype=dtype) + out = helper.create_variable_for_type_inference(dtype=dtype) helper.append_op( type='cast', inputs={'X': [x]}, @@ -184,7 +184,7 @@ def concat(input, axis=0, name=None): out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth]) """ helper = LayerHelper('concat', **locals()) - out = helper.create_tmp_variable(dtype=helper.input_dtype()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) helper.append_op( type='concat', inputs={'X': input}, @@ -221,7 +221,8 @@ def sums(input, out=None): """ helper = LayerHelper('sum', **locals()) if out is None: - out = helper.create_tmp_variable(dtype=helper.input_dtype()) + out = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) helper.append_op( type='sum', inputs={'X': input}, @@ -252,7 +253,7 @@ def assign(input, output=None): """ helper = LayerHelper('assign', **locals()) if output is None: - output = helper.create_tmp_variable(dtype=input.dtype) + output = helper.create_variable_for_type_inference(dtype=input.dtype) if isinstance(input, Variable): helper.append_op( type='assign', inputs={'X': [input]}, outputs={'Out': [output]}) @@ -311,7 +312,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): helper = 
LayerHelper("fill_constant", **locals()) if out is None: - out = helper.create_tmp_variable(dtype=dtype) + out = helper.create_variable_for_type_inference(dtype=dtype) helper.append_op( type='fill_constant', inputs={}, @@ -358,7 +359,7 @@ def fill_constant_batch_size_like(input, ${out_comment}. """ helper = LayerHelper("fill_constant_batch_size_like", **locals()) - out = helper.create_tmp_variable(dtype=dtype) + out = helper.create_variable_for_type_inference(dtype=dtype) helper.append_op( type='fill_constant_batch_size_like', inputs={'Input': input}, @@ -396,7 +397,7 @@ def argmin(x, axis=0): out = fluid.layers.argmin(x=in, axis=-1) """ helper = LayerHelper("arg_min", **locals()) - out = helper.create_tmp_variable(VarDesc.VarType.INT64) + out = helper.create_variable_for_type_inference(VarDesc.VarType.INT64) helper.append_op( type='arg_min', inputs={'X': x}, @@ -427,7 +428,7 @@ def argmax(x, axis=0): out = fluid.layers.argmax(x=in, axis=-1) """ helper = LayerHelper("arg_max", **locals()) - out = helper.create_tmp_variable(VarDesc.VarType.INT64) + out = helper.create_variable_for_type_inference(VarDesc.VarType.INT64) helper.append_op( type='arg_max', inputs={'X': x}, @@ -477,8 +478,10 @@ def argsort(input, axis=-1, name=None): out, indices = fluid.layers.argsort(input, axis=0) """ helper = LayerHelper("argsort", **locals()) - out = helper.create_tmp_variable(dtype=input.dtype, stop_gradient=True) - ids = helper.create_tmp_variable(VarDesc.VarType.INT64, stop_gradient=True) + out = helper.create_variable_for_type_inference( + dtype=input.dtype, stop_gradient=True) + ids = helper.create_variable_for_type_inference( + VarDesc.VarType.INT64, stop_gradient=True) helper.append_op( type='argsort', inputs={'X': input}, @@ -562,7 +565,7 @@ def reverse(x, axis): if isinstance(axis, int): axis = [axis] helper = LayerHelper("reverse", **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( 
type='reverse', inputs={'Input': x}, @@ -654,7 +657,7 @@ def has_inf(x): Variable: The tensor variable storing the output, only a bool value. """ helper = LayerHelper("isinf", **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op(type="isinf", inputs={"X": x}, outputs={"Out": out}) return out @@ -670,7 +673,7 @@ def has_nan(x): Variable: The tensor variable storing the output, only a bool value. """ helper = LayerHelper("isnan", **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op(type="isnan", inputs={"X": x}, outputs={"Out": out}) return out @@ -687,6 +690,6 @@ def isfinite(x): Variable: The tensor variable storing the output, contains a bool value. """ helper = LayerHelper("isfinite", **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op(type="isfinite", inputs={"X": x}, outputs={"Out": out}) return out diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 1dabad54f5..00d33b36fc 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -64,23 +64,33 @@ def simple_img_conv_pool(input, average-pooling. Default :math:`max`. global_pooling (bool): Whether to use the global pooling. If global_pooling = true, pool_size and pool_padding while be ignored. Default False - conv_stride (int|list|tuple): The stride size of the Conv2d Layer. If stride is a + conv_stride (int|list|tuple): The stride size of the conv2d Layer. If stride is a list or tuple, it must contain two integers, (conv_stride_H, conv_stride_W). Otherwise, the conv_stride_H = conv_stride_W = conv_stride. Default: conv_stride = 1. - conv_padding (int|list|tuple): The padding size of the Conv2d Layer. If padding is + conv_padding (int|list|tuple): The padding size of the conv2d Layer. 
If padding is a list or tuple, it must contain two integers, (conv_padding_H, conv_padding_W). Otherwise, the conv_padding_H = conv_padding_W = conv_padding. Default: conv_padding = 0. - conv_dilation (int|list|tuple): The dilation size of the Conv2d Layer. If dilation is + conv_dilation (int|list|tuple): The dilation size of the conv2d Layer. If dilation is a list or tuple, it must contain two integers, (conv_dilation_H, conv_dilation_W). Otherwise, the conv_dilation_H = conv_dilation_W = conv_dilation. Default: conv_dilation = 1. - conv_groups (int): The groups number of the Conv2d Layer. According to grouped + conv_groups (int): The groups number of the conv2d Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only - connected to the second half of the input channels. Default: groups=1 - param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None - bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None - act (str): Activation type for Conv2d. Default: None + connected to the second half of the input channels. Default: groups=1. + param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights + of conv2d. If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with :math:`Normal(0.0, std)`, + and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. + Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. 
+ act (str): Activation type for conv2d, if it is set to None, activation is not + appended. Default: None. use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn library is installed. Default: True diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 1b9571f6d3..17af44afdd 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -15,7 +15,7 @@ from __future__ import print_function import re from collections import defaultdict -from paddle.fluid.framework import Program, Variable, name_scope +from paddle.fluid.framework import Program, Variable, name_scope, default_main_program from . import framework from . import layers from .backward import append_backward @@ -111,7 +111,8 @@ class Optimizer(object): if param_lr == 1.0: return self._global_learning_rate() else: - return self._global_learning_rate() * param_lr + with default_main_program()._lr_schedule_guard(): + return self._global_learning_rate() * param_lr def _create_accumulators(self, block, parameters): """Create all accumulators needed by the parameters @@ -659,6 +660,9 @@ class AdamaxOptimizer(Optimizer): optimizer = fluid.optimizer.Adamax(learning_rate=0.2) optimizer.minimize(cost) + + Notes: + Currently, AdamaxOptimizer doesn't support sparse parameter optimization. """ _moment_acc_str = "moment" _inf_norm_acc_str = "inf_norm" @@ -778,6 +782,9 @@ class DecayedAdagradOptimizer(Optimizer): optimizer = fluid.optimizer.DecayedAdagrad(learning_rate=0.2) optimizer.minimize(cost) + + Notes: + Currently, DecayedAdagradOptimizer doesn't support sparse parameter optimization. """ _moment_acc_str = "moment" @@ -858,6 +865,9 @@ class AdadeltaOptimizer(Optimizer): optimizer = fluid.optimizer.Adadelta( learning_rate=0.0003, epsilon=1.0e-6, rho=0.95) _, params_grads = optimizer.minimize(cost) + + Notes: + Currently, AdadeltaOptimizer doesn't support sparse parameter optimization. 
""" _avg_squared_grad_acc_str = "_avg_squared_grad" @@ -1126,6 +1136,9 @@ class FtrlOptimizer(Optimizer): optimizer = fluid.optimizer.Ftrl(0.0001) _, params_grads = optimizer.minimize(cost) + + Notes: + Currently, FtrlOptimizer doesn't support sparse parameter optimization. """ _squared_acc_str = "squared" diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 57d272cbfb..3f4dd5eb71 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -31,15 +31,32 @@ BuildStrategy = core.ParallelExecutor.BuildStrategy class ParallelExecutor(object): """ - ParallelExecutor can run program in parallel. + ParallelExecutor is designed for data parallelism, which focuses on distributing + the data across different nodes and every node operates on the data in parallel. + If you use ParallelExecutor to run the current program on GPU, the node means GPU + device, and ParallelExecutor will get the available GPU device automatically on + the current machine. If you use ParallelExecutor to run the current program on CPU, + the node means the CPU device, and you can specify the CPU device number by adding + 'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable + is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number + of CPUs in the system. Args: use_cuda (bool): Whether to use CUDA or not. loss_name (str): The loss name must set in training. Default None. main_program (Program): The program that need to run, if not provided, then default_main_program will be used. Default None. - share_vars_from(ParallelExecutor): If provied, it will share variables + share_vars_from(ParallelExecutor): If provided, it will share variables from the specified ParallelExecutor. Default None. 
+ exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run + the program in ParallelExecutor, for example how many threads are used to + execute the program, how many iterations to clean up the temp variables + which are generated during execution. For more information, please refer + to fluid.ExecutionStrategy. Default None. + build_strategy(BuildStrategy): build_strategy is used to control how to + build the SSA Graph in ParallelExecutor by setting the property, + for example reduce_strategy, gradient_scale_strategy. For more information, + please refer to fluid.BuildStrategy. Default None. num_trainers(int): If greater than 1, NCCL will be initialized with multiple rank of nodes, each node should have same number of GPUs. Distributed training will be enabled then. Default 1. diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index a4336e955f..c151fbd172 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -151,7 +151,7 @@ class L2DecayRegularizer(WeightDecayRegularizer): decay = block.create_var( dtype="float32", shape=param.shape, - type=core.VarDesc.VarType.SELECTED_ROWS) + type=core.VarDesc.VarType.LOD_TENSOR) block.append_op( type='extract_rows', inputs={'X': grad}, outputs={'Out': idx}) block.append_op( @@ -228,7 +228,7 @@ class L1DecayRegularizer(WeightDecayRegularizer): decay = block.create_var( dtype="float32", shape=param.shape, - type=core.VarDesc.VarType.SELECTED_ROWS) + type=core.VarDesc.VarType.LOD_TENSOR) block.append_op( type='extract_rows', inputs={'X': grad}, outputs={'Out': idx}) block.append_op( @@ -237,6 +237,7 @@ class L1DecayRegularizer(WeightDecayRegularizer): 'Ids': idx}, outputs={'Out': decay}, attrs={'is_sparse': True}) + param = decay # Append sign op block.append_op( diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt index 1885dda44a..7ad923d332 100644 --- a/python/paddle/fluid/tests/CMakeLists.txt 
+++ b/python/paddle/fluid/tests/CMakeLists.txt @@ -1,4 +1,5 @@ -set(PYTHON_TESTS_DIR ${CMAKE_CURRENT_BINARY_DIR} CACHE PATH "python tests directory") +set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory") + file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") diff --git a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py index 6456d1b53a..fac5e037a4 100644 --- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py @@ -81,7 +81,10 @@ def get_optimizer(): return optimizer -def train_network(batch_size, is_distributed=False, is_sparse=False): +def train_network(batch_size, + is_distributed=False, + is_sparse=False, + is_self_contained_lr=False): # query q = fluid.layers.data( name="query_ids", shape=[1], dtype="int64", lod_level=1) @@ -93,7 +96,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False): param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.01), name="__emb__", - learning_rate=emb_lr), + learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__"), is_sparse=is_sparse) ## vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') @@ -119,7 +124,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False): param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.01), name="__emb__", - learning_rate=emb_lr), + learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__"), is_sparse=is_sparse) ## vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') @@ -144,7 +151,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False): param_attr=fluid.ParamAttr( 
initializer=fluid.initializer.Constant(value=0.01), name="__emb__", - learning_rate=emb_lr), + learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__"), is_sparse=is_sparse) ## vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') @@ -220,7 +229,10 @@ class TestDistSimnetBow2x2(TestDistRunnerBase): def get_model(self, batch_size=2): # Train program avg_cost, acc, predict = \ - train_network(batch_size, bool(int(os.environ["IS_DISTRIBUTED"])), bool(int(os.environ["IS_SPARSE"]))) + train_network(batch_size, + bool(int(os.environ["IS_DISTRIBUTED"])), + bool(int(os.environ["IS_SPARSE"])), + bool(int(os.environ["IS_SELF_CONTAINED_LR"]))) inference_program = fluid.default_main_program().clone() diff --git a/python/paddle/fluid/tests/unittests/test_affine_channel_op.py b/python/paddle/fluid/tests/unittests/test_affine_channel_op.py new file mode 100644 index 0000000000..2c9a063e6e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_affine_channel_op.py @@ -0,0 +1,106 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core + + +def affine_channel(x, scale, bias, layout): + C = x.shape[1] if layout == 'NCHW' else x.shape[-1] + if len(x.shape) == 4: + new_shape = (1, C, 1, 1) if layout == 'NCHW' else (1, 1, 1, C) + else: + new_shape = (1, C) + scale = scale.reshape(new_shape) + bias = bias.reshape(new_shape) + return x * scale + bias + + +class TestAffineChannelOp(OpTest): + def setUp(self): + self.op_type = "affine_channel" + self.init_test_case() + + x = np.random.random(self.shape).astype("float32") + scale = np.random.random(self.C).astype("float32") + bias = np.random.random(self.C).astype("float32") + + y = affine_channel(x, scale, bias, self.layout) + + self.inputs = {'X': x, 'Scale': scale, 'Bias': bias} + self.attrs = {'data_layout': self.layout} + self.outputs = {'Out': y} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X', 'Scale', 'Bias'], 'Out') + + def test_check_grad_stopgrad_dx(self): + self.check_grad(['Scale', 'Bias'], 'Out', no_grad_set=set('X')) + + def test_check_grad_stopgrad_dscale_dbias(self): + self.check_grad(['X'], 'Out', no_grad_set=set(['Scale', 'Bias'])) + + def init_test_case(self): + self.shape = [2, 32, 14, 14] + self.C = 32 + self.layout = 'NCHW' + + +class TestAffineChannelNHWC(TestAffineChannelOp): + def init_test_case(self): + self.shape = [2, 14, 14, 32] + self.C = 32 + self.layout = 'NHWC' + + +class TestAffineChannel2D(TestAffineChannelOp): + def init_test_case(self): + self.shape = [16, 64] + self.C = 64 + self.layout = 'NCHW' + + +class TestAffineChannelNCHWLargeShape(TestAffineChannelOp): + def init_test_case(self): + self.shape = [64, 128, 112, 112] + self.C = 128 + self.layout = 'NCHW' + + # since the gradient check is very slow in large shape, so skip check_grad + def test_check_grad(self): + pass + + def test_check_grad_stopgrad_dx(self): + 
pass + + def test_check_grad_stopgrad_dscale_dbias(self): + pass + + +class TestAffineChannelNHWCLargeShape(TestAffineChannelNCHWLargeShape): + def init_test_case(self): + self.shape = [64, 112, 112, 512] + self.C = 512 + self.layout = 'NHWC' + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py index 6103c3aafc..46433d7825 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py @@ -18,6 +18,9 @@ import unittest import numpy as np from op_test import OpTest +import paddle.fluid as fluid +import paddle.fluid.core as core + class TestClipByNormOp(OpTest): def setUp(self): @@ -62,5 +65,59 @@ class TestCase3(TestClipByNormOp): self.max_norm = 1.0 +class TestClipByNormOpWithSelectedRows(OpTest): + def check_with_place(self, place): + self.config_test_case() + scope = core.Scope() + + # set input + x_selected_rows = scope.var('X').get_selected_rows() + x_selected_rows.set_rows(self.grad_rows) + x_tensor = x_selected_rows.get_tensor() + x_np = np.random.random(self.grad_shape).astype("float32") + x_np[np.abs(x_np) < self.max_relative_error] = 0.5 + x_tensor.set(x_np, place) + + # set output + out_selected_rows = scope.var('Out').get_selected_rows() + + # run clip_by_norm_op + clip_by_norm_op = fluid.op.Operator( + "clip_by_norm", max_norm=self.max_norm, X='X', Out='Out') + clip_by_norm_op.run(scope, place) + + # check output + self.assertEqual(out_selected_rows.rows(), self.grad_clipped_rows) + out_tensor = out_selected_rows.get_tensor() + y_np = np.zeros(self.grad_clipped_shape) + y_np[0] = np.sum(x_np[0:2]) + y_np[1] = x_np[2] + y_np[2] = x_np[3] + norm = np.sqrt(np.sum(np.square(y_np))) + if norm > self.max_norm: + output = self.max_norm * y_np / norm + else: + output = y_np + self.assertTrue( + np.allclose( + np.array(out_tensor), output, atol=1e-5, 
equal_nan=False)) + + def test_clip_by_norm_with_selected_ros(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for place in places: + self.check_with_place(place) + + def config_test_case(self): + self.max_norm = 1.0 + self.max_relative_error = 0.006 + self.grad_shape = (4, 1) + self.grad_clipped_shape = (3, 1) + self.grad_rows = [0, 0, 1, 2] + self.grad_clipped_rows = [0, 1, 2] + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py index e971f29db4..a0b6879f99 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py @@ -25,7 +25,11 @@ class TestDistSimnetBowDense2x2(TestDistBase): self._enforce_place = "CPU" def test_simnet_bow(self): - need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} + need_envs = { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '0', + 'IS_SELF_CONTAINED_LR': '1' + } self.check_with_place( "dist_simnet_bow.py", delta=1e-5, @@ -39,7 +43,11 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase): self._enforce_place = "CPU" def test_simnet_bow(self): - need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} + need_envs = { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '0', + 'IS_SELF_CONTAINED_LR': '1' + } self.check_with_place( "dist_simnet_bow.py", delta=100, @@ -53,7 +61,11 @@ class TestDistSimnetBowSparse2x2(TestDistBase): self._enforce_place = "CPU" def test_simnet_bow(self): - need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} + need_envs = { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '1', + 'IS_SELF_CONTAINED_LR': '1' + } self.check_with_place( "dist_simnet_bow.py", delta=1e-5, @@ -67,7 +79,11 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase): self._enforce_place = "CPU" def test_simnet_bow(self): - need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} + need_envs = { + 
"IS_DISTRIBUTED": '0', + "IS_SPARSE": '1', + 'IS_SELF_CONTAINED_LR': '1' + } self.check_with_place( "dist_simnet_bow.py", delta=100, @@ -75,5 +91,61 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase): need_envs=need_envs) +# FIXME(tangwei): Learningrate variable is not created on pserver. +""" +class TestDistSimnetBow2x2LookupTableSync(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" + + def test_simnet_bow(self): + need_envs = { + "IS_DISTRIBUTED": '1', + "IS_SPARSE": '1', + 'IS_SELF_CONTAINED_LR': '1' + } + self.check_with_place( + "dist_simnet_bow.py", + delta=1e-5, + check_error_log=True, + need_envs=need_envs) + + +class TestDistSimnetBow2x2LookupTableAsync(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._enforce_place = "CPU" + + def test_simnet_bow(self): + need_envs = { + "IS_DISTRIBUTED": '1', + "IS_SPARSE": '1', + 'IS_SELF_CONTAINED_LR': '1' + } + self.check_with_place( + "dist_simnet_bow.py", + delta=100, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" + + def test_simnet_bow(self): + need_envs = { + "IS_DISTRIBUTED": '1', + "IS_SPARSE": '1', + 'IS_SELF_CONTAINED_LR': '0' + } + self.check_with_place( + "dist_simnet_bow.py", + delta=1e-5, + check_error_log=False, + need_envs=need_envs) +""" + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_embedding_fc_lstm_op.py b/python/paddle/fluid/tests/unittests/test_fused_embedding_fc_lstm_op.py new file mode 100644 index 0000000000..70ca521d33 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_embedding_fc_lstm_op.py @@ -0,0 +1,218 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +from test_lstm_op import lstm, ACTIVATION + + +def fc(x, w, b): + return np.dot(x, w) + b + + +def fused_embedded_fc_lstm( + ids, # T x 1 + lod, # 1 x N + embeddings=None, # Dict_size x M + wx=None, # M x 4D + bx=None, # 1 x 4D + h0=None, # N x D + c0=None, # N x D + w_h=None, # D x 4D + w_b=None, # 1 x 4D + w_c=None, # 1 x 3D + is_reverse=False, + act_gate=None, + act_cell=None, + act_cand=None): + # Make a lookup for embeddings and pass result into lstm reference + T = ids.shape[0] + M = embeddings.shape[1] + x = embeddings[ids].reshape([T, M]) + return lstm( + fc(x, wx, bx), lod, h0, c0, w_h, w_b, w_c, is_reverse, act_gate, + act_cell, act_cand) + + +class TestFusionLSTMOp(OpTest): + def set_conf(self): + pass + + def setUp(self): + self.op_type = 'fused_embedding_fc_lstm' + self.lod = [[2, 3, 5, 4]] + self.M = 8 # Embedding size + self.D = 16 # Hidden size + self.dict_size = 18 + self.has_initial_state = False + self.use_peepholes = False + self.is_reverse = False + self.act_gate = 'sigmoid' + self.act_cell = 'tanh' + self.act_cand = 'tanh' + self.set_conf() + + T = sum(self.lod[0]) + bs = len(self.lod[0]) + + # this is the weight of fc + wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float32') + # this is the bias of fc + bx = np.random.normal(size=(1, 4 * self.D)).astype('float32') + + if 
self.use_peepholes: + b = np.random.normal(size=(1, 7 * self.D)).astype('float32') + else: + b = np.random.normal(size=(1, 4 * self.D)).astype('float32') + w_b = np.copy(b[:, 0:4 * self.D]) + w_c = b[:, 4 * self.D:] if self.use_peepholes else None + + # ids are drawn from [0, dict_size - 1), since randint's high bound is exclusive + ids = np.random.randint( + low=0, high=self.dict_size - 1, size=(T, 1)).astype("int64") + # embeddings as if they were trained, so each entry is of size M + embeddings = np.random.random( + (self.dict_size, self.M)).astype("float32") + + # multiply embeddings via Weights + fc_embeddings = np.dot(embeddings, wx) + + # bias should be manually added into the bias of this fused embedding fc LSTM + b[0, 0:4 * self.D] += bx[0, :] + combined_biases = b[:, 0:4 * self.D] + # Broadcast the combined biases so they can be added + ones = np.ones([self.dict_size, 1]) + broadcasted_biases = np.dot(ones, combined_biases) + # Sum biases with Wx*embeddings + fc_embeddings += broadcasted_biases + + if self.has_initial_state: + h0 = np.random.normal(size=(bs, self.D)).astype('float32') + c0 = np.random.normal(size=(bs, self.D)).astype('float32') + else: + h0 = np.zeros((bs, self.D)).astype('float32') + c0 = np.zeros((bs, self.D)).astype('float32') + + wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float32') + + h, c = fused_embedded_fc_lstm( + ids, self.lod, embeddings, wx, bx, h0, c0, wh, w_b, w_c, + self.is_reverse, ACTIVATION[self.act_gate], + ACTIVATION[self.act_cell], ACTIVATION[self.act_cand]) + + self.inputs = { + 'Ids': (ids, self.lod), + 'Embeddings': fc_embeddings, + 'WeightH': wh, + 'Bias': b + } + + if self.has_initial_state: + self.inputs['H0'] = h0 + self.inputs['C0'] = c0 + + self.outputs = { + 'Hidden': (h, self.lod), + 'Cell': (c, self.lod), + } + self.attrs = { + 'use_peepholes': self.use_peepholes, + 'is_reverse': self.is_reverse, + 'gate_activation': self.act_gate, + 'cell_activation': self.act_cell, + 'candidate_activation': self.act_cand + } + + def test_check_output(self): + for 
use_seq in {True, False}: + self.attrs['use_seq'] = use_seq + self.check_output() + + +class TestFusionLSTMOpInit(TestFusionLSTMOp): + def set_conf(self): + self.has_initial_state = True + + +class TestFusionLSTMOpReverse(TestFusionLSTMOp): + def set_conf(self): + self.is_reverse = True + + +class TestFusionLSTMOpInitReverse(TestFusionLSTMOp): + def set_conf(self): + self.has_initial_state = True + self.is_reverse = True + + +class TestFusionLSTMOpMD1(TestFusionLSTMOp): + def set_conf(self): + self.M = 36 + self.D = 8 + + +class TestFusionLSTMOpMD2(TestFusionLSTMOp): + def set_conf(self): + self.M = 8 + self.D = 8 + + +class TestFusionLSTMOpMD3(TestFusionLSTMOp): + def set_conf(self): + self.M = 15 + self.D = 3 + + +class TestFusionLSTMOpBS1(TestFusionLSTMOp): + def set_conf(self): + self.lod = [[3]] + self.D = 16 + + +class TestFusionLSTMOpPeepholes(TestFusionLSTMOp): + def set_conf(self): + self.use_peepholes = True + + +class TestFusionLSTMOpPeepholesInit(TestFusionLSTMOp): + def set_conf(self): + self.use_peepholes = True + self.has_initial_state = True + + +class TestFusionLSTMOpPeepholesReverse(TestFusionLSTMOp): + def set_conf(self): + self.use_peepholes = True + self.is_reverse = True + + +class TestFusionLSTMOpPeepholesInitReverse(TestFusionLSTMOp): + def set_conf(self): + self.use_peepholes = True + self.has_initial_state = True + self.is_reverse = True + + +class TestFusionLSTMOpPeepholesBS1(TestFusionLSTMOp): + def set_conf(self): + self.use_peepholes = True + self.lod = [[2]] + self.D = 8 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py b/python/paddle/fluid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py new file mode 100644 index 0000000000..ba6f1415b1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py @@ -0,0 +1,94 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import random +from op_test import OpTest +from test_seq_conv import seqconv + + +class TestSeqConvEltAddRelu(OpTest): + def set_conf(self): + pass + + def setUp(self): + self.op_type = 'fusion_seqconv_eltadd_relu' + self.lod = [[6, 4]] + self.in_fea_size = 16 + self.out_fea_size = 8 + self.context_length = 4 + self.context_stride = 1 + self.context_start = 0 + self.set_conf() + + assert self.context_stride == 1 + + T = sum(self.lod[0]) + x = np.random.uniform(-1, 1, [T, self.in_fea_size]).astype('float32') + w = np.random.uniform( + -1, 1, [self.in_fea_size * self.context_length, + self.out_fea_size]).astype('float32') + b = np.random.uniform(-2, 1, [1, self.out_fea_size]).astype('float32') + out = seqconv(x, self.lod, w, self.context_length, self.context_start) + out = np.maximum(out + b, 0) + + self.inputs = {'X': (x, self.lod), 'Filter': w, 'Bias': b} + self.attrs = { + 'contextStart': self.context_start, + 'contextLength': self.context_length, + 'contextStride': self.context_stride + } + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + +class TestSeqConvEltAddReluBS1(TestSeqConvEltAddRelu): + def set_conf(self): + self.lod = [[10]] + + +class TestSeqConvEltAddReluBS1Case2(TestSeqConvEltAddRelu): + def set_conf(self): + self.lod = [[2]] + + +class TestSeqConvEltAddReluCase1(TestSeqConvEltAddRelu): + 
def set_conf(self): + self.lod = [[3, 5, 1, 6]] + self.context_length = 3 + self.context_start = -2 + + +class TestSeqConvEltAddReluCase2(TestSeqConvEltAddRelu): + def set_conf(self): + self.lod = [[10, 1, 2, 4, 1, 5, 6]] + self.in_fea_size = 2 + self.context_length = 4 + self.context_start = -1 + + +class TestSeqConvEltAddReluCase3(TestSeqConvEltAddRelu): + def set_conf(self): + self.lod = [[10, 1, 2, 4, 1, 5, 6]] + self.context_length = 5 + self.context_start = -4 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 1d8d0b55f0..50de468dba 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -194,6 +194,14 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(layers.sequence_expand(x=x, y=y, ref_level=1)) print(str(program)) + def test_sequence_unpad(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[10, 5], dtype='float32') + length = layers.data(name='length', shape=[1], dtype='int64') + self.assertIsNotNone(layers.sequence_unpad(x=x, length=length)) + print(str(program)) + def test_lstm_unit(self): program = Program() with program_guard(program): @@ -406,6 +414,19 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(out) print(str(program)) + def test_sequence_slice(self): + program = Program() + with program_guard(program): + import numpy as np + seqs = layers.data( + name='x', shape=[10, 5], dtype='float32', lod_level=1) + offset = layers.assign(input=np.array([[0, 1]]).astype('int32')) + length = layers.assign(input=np.array([[2, 1]]).astype('int32')) + out = layers.sequence_slice( + input=seqs, offset=offset, length=length) + self.assertIsNotNone(out) + print(str(program)) + def test_lod_reset(self): program = Program() with program_guard(program): @@ -444,6 +465,16 @@ class TestBook(unittest.TestCase): 
self.assertIsNotNone(output) print(str(program)) + def test_roi_align(self): + program = Program() + with program_guard(program): + x = layers.data(name="x", shape=[256, 30, 30], dtype="float32") + rois = layers.data( + name="rois", shape=[4], dtype="float32", lod_level=1) + output = layers.roi_align(x, rois, 14, 14, 0.5, 2) + self.assertIsNotNone(output) + print(str(program)) + def test_resize_bilinear(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index 7137fd0fdb..a3d89610b4 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py @@ -16,6 +16,8 @@ from __future__ import print_function import unittest import numpy as np +import paddle.fluid.core as core +from paddle.fluid.op import Operator from op_test import OpTest @@ -88,5 +90,97 @@ class TestMomentumOp2(OpTest): self.check_output() +class TestSparseMomentumOp(unittest.TestCase): + def setUp(self): + self.use_nesterov = False + + def check_with_place(self, place): + self.init_kernel() + scope = core.Scope() + # create and initialize Grad Variable + height = 10 + rows = [0, 4, 7] + row_numel = 12 + mu = 1.0 + use_nesterov = self.use_nesterov + + # create and initialize Param Variable + param = scope.var('Param').get_tensor() + param_array = np.full((height, row_numel), 5.0).astype("float32") + param.set(param_array, place) + param_out = scope.var("ParamOut").get_tensor() + param_out_array = np.full((height, row_numel), 0.0).astype("float32") + param_out.set(param_out_array, place) + + grad_selected_rows = scope.var('Grad').get_selected_rows() + grad_selected_rows.set_height(height) + grad_selected_rows.set_rows(rows) + grad_np_array = np.ones((len(rows), row_numel)).astype("float32") + grad_np_array[0, 0] = 2.0 + grad_np_array[2, 8] = 4.0 + grad_tensor = grad_selected_rows.get_tensor() + 
grad_tensor.set(grad_np_array, place) + + velocity = scope.var('Velocity').get_tensor() + velocity_np_array = np.ones((height, row_numel)).astype("float32") + velocity.set(velocity_np_array, place) + velocity_out = scope.var('VelocityOut').get_tensor() + velocity_out_np_array = np.full((height, row_numel), + 0.0).astype("float32") + velocity_out.set(velocity_out_np_array, place) + + # create and initialize LearningRate Variable + lr = scope.var('LearningRate').get_tensor() + lr_array = np.full((1), 2.0).astype("float32") + lr.set(lr_array, place) + + # create and run operator + op = Operator( + "momentum", + Param='Param', + Grad='Grad', + Velocity='Velocity', + ParamOut='ParamOut', + VelocityOut='VelocityOut', + LearningRate='LearningRate', + mu=mu, + use_nesterov=use_nesterov) + op.run(scope, place) + + # get and compare result + param_out_np_array = np.array(param_out) + velocity_out_np_array = np.array(velocity_out) + + # TODO(dzh): add a more suitable general numpy interface + # for sparse update. 
+ _grad_np_array = np.full((height, row_numel), 0.0).astype("float32") + for i in range(len(rows)): + _grad_np_array[rows[i]] = grad_np_array[i] + _velocity_out = mu * velocity_np_array + _grad_np_array + _param = param_array + if use_nesterov: + _param_out = _param - (_grad_np_array + _velocity_out * mu + ) * lr_array + else: + _param_out = _param - lr_array * _velocity_out + self.assertTrue((_velocity_out == velocity_out_np_array).all()) + self.assertTrue((_param_out == param_out_np_array).all()) + + def init_kernel(self): + pass + + def test_sparse_momentum(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + for place in places: + self.check_with_place(place) + + +class TestSparseMomentumOp2(TestSparseMomentumOp): + def init_kernel(self): + self.use_nesterov = True + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py index dfedf8190f..7f266056a9 100644 --- a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py +++ b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py @@ -37,7 +37,7 @@ def PolygonBoxRestore(input): indexes = indexes.repeat( [batch_size], axis=0) # [batch_size, geo_channels/2, 2, h, w] return indexes.reshape( - input.shape) - input # [batch_size, geo_channels, h, w] + input.shape) * 4 - input # [batch_size, geo_channels, h, w] class TestPolygonBoxRestoreOp(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py index 70848e4e22..eb12bc7417 100644 --- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py +++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py @@ -19,33 +19,76 @@ import unittest import numpy as np import paddle.fluid.core as core from paddle.fluid.op import Operator +import paddle.fluid as fluid + + +def 
create_selected_rows_and_tensor(scope, place, height, row_num, + embedding_size): + sr = scope.var("@selected_rows@").get_selected_rows() + tensor = scope.var("grad").get_tensor() + + rows = np.random.random_integers( + low=0, high=height - 1, size=[row_num, ]).astype('int64') + sr_val = np.random.random(size=[row_num, embedding_size]).astype('float32') + + sr.set_height(height) + sr.set_rows(rows) + sr.get_tensor().set(sr_val, place) + + tensor_val = np.zeros(shape=[height, embedding_size], dtype='float32') + for i in range(row_num): + row = rows[i] + tensor_val[row, :] = tensor_val[row, :] + sr_val[i, :] + + tensor.set(tensor_val, place) + return tensor_val, sr_val class TestBase(unittest.TestCase): - def setup(self, centered, epsilon=1e-6): + def setup(self, + place, + is_sparse, + centered, + size, + row_num=None, + epsilon=1e-6): np.random.seed(5) # fix seed + self.scope = fluid.global_scope() + self.place = place + self.param_name = "param" - self.param = np.random.random((123, 321)).astype("float32") + self.param = np.random.random(size).astype("float32") self.mean_square_name = "mean_square" - self.mean_square = np.random.random((123, 321)).astype("float32") + self.mean_square = np.random.uniform( + low=1, high=2, size=size).astype("float32") self.mean_grad_name = "mean_grad" - self.mean_grad = np.random.random((123, 321)).astype("float32") + self.mean_grad = np.random.random(size).astype("float32") self.lr_name = "lr" self.learning_rate = np.array([0.01]).astype("float32") self.grad_name = "grad" - self.grad = np.random.random((123, 321)).astype("float32") + + self.is_sparse = is_sparse + if self.is_sparse: + self.grad_sr_name = "@selected_rows@" + self.grad, self.grad_sr = create_selected_rows_and_tensor( + self.scope, place, size[0], row_num, size[1]) + else: + self.grad = np.random.random(size).astype("float32") + grad_tensor = self.scope.var(self.grad_name).get_tensor() + grad_tensor.set(self.grad, place) self.moment_name = "moment" - self.moment = 
np.zeros((123, 321)).astype("float32") + self.moment = np.random.uniform( + low=0, high=1, size=size).astype("float32") self.epsilon = epsilon self.decay = 0.9 - self.momentum = 0.0 + self.momentum = 0.1 self.centered = centered self.ms_out = self.decay * self.mean_square + (1 - self.decay @@ -61,118 +104,122 @@ class TestBase(unittest.TestCase): self.param_out = self.param - self.moment_out - def check(self, - actual_t, - expect_t, - place, - out_name, - atol=1e-5, - equal_nan=False): - self.assertTrue( - np.allclose( - actual_t, expect_t, atol=atol, equal_nan=equal_nan), - "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " - + str(expect_t) + "\n" + "But Got" + str(actual_t)) - - -class TestRmspropOp(TestBase): - def check_with_place(self, place, centered, epsilon): - self.setup(centered, epsilon) - scope = core.Scope() - # create and initialize Param Variable - param = scope.var(self.param_name).get_tensor() - param.set(self.param, place) + self.param_tensor = self.scope.var(self.param_name).get_tensor() + self.param_tensor.set(self.param, place) - mean_square = scope.var(self.mean_square_name).get_tensor() - mean_square.set(self.mean_square, place) + self.mean_square_tensor = self.scope.var( + self.mean_square_name).get_tensor() + self.mean_square_tensor.set(self.mean_square, place) - lr = scope.var(self.lr_name).get_tensor() + lr = self.scope.var(self.lr_name).get_tensor() lr.set(self.learning_rate, place) - grad = scope.var(self.grad_name).get_tensor() - grad.set(self.grad, place) + self.moment_tensor = self.scope.var(self.moment_name).get_tensor() + self.moment_tensor.set(self.moment, place) - moment = scope.var(self.moment_name).get_tensor() - moment.set(self.moment, place) + if self.centered: + self.mean_grad_tensor = self.scope.var( + self.mean_grad_name).get_tensor() + self.mean_grad_tensor.set(self.mean_grad, place) - # create and run sgd operator + def check(self, actual_t, expect_t, place, out_name, atol=1e-5): + self.assertTrue( + 
np.allclose( + actual_t, expect_t, atol=atol), + "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + + str(expect_t) + "\n" + "But Got" + str(actual_t)) - if self.centered: - mean_grad = scope.var(self.mean_grad_name).get_tensor() - mean_grad.set(self.mean_grad, place) - - rmsprop_op = Operator( - "rmsprop", - Param=self.param_name, - Grad=self.grad_name, - MeanSquare=self.mean_square_name, - MeanGrad=self.mean_grad_name, - Moment=self.moment_name, - LearningRate=self.lr_name, - ParamOut=self.param_name, - MeanSquareOut=self.mean_square_name, - MomentOut=self.moment_name, - MeanGradOut=self.mean_grad_name, - epsilon=self.epsilon, - decay=self.decay, - momentum=self.momentum, - centered=True) - else: - rmsprop_op = Operator( - "rmsprop", - Param=self.param_name, - Grad=self.grad_name, - MeanSquare=self.mean_square_name, - Moment=self.moment_name, - LearningRate=self.lr_name, - ParamOut=self.param_name, - MeanSquareOut=self.mean_square_name, - MomentOut=self.moment_name, - epsilon=self.epsilon, - decay=self.decay, - momentum=self.momentum, - centered=False) - - rmsprop_op.run(scope, place) - - atol = 1e-5 - equal_nan = False + +class TestRmspropOp(TestBase): + def check_with_place(self, + place, + is_sparse, + centered, + size, + row_num=None, + epsilon=1e-6): + self.setup(place, is_sparse, centered, size, row_num, epsilon) + self.run_and_check() + + def run_and_check(self): + grad_name = self.grad_sr_name if self.is_sparse else self.grad_name + + kwargs = { + 'Param': self.param_name, + 'Grad': grad_name, + 'MeanSquare': self.mean_square_name, + 'Moment': self.moment_name, + 'LearningRate': self.lr_name, + 'ParamOut': self.param_name, + 'MeanSquareOut': self.mean_square_name, + 'MomentOut': self.moment_name, + 'epsilon': self.epsilon, + 'decay': self.decay, + 'momentum': self.momentum, + 'centered': self.centered + } if self.centered: - atol = 1e-3 - equal_nan = True + kwargs['MeanGrad'] = self.mean_grad_name + kwargs['MeanGradOut'] = 
self.mean_grad_name + + rmsprop_op = Operator('rmsprop', **kwargs) + atol = 1e-6 + + rmsprop_op.run(self.scope, self.place) self.check( - np.array(mean_square), self.ms_out, place, self.mean_square_name) + np.array(self.mean_square_tensor), + self.ms_out, + self.place, + self.mean_square_name, + atol=atol) self.check( - np.array(moment), + np.array(self.moment_tensor), self.moment_out, - place, + self.place, self.moment_name, - atol=atol, - equal_nan=equal_nan) + atol=atol) self.check( - np.array(param), + np.array(self.param_tensor), self.param_out, - place, + self.place, self.param_name, - atol=atol, - equal_nan=equal_nan) + atol=atol) if self.centered: self.check( - np.array(mean_grad), self.mg_out, place, self.mean_grad_name) + np.array(self.mean_grad_tensor), self.mg_out, self.place, + self.mean_grad_name) def test_rmsprop(self): places = [core.CPUPlace()] if core.is_compiled_with_cuda(): places.append(core.CUDAPlace(0)) + + size = (128, 320) for place in places: - self.check_with_place(place, False, 1e-6) - self.check_with_place(place, False, 1e-10) - self.check_with_place(place, True, 1e-6) - self.check_with_place(place, True, 1e-10) + for centered in [False, True]: + with fluid.scope_guard(core.Scope()): + self.check_with_place( + place, is_sparse=False, centered=centered, size=size) + + with fluid.scope_guard(core.Scope()): + self.check_with_place( + place, + is_sparse=True, + centered=centered, + row_num=512, + size=size) + + with fluid.scope_guard(core.Scope()): + self.check_with_place( + place, + is_sparse=True, + centered=centered, + row_num=60, + size=size) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_roi_align_op.py b/python/paddle/fluid/tests/unittests/test_roi_align_op.py new file mode 100644 index 0000000000..1a252ea547 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_roi_align_op.py @@ -0,0 +1,170 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import math +import sys +from op_test import OpTest + + +class TestROIAlignOp(OpTest): + def set_data(self): + self.init_test_case() + self.make_rois() + self.calc_roi_align() + self.inputs = {'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod)} + self.attrs = { + 'spatial_scale': self.spatial_scale, + 'pooled_height': self.pooled_height, + 'pooled_width': self.pooled_width, + 'sampling_ratio': self.sampling_ratio + } + + self.outputs = {'Out': self.out_data} + + def init_test_case(self): + self.batch_size = 3 + self.channels = 3 + self.height = 8 + self.width = 6 + + # n, c, h, w + self.x_dim = (self.batch_size, self.channels, self.height, self.width) + + self.spatial_scale = 1.0 / 2.0 + self.pooled_height = 2 + self.pooled_width = 2 + self.sampling_ratio = -1 + + self.x = np.random.random(self.x_dim).astype('float32') + + def pre_calc(self, x_i, roi_xmin, roi_ymin, roi_bin_grid_h, roi_bin_grid_w, + bin_size_h, bin_size_w): + count = roi_bin_grid_h * roi_bin_grid_w + bilinear_pos = np.zeros( + [self.channels, self.pooled_height, self.pooled_width, count, 4], + np.float32) + bilinear_w = np.zeros( + [self.pooled_height, self.pooled_width, count, 4], np.float32) + for ph in range(self.pooled_width): + for pw in range(self.pooled_height): + c = 0 + for iy in range(roi_bin_grid_h): + y = roi_ymin + ph * bin_size_h + (iy + 0.5) * \ + 
bin_size_h / roi_bin_grid_h + for ix in range(roi_bin_grid_w): + x = roi_xmin + pw * bin_size_w + (ix + 0.5) * \ + bin_size_w / roi_bin_grid_w + if y < -1.0 or y > self.height or \ + x < -1.0 or x > self.width: + continue + if y <= 0: + y = 0 + if x <= 0: + x = 0 + y_low = int(y) + x_low = int(x) + if y_low >= self.height - 1: + y = y_high = y_low = self.height - 1 + else: + y_high = y_low + 1 + if x_low >= self.width - 1: + x = x_high = x_low = self.width - 1 + else: + x_high = x_low + 1 + ly = y - y_low + lx = x - x_low + hy = 1 - ly + hx = 1 - lx + for ch in range(self.channels): + bilinear_pos[ch, ph, pw, c, 0] = x_i[ch, y_low, + x_low] + bilinear_pos[ch, ph, pw, c, 1] = x_i[ch, y_low, + x_high] + bilinear_pos[ch, ph, pw, c, 2] = x_i[ch, y_high, + x_low] + bilinear_pos[ch, ph, pw, c, 3] = x_i[ch, y_high, + x_high] + bilinear_w[ph, pw, c, 0] = hy * hx + bilinear_w[ph, pw, c, 1] = hy * lx + bilinear_w[ph, pw, c, 2] = ly * hx + bilinear_w[ph, pw, c, 3] = ly * lx + c = c + 1 + return bilinear_pos, bilinear_w + + def calc_roi_align(self): + self.out_data = np.zeros( + (self.rois_num, self.channels, self.pooled_height, + self.pooled_width)).astype('float32') + + for i in range(self.rois_num): + roi = self.rois[i] + roi_batch_id = int(roi[0]) + x_i = self.x[roi_batch_id] + roi_xmin = roi[1] * self.spatial_scale + roi_ymin = roi[2] * self.spatial_scale + roi_xmax = roi[3] * self.spatial_scale + roi_ymax = roi[4] * self.spatial_scale + roi_width = max(roi_xmax - roi_xmin, 1) + roi_height = max(roi_ymax - roi_ymin, 1) + bin_size_h = float(roi_height) / float(self.pooled_height) + bin_size_w = float(roi_width) / float(self.pooled_width) + roi_bin_grid_h = self.sampling_ratio if self.sampling_ratio > 0 else \ + math.ceil(roi_height / self.pooled_height) + roi_bin_grid_w = self.sampling_ratio if self.sampling_ratio > 0 else \ + math.ceil(roi_width / self.pooled_width) + count = int(roi_bin_grid_h * roi_bin_grid_w) + pre_size = count * self.pooled_width * self.pooled_height 
+ bilinear_pos, bilinear_w = self.pre_calc(x_i, roi_xmin, roi_ymin, + int(roi_bin_grid_h), + int(roi_bin_grid_w), + bin_size_h, bin_size_w) + for ch in range(self.channels): + align_per_bin = (bilinear_pos[ch] * bilinear_w).sum(axis=-1) + output_val = align_per_bin.mean(axis=-1) + self.out_data[i, ch, :, :] = output_val + + def make_rois(self): + rois = [] + self.rois_lod = [[]] + for bno in range(self.batch_size): + self.rois_lod[0].append(bno + 1) + for i in range(bno + 1): + x1 = np.random.random_integers( + 0, self.width // self.spatial_scale - self.pooled_width) + y1 = np.random.random_integers( + 0, self.height // self.spatial_scale - self.pooled_height) + + x2 = np.random.random_integers(x1 + self.pooled_width, + self.width // self.spatial_scale) + y2 = np.random.random_integers( + y1 + self.pooled_height, self.height // self.spatial_scale) + + roi = [bno, x1, y1, x2, y2] + rois.append(roi) + self.rois_num = len(rois) + self.rois = np.array(rois).astype("float32") + + def setUp(self): + self.op_type = "roi_align" + self.set_data() + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') diff --git a/python/paddle/fluid/tests/unittests/test_seq_conv.py b/python/paddle/fluid/tests/unittests/test_seq_conv.py index dcc86382e5..2285e94967 100644 --- a/python/paddle/fluid/tests/unittests/test_seq_conv.py +++ b/python/paddle/fluid/tests/unittests/test_seq_conv.py @@ -20,6 +20,53 @@ import random from op_test import OpTest +def seqconv(x, + lod, + filter, + context_length, + context_start, + padding_trainable=False, + padding_data=None): + [T, M] = x.shape + col = np.zeros((T, context_length * M)).astype('float32') + offset = [0] + for seq_len in lod[0]: + offset.append(offset[-1] + seq_len) + begin_pad = np.max([0, -context_start]) + for i in range(len(offset) - 1): + for j in range(context_length): + in_begin = offset[i] + context_start + j + in_end = offset[i + 1] + context_start + j + out_begin = 
offset[i] + out_end = offset[i + 1] + if in_begin < offset[i]: + pad_size = np.min( + [offset[i] - in_begin, offset[i + 1] - offset[i]]) + if padding_trainable: + sub_w = padding_data[j:j + pad_size, :] + col[offset[i]:offset[i] + pad_size, j * M:(j + 1) * + M] = sub_w + out_begin = offset[i] + pad_size + in_begin = offset[i] + + if in_end > offset[i + 1]: + pad_size = np.min( + [in_end - offset[i + 1], offset[i + 1] - offset[i]]) + if padding_trainable: + sub_w = padding_data[begin_pad + context_start + j - + pad_size:begin_pad + context_start + + j, :] + col[offset[i + 1] - pad_size:offset[i + 1], j * M:(j + 1) * + M] = sub_w + in_end = offset[i + 1] + out_end = offset[i + 1] - pad_size + if in_end <= in_begin: + continue + in_sub = x[in_begin:in_end, :] + col[out_begin:out_end, j * M:(j + 1) * M] += in_sub + return np.dot(col, filter) + + class TestSeqProject(OpTest): def setUp(self): self.init_test_case() @@ -66,57 +113,9 @@ class TestSeqProject(OpTest): 'paddingTrainable': self.padding_trainable, 'contextStride': self.context_stride } - out = np.zeros( - (self.input_size[0], self.output_represention)).astype('float32') + out = seqconv(x, self.lod, w, self.context_length, self.context_start, + self.padding_trainable, self.pad_data) self.outputs = {'Out': out} - self.compute() - - def compute(self): - x, lod = self.inputs['X'] - filter = self.inputs['Filter'] - pading_data = self.pad_data - out = np.zeros((self.input_size[0], self.context_length * - self.input_size[1])).astype('float32') - offset = [0] - for seq_len in lod[0]: - offset.append(offset[-1] + seq_len) - begin_pad = np.max([0, -self.context_start]) - - for i in range(len(offset) - 1): - for j in range(self.context_length): - in_begin = offset[i] + self.context_start + j - in_end = offset[i + 1] + self.context_start + j - out_begin = offset[i] - out_end = offset[i + 1] - if in_begin < offset[i]: - pad_size = np.min( - [offset[i] - in_begin, offset[i + 1] - offset[i]]) - if self.padding_trainable: - 
sub_w = pading_data[j:j + pad_size, :] - out[offset[i]:offset[i] + pad_size, j * self.input_size[ - 1]:(j + 1) * self.input_size[1]] = sub_w - out_begin = offset[i] + pad_size - in_begin = offset[i] - - if in_end > offset[i + 1]: - pad_size = np.min( - [in_end - offset[i + 1], offset[i + 1] - offset[i]]) - if self.padding_trainable: - sub_w = pading_data[begin_pad + self.context_start + j - - pad_size:begin_pad + - self.context_start + j, :] - out[offset[i + 1] - pad_size:offset[i + 1], j * self. - input_size[1]:(j + 1) * self.input_size[1]] = sub_w - in_end = offset[i + 1] - out_end = offset[i + 1] - pad_size - if in_end <= in_begin: - continue - - in_sub = x[in_begin:in_end, :] - out[out_begin:out_end, j * self.input_size[1]:(j + 1) * - self.input_size[1]] += in_sub - - np.dot(out, filter, out=self.outputs['Out']) def test_check_output(self): self.check_output() diff --git a/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py b/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py new file mode 100644 index 0000000000..673b0ea180 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py @@ -0,0 +1,75 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import six +import numpy as np +from op_test import OpTest + + +class TestSequenceUnpadOp(OpTest): + def init(self): + self.length = [2, 3, 4] + self.x_shape = (3, 5) + self.dtype = "float32" + + def compute(self): + assert len(self.length) == self.x_shape[0] + x = np.random.random(self.x_shape).astype(self.dtype) + out_lod = [self.length] + + out = x[0, 0:self.length[0]] + for i in six.moves.xrange(1, x.shape[0]): + out = np.append(out, x[i, 0:self.length[i]], axis=0) + + out_shape = (sum(self.length), ) + if len(self.x_shape) == 2: + out_shape = out_shape + (1, ) + else: + out_shape = out_shape + self.x_shape[2:] + + self.inputs = { + 'X': x, + 'Length': np.array(self.length).astype('int64').reshape(-1, 1) + } + self.outputs = {'Out': (out.reshape(out_shape), out_lod)} + + def setUp(self): + self.op_type = 'sequence_unpad' + self.init() + self.compute() + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestSequenceUnpadOp2(TestSequenceUnpadOp): + def init(self): + self.length = [2, 3, 4] + self.x_shape = (3, 5, 4, 3) + self.dtype = "float32" + + +class TestSequenceUnpadOp3(TestSequenceUnpadOp): + def init(self): + self.length = [5, 2, 3, 4] + self.x_shape = (4, 5, 3, 3, 6) + self.dtype = "float64" + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_slice_var.py b/python/paddle/fluid/tests/unittests/test_slice_var.py index fab63b7d56..b16c744603 100644 --- a/python/paddle/fluid/tests/unittests/test_slice_var.py +++ b/python/paddle/fluid/tests/unittests/test_slice_var.py @@ -30,7 +30,6 @@ class TestSliceVar(unittest.TestCase): var = program.global_block().create_var( name=str(random.randint(10000, 99999)), persistable=True, - # dtype=core.VarDesc.VarType.LOD_TENSOR, shape=shape) var_list.append(var) blocks = slice_variable(var_list, 10, min_size) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py 
b/python/paddle/fluid/transpiler/distribute_transpiler.py index ecdbe27f4d..2192139f8d 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -788,7 +788,8 @@ in a single call.") tuple: (main_program, startup_program), of type "Program" """ pserver_prog = self.get_pserver_program(endpoint) - pserver_startup = self.get_startup_program(endpoint) + pserver_startup = self.get_startup_program( + endpoint, pserver_program=pserver_prog) return pserver_prog, pserver_startup def get_startup_program(self, @@ -1118,6 +1119,7 @@ to transpile() call.") def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints): # 2. add split_ids_op and send_op to send gradient to pservers + # there should only be one table_name all_ops = program.global_block().ops table_grad_name = grad_var_name(self.table_name) @@ -1142,7 +1144,7 @@ to transpile() call.") if self.sync_mode else [] }, attrs={ - "sync_mode": self.sync_mode, + "sync_mode": not self.sync_mode, "epmap": pserver_endpoints, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, OP_ROLE_VAR_ATTR_NAME: [ @@ -1188,7 +1190,15 @@ to transpile() call.") def _create_table_optimize_block(self, pserver_index, pserver_program, pre_block_idx, grad_to_block_id): # STEP: create table optimize block + table_opt_block = pserver_program._create_block(pre_block_idx) # create table param and grad var in pserver program + # create table optimize block in pserver program + table_opt_op = [ + op for op in self.optimize_ops + if 'Param' in op.input_names and op.input("Param")[0] == + self.table_name + ][0] + origin_param_var = self.origin_program.global_block().vars[ self.table_name] @@ -1204,19 +1214,16 @@ to transpile() call.") dtype=origin_param_var.dtype, type=core.VarDesc.VarType.SELECTED_ROWS, persistable=True) + # parameter must be selected rows param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS) grad_var = pserver_program.global_block()._clone_variable( 
self.origin_program.global_block().vars[grad_var_name( self.table_name)]) - # create table optimize block in pserver program - table_opt_op = [ - op for op in self.optimize_ops - if 'Param' in op.input_names and op.input("Param")[0] == - self.table_name - ][0] - table_opt_block = pserver_program._create_block(pre_block_idx) + lr_var = pserver_program.global_block()._clone_variable( + self.origin_program.global_block().vars[table_opt_op.input( + "LearningRate")[0]]) if self.sync_mode: # create grad vars in pserver program @@ -1248,8 +1255,6 @@ to transpile() call.") grad_var = pserver_program.global_block()._rename_var( origin_grad_name, splited_grad_name) - lr_var = pserver_program.global_block().vars[table_opt_op.input( - "LearningRate")[0]] inputs = { "Param": [param_var], "Grad": [grad_var], diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index c402535b27..5269bd94ce 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -74,7 +74,7 @@ class InferenceTranspiler(object): ''' Transpile the program fusing elementwise_add into conv for MKLDNN program. Elementwise add following convolution OP can be fused by adding - 'fuse_eltwise' attribute to convolution OP and replacing its output + 'fuse_residual_connection' attribute to convolution OP and replacing its output Tensor with second parameter of elementwise_add. 
The result of fuse is: - before: @@ -92,7 +92,8 @@ class InferenceTranspiler(object): if current_op.type in ['conv2d']: next_op = self.block.ops[i + 1] if next_op.type == 'elementwise_add': - self._fuse_conv_eltwise(current_op, next_op) + self._fuse_conv_eltwise(i, current_op, next_op) + self.block._remove_op(i + 1) # Remove old conv self.block._remove_op(i + 1) # Remove elementwise_add i = i + 1 self._adjust_input() @@ -444,7 +445,7 @@ class InferenceTranspiler(object): outputs={"Output": out_var}, attrs=attrs) - def _fuse_conv_eltwise(self, conv_op, eltwise_op): + def _fuse_conv_eltwise(self, index, conv_op, eltwise_op): ''' fuse the conv op with elementwise_add @@ -454,9 +455,30 @@ class InferenceTranspiler(object): :type eltwise_op: Operator ''' - conv_op._set_attr("fuse_eltwise", True) - self.input_map[conv_op.output("Output")[0]] = eltwise_op.input("Y")[0] - self.input_map[eltwise_op.output("Out")[0]] = eltwise_op.input("Y")[0] + eltwise_input = "X" + if eltwise_op.input("X")[0] == conv_op.output("Output")[0]: + eltwise_input = "Y" + + residual_var = self.block.vars[eltwise_op.input(eltwise_input)[0]] + out_var = self.block.vars[eltwise_op.output("Out")[0]] + filter_var = self.block.vars[conv_op.input("Filter")[0]] + in_var = self.block.vars[conv_op.input("Input")[0]] + bias_var = self.block.vars[conv_op.input("Bias")[0]] + + conv_op._set_attr("fuse_residual_connection", True) + attrs = {name: conv_op.attr(name) for name in conv_op.attr_names} + + self.block._insert_op( + index, + type="conv2d", + inputs={ + "Input": in_var, + "Filter": filter_var, + "Bias": bias_var, + "ResidualData": residual_var + }, + outputs={"Output": out_var}, + attrs=attrs) def _adjust_input(self): for i in range(len(self.block.ops)): diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py index 15595d2085..5de6f966a0 100644 --- a/python/paddle/utils/__init__.py +++ b/python/paddle/utils/__init__.py @@ -12,4 +12,5 @@ # See the License for the specific language 
governing permissions and # limitations under the License. -__all__ = ['dump_config'] +from plot import Ploter +__all__ = ['dump_config', 'Ploter'] diff --git a/python/paddle/utils/plot.py b/python/paddle/utils/plot.py new file mode 100644 index 0000000000..08889c0313 --- /dev/null +++ b/python/paddle/utils/plot.py @@ -0,0 +1,115 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + + +class PlotData(object): + def __init__(self): + self.step = [] + self.value = [] + + def append(self, step, value): + self.step.append(step) + self.value.append(value) + + def reset(self): + self.step = [] + self.value = [] + + +class Ploter(object): + """ + Plot input data in a 2D graph + + Args: + title: assign the title of input data. + step: x_axis of the data. + value: y_axis of the data. + """ + + def __init__(self, *args): + self.__args__ = args + self.__plot_data__ = {} + for title in args: + self.__plot_data__[title] = PlotData() + # demo in notebooks will use Ploter to plot figure, but when we convert + # the ipydb to py file for testing, the import of matplotlib will make the + # script crash. 
So we can use `export DISABLE_PLOT=True` to disable import + # these libs + self.__disable_plot__ = os.environ.get("DISABLE_PLOT") + if not self.__plot_is_disabled__(): + import matplotlib.pyplot as plt + from IPython import display + self.plt = plt + self.display = display + + def __plot_is_disabled__(self): + return self.__disable_plot__ == "True" + + def append(self, title, step, value): + """ + Feed data + + Args: + title: assign the group data to this subtitle. + step: the x_axis of data. + value: the y_axis of data. + + Examples: + .. code-block:: python + plot_curve = Ploter("Curve 1","Curve 2") + plot_curve.append(title="Curve 1",step=1,value=1) + """ + assert isinstance(title, basestring) + assert self.__plot_data__.has_key(title) + data = self.__plot_data__[title] + assert isinstance(data, PlotData) + data.append(step, value) + + def plot(self, path=None): + """ + Plot data in a 2D graph + + Args: + path: store the figure to this file path. Defaul None. + + Examples: + .. code-block:: python + plot_curve = Ploter() + plot_cure.plot() + """ + if self.__plot_is_disabled__(): + return + + titles = [] + for title in self.__args__: + data = self.__plot_data__[title] + assert isinstance(data, PlotData) + if len(data.step) > 0: + titles.append(title) + self.plt.plot(data.step, data.value) + self.plt.legend(titles, loc='upper left') + if path is None: + self.display.clear_output(wait=True) + self.display.display(self.plt.gcf()) + else: + self.plt.savefig(path) + self.plt.gcf().clear() + + def reset(self): + for key in self.__plot_data__: + data = self.__plot_data__[key] + assert isinstance(data, PlotData) + data.reset()