From 594dc4d8f0a74fe7640d22830cd221b91cbebbb5 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 10 Jan 2019 01:47:22 +0000 Subject: [PATCH 01/64] partial gc 1st version test=develop --- paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../details/eager_deletion_op_handle.cc | 14 +- .../framework/details/eager_deletion_pass.cc | 125 +++++++++++++++++- .../framework/details/reference_count_pass.cc | 9 -- .../details/reference_count_pass_helper.cc | 15 ++- .../details/reference_count_pass_helper.h | 8 +- python/paddle/fluid/__init__.py | 1 + 7 files changed, 158 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 179aa14528..cb34712975 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -54,7 +54,7 @@ cc_library(memory_optimize_pass SRCS analysis_var_pass.cc memory_reuse_types.cc cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass) -cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) +cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle proto_desc var_handle) cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper) diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 03fbfd7f24..58cdd65601 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/profiler.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -45,6 +46,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( } } #endif + PADDLE_ENFORCE(!var_names_.empty(), "Var names cannot be empty"); } EagerDeletionOpHandle::~EagerDeletionOpHandle() { @@ -60,7 +62,13 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } void EagerDeletionOpHandle::RunImpl() { - auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); +#ifdef PADDLE_WITH_CUDA + platform::RecordEvent record_event(Name(), dev_ctx_); +#else + platform::RecordEvent record_event(Name(), nullptr); +#endif + + Scope *exec_scope = nullptr; std::deque> garbages; for (auto &name : var_names_) { auto it = ref_cnts_->find(name); @@ -69,6 +77,10 @@ void EagerDeletionOpHandle::RunImpl() { continue; } + if (!exec_scope) { + exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); + } + auto *var = 
exec_scope->FindVar(name); if (var == nullptr) { continue; diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index 4e42d0b497..6c8cb66b10 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -12,8 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include #include #include +#include #include #include "paddle/fluid/framework/details/computation_op_handle.h" @@ -22,10 +25,120 @@ #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" +DEFINE_double(fraction_of_eager_deletion, 1.0, "Fraction of eager deletion"); +DEFINE_bool(eager_delete_tensor_only, false, ""); + namespace paddle { namespace framework { namespace details { +namespace { // NOLINT +using OpToVarNameSetMap = + std::unordered_map>; +} // NOLINT + +static bool IsLoDTensor(VarDesc *var) { + return var->Proto()->type().type() == proto::VarType::LOD_TENSOR; +} + +static int64_t GetNumel(const GraphVars &vars, const std::string &var_name, + size_t scope_idx) { + auto *var_desc = TryGetLatestVarDesc(vars[scope_idx].at(var_name)); + PADDLE_ENFORCE(IsLoDTensor(var_desc)); + auto dims = var_desc->GetShape(); + return std::accumulate(dims.begin(), dims.end(), static_cast(1), + std::multiplies()); +} + +static void SplitIntoLoDTensorAndNonLoDTensorVars( + const OpToVarNameSetMap &m, const GraphVars &vars, + OpToVarNameSetMap *lod_tensors, OpToVarNameSetMap *other_vars) { + lod_tensors->clear(); + other_vars->clear(); + + for (auto &op_vars_pair : m) { + for (auto &var_name : op_vars_pair.second) { + auto *var_desc = TryGetLatestVarDesc( + vars[op_vars_pair.first->GetScopeIdx()].at(var_name)); + if (IsLoDTensor(var_desc)) { + (*lod_tensors)[op_vars_pair.first].insert(var_name); + } else { + (*other_vars)[op_vars_pair.first].insert(var_name); + } + } + } +} + +static OpToVarNameSetMap ShrinkGCVars(const OpToVarNameSetMap &m, + const GraphVars &vars, + double fraction_of_memory_size, + bool delete_lod_tensor_only = false) { + // Do not perform gc + if (fraction_of_memory_size <= 0.0) return {}; + + // Perform complete gc + if (fraction_of_memory_size >= 1.0) { + if (delete_lod_tensor_only) { + OpToVarNameSetMap lod_tensors, other_vars; + SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars); + return lod_tensors; + } else { + return m; + } + } + + // Perform partial gc + OpToVarNameSetMap lod_tensors, other_vars; + SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars); + + using TupleType = std::tuple; + + std::unordered_map> place_to_vars; + std::unordered_map total_memory_size; + for (auto &op_vars_pair : lod_tensors) { + auto scope_idx = op_vars_pair.first->GetScopeIdx(); + int64_t size = 0; + for (auto &var_name : op_vars_pair.second) { + auto var_size = GetNumel(vars, var_name, scope_idx); + size += std::abs(var_size); + place_to_vars[scope_idx].emplace_back(var_name, op_vars_pair.first, + var_size); + } + total_memory_size.emplace(scope_idx, size); + } + + for (auto &pair : place_to_vars) { + std::sort(pair.second.begin(), pair.second.end(), + [](const TupleType &t1, const TupleType &t2) { + return std::abs(std::get<2>(t1)) > std::abs(std::get<2>(t2)); + }); + } + + OpToVarNameSetMap ret; + for (auto &pair : place_to_vars) { + auto desired_delete_size = static_cast( + fraction_of_memory_size * 
total_memory_size.at(pair.first)); + int64_t cur_size = 0; + for (size_t i = 0; i < pair.second.size() && cur_size < desired_delete_size; + ++i) { + auto &var_name = std::get<0>(pair.second[i]); + auto *op = std::get<1>(pair.second[i]); + cur_size += std::get<2>(pair.second[i]); + ret[op].insert(var_name); + } + } + + if (!delete_lod_tensor_only) { + for (auto &op_vars_pair : other_vars) { + for (auto &var_name : op_vars_pair.second) { + ret[op_vars_pair.first].insert(var_name); + } + } + } + + return ret; +} + std::unique_ptr EagerDeletionPass::ApplyImpl( std::unique_ptr graph) const { auto &ref_cnts = @@ -43,9 +156,7 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( // a reverse map of last_live_ops // i.e., last op --> variable names which can be deleted. - std::unordered_map> - op_vars_map; - + OpToVarNameSetMap op_vars_map; for (auto &var_ops_map : last_live_ops) { for (auto &var_ops_pair : var_ops_map) { const std::string &var_name = var_ops_pair.first; @@ -55,6 +166,10 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( } } + op_vars_map = + ShrinkGCVars(op_vars_map, vars, FLAGS_fraction_of_eager_deletion, + FLAGS_eager_delete_tensor_only); + for (auto &pair : op_vars_map) { auto *op = pair.first; auto &var_names = pair.second; @@ -85,6 +200,10 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( eager_deletion_op->AddOutput(dummy_leaf); } + VLOG(10) << "FLAGS_fraction_of_eager_deletion = " + << FLAGS_fraction_of_eager_deletion; + VLOG(10) << "FLAGS_eager_delete_tensor_only = " + << FLAGS_eager_delete_tensor_only; VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)"; return graph; } diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 13a042d8e6..892f638f1f 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -189,15 +189,6 @@ ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx, return shrink_func(computation_op); } -static VarDesc *TryGetLatestVarDesc(const std::vector &vars) { - VarDesc *var_desc = nullptr; - std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool { - var_desc = var_handle->Node()->Var(); - return var_desc != nullptr; - }); - return var_desc; -} - std::unique_ptr ReferenceCountPass::ApplyImpl( std::unique_ptr graph) const { auto &ref_cnts = Get>(kGlobalReferenceCount); diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.cc b/paddle/fluid/framework/details/reference_count_pass_helper.cc index 89bd08c2d0..94de0e6ab0 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.cc +++ b/paddle/fluid/framework/details/reference_count_pass_helper.cc @@ -13,9 +13,22 @@ // limitations under the License. 
#include "paddle/fluid/framework/details/reference_count_pass_helper.h" +#include "paddle/fluid/framework/details/var_handle.h" +#include "paddle/fluid/framework/var_desc.h" namespace paddle { namespace framework { -namespace details {} // namespace details +namespace details { + +VarDesc *TryGetLatestVarDesc(const std::vector &vars) { + VarDesc *var_desc = nullptr; + std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool { + var_desc = var_handle->Node()->Var(); + return var_desc != nullptr; + }); + return var_desc; +} + +} // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h index 1c083dbf00..d9e8776d7e 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.h +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -25,6 +25,10 @@ namespace paddle { namespace framework { + +class VarDesc; +class VarHandle; + namespace details { class ComputationOpHandle; @@ -43,9 +47,11 @@ const char kGarbageCollector[] = "garbage_collector"; const char kAllPlaces[] = "all_places"; using LastLiveOpsOfVars = - std::unordered_map>; + std::unordered_map>; const char kLastLiveOpsOfVars[] = "last_live_ops_of_var"; +VarDesc *TryGetLatestVarDesc(const std::vector &vars); + } // namespace details } // namespace framework } // namespace paddle diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index f9f3807b15..794f583037 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -127,6 +127,7 @@ def __bootstrap__(): 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'fast_eager_deletion_mode', + 'fraction_of_eager_deletion', 'eager_delete_tensor_only', 'allocator_strategy', 'reader_queue_speed_test_mode', 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir', 'enable_parallel_graph' From e5fa9c58cff2ea452f9be0169e013b62b8085f30 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Thu, 28 Feb 2019 17:52:23 +0800 Subject: [PATCH 02/64] add travis_ci check api.spec --- .travis.yml | 1 + paddle/scripts/paddle_build.sh | 72 ++++++++++++++++++++++------------ 2 files changed, 49 insertions(+), 24 deletions(-) diff --git a/.travis.yml b/.travis.yml index 87de895dda..3766317166 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,6 +12,7 @@ os: - linux env: - JOB=check_style + - JOB=check_api addons: ssh_known_hosts: 13.229.163.131 before_install: diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 1135caf4f8..c322f3970f 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -87,7 +87,7 @@ function cmake_gen() { PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/python3 -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/ -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib" - WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + pip3.5 uninstall -y protobuf pip3.5 install --user -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 @@ -100,7 +100,7 @@ function cmake_gen() { PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3 
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/ -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib" - WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + pip3.6 uninstall -y protobuf pip3.6 install --user -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 @@ -113,7 +113,7 @@ function cmake_gen() { PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3 -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/ -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib" - WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + pip3.7 uninstall -y protobuf pip3.7 install --user -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 @@ -128,31 +128,44 @@ function cmake_gen() { PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7 -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so" + pip uninstall -y protobuf + pip install -r ${PADDLE_ROOT}/python/requirements.txt elif [ "$1" == "cp27-cp27mu" ]; then export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:} export PATH=/opt/python/cp27-cp27mu/bin/:${PATH} PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7 -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so" + pip uninstall -y protobuf + pip install -r ${PADDLE_ROOT}/python/requirements.txt elif [ "$1" == "cp35-cp35m" ]; then export LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} export PATH=/opt/_internal/cpython-3.5.1/bin/:${PATH} export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3 -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so" + pip3.5 uninstall -y protobuf + pip3.5 install -r ${PADDLE_ROOT}/python/requirements.txt elif [ "$1" == "cp36-cp36m" ]; then export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} export PATH=/opt/_internal/cpython-3.6.0/bin/:${PATH} export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.6.0/bin/python3 -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.6.0/include/python3.6m -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.6.0/lib/libpython3.so" + pip3.6 uninstall -y protobuf + pip3.6 install -r ${PADDLE_ROOT}/python/requirements.txt elif [ "$1" == "cp37-cp37m" ]; then export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH} export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3.7 -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so" + pip3.7 uninstall -y protobuf + pip3.7 install -r ${PADDLE_ROOT}/python/requirements.txt fi + else + pip uninstall -y protobuf + pip install -r ${PADDLE_ROOT}/python/requirements.txt fi fi @@ -186,7 +199,6 @@ function cmake_gen() { -DWITH_TESTING=${WITH_TESTING:-ON} -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON - -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} -DCMAKE_EXPORT_COMPILE_COMMANDS=ON 
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} @@ -219,7 +231,6 @@ EOF -DCUDNN_ROOT=/usr/ \ -DWITH_TESTING=${WITH_TESTING:-ON} \ -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ - -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \ @@ -248,6 +259,7 @@ function check_style() { eval "$(GIMME_GO_VERSION=1.8.3 gimme)" fi + pip install cpplint # set up go environment for running gometalinter mkdir -p $GOPATH/src/github.com/PaddlePaddle/ ln -sf ${PADDLE_ROOT} $GOPATH/src/github.com/PaddlePaddle/Paddle @@ -382,9 +394,7 @@ EOF pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl fi - if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then - paddle version - fi + paddle version if [ "$1" == "cp27-cp27m" ]; then pip uninstall -y paddlepaddle @@ -422,8 +432,8 @@ function assert_api_spec_approvals() { BRANCH="develop" fi - API_FILES=("cmake/external" - "paddle/fluid/API.spec" + API_FILES=("paddle/fluid/API.spec" + "python/paddle/fluid/parallel_executor.py" "paddle/fluid/framework/operator.h" "paddle/fluid/framework/tensor.h" "paddle/fluid/framework/lod_tensor.h" @@ -435,6 +445,7 @@ function assert_api_spec_approvals() { "paddle/fluid/framework/ir/node.h" "paddle/fluid/framework/ir/graph.h" "paddle/fluid/framework/framework.proto" + "python/paddle/fluid/compiler.py" "paddle/fluid/operators/distributed/send_recv.proto.in") for API_FILE in ${API_FILES[*]}; do API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" || true` @@ -539,7 +550,6 @@ EOF -DCMAKE_BUILD_TYPE=Release \ -DWITH_GPU=OFF \ -DWITH_MKL=OFF \ - -DWITH_FLUID_ONLY=ON local LIB_TYPE=$1 case $LIB_TYPE in @@ -615,13 +625,8 @@ EOF NCCL_DEPS="true" fi - if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then - PADDLE_VERSION="paddle version" - CMD='"paddle", "version"' - else - PADDLE_VERSION="true" - CMD='"true"' - fi + PADDLE_VERSION="paddle version" + CMD='"paddle", "version"' if [ "$1" == "cp35-cp35m" ]; then cat >> ${PADDLE_ROOT}/build/Dockerfile <> ${PADDLE_ROOT}/build/Dockerfile <> ${PADDLE_ROOT}/build/Dockerfile < Date: Thu, 28 Feb 2019 18:05:17 +0800 Subject: [PATCH 03/64] update paddle_build --- paddle/scripts/paddle_build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index c322f3970f..74d7250471 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -862,6 +862,7 @@ function main() { ;; test_fluid_lib) test_fluid_lib + ;; travis_check_api) traivs_check_api traivs_check_api_py35 From ffbd83947c23cda36a5371d466f3f8a4ea87ac78 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Thu, 28 Feb 2019 19:03:23 +0800 Subject: [PATCH 04/64] update build --- paddle/scripts/paddle_build.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 74d7250471..260a9e4061 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -756,7 +756,9 @@ EOF ./clean.sh } -function traivs_check_api() { +function travis_check_api() { + mkdir -p ${PADDLE_ROOT}/build + cd ${PADDLE_ROOT}/build cmake .. 
\ -DCMAKE_BUILD_TYPE=Release \ -DWITH_GPU=OFF \ @@ -767,7 +769,9 @@ function traivs_check_api() { pip uninstall paddlepaddle } -function traivs_check_api_py35() { +function travis_check_api_py35() { + mkdir -p ${PADDLE_ROOT}/build + cd ${PADDLE_ROOT}/build cmake .. \ -DPY_VERSION=3.5 \ -DCMAKE_BUILD_TYPE=Release \ @@ -863,7 +867,7 @@ function main() { test_fluid_lib) test_fluid_lib ;; - travis_check_api) + check_api) traivs_check_api traivs_check_api_py35 ;; From 10439e8afc28869d470e1326bf2598f086d30c82 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Thu, 28 Feb 2019 21:55:38 +0800 Subject: [PATCH 05/64] update paddle_build check_api --- paddle/scripts/paddle_build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 260a9e4061..2512c810f4 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -868,8 +868,8 @@ function main() { test_fluid_lib ;; check_api) - traivs_check_api - traivs_check_api_py35 + travis_check_api + travis_check_api_py35 ;; *) print_usage From 545247d7b4e803a2067c0187b2c3c962ec22629d Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Mon, 4 Mar 2019 17:59:31 +0800 Subject: [PATCH 06/64] add channel wise quantize op. --- paddle/fluid/operators/fake_quantize_op.cc | 62 +++++++++++++++++++ paddle/fluid/operators/fake_quantize_op.cu | 2 + paddle/fluid/operators/fake_quantize_op.h | 33 ++++++++++ .../tests/unittests/test_fake_quantize_op.py | 24 +++++++ 4 files changed, 121 insertions(+) diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index 3bb07d3835..c873ee6718 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -134,6 +134,61 @@ $$Out = round(X/scale * range)$$ } }; +class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of FakeChannelWiseQuantizeOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FakeChannelWiseQuantizeOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("OutScales"), + "Output(Scales) of FakeChannelWiseQuantizeOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->SetOutputDim("OutScales", {ctx->GetInputDim("X")[0]}); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); + } +}; + +class FakeChannelWiseQuantizeAbsMaxOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) Input is float data type."); + AddOutput("Out", + "(Tensor) Output of quantized low level tensor, " + "but also saved as float data type."); + AddOutput("OutScales", "(Tensor) Current channel wise scale"); + AddAttr("bit_length", "(int, default 8)") + .SetDefault(8) + .AddCustomChecker([](const int& bit_length) { + PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, + "'bit_length' should be between 1 and 16."); + }); + AddComment(R"DOC( +The scale of FakeChannelWiseQuantize operator is a vector. +In detail, each channel of the input X has a scale value. 
+ +$$scale_c = max(abs(X_c))$$ +$$range = 2^{bit_length - 1} - 1$$ +$$Out_c = round(X_c / scale_c * range)$$ + +In above three formulas, the range value of c is as follow: +$$0 \leq c \leq \ the\ channel\ number\ of\ X$$ +)DOC"); + } +}; + class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel { public: FakeQuantizeRangeAbsMaxOp(const std::string& type, @@ -218,3 +273,10 @@ REGISTER_OPERATOR(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp, paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxKernel); + +REGISTER_OPERATOR(fake_channel_wise_quantize_abs_max, + ops::FakeChannelWiseQuantizeAbsMaxOp, + ops::FakeChannelWiseQuantizeAbsMaxOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(fake_channel_wise_quantize_abs_max, + ops::FakeChannelWiseQuantizeAbsMaxKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index a0ff639621..5da16a7c73 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -174,5 +174,7 @@ namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max, ops::FakeQuantizeAbsMaxKernel); +REGISTER_OP_CUDA_KERNEL(fake_channel_wise_quantize_abs_max, + ops::FakeChannelWiseQuantizeAbsMaxKernel); REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 7ace7573ec..8b47600e7d 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -63,6 +63,39 @@ class FakeQuantizeAbsMaxKernel : public framework::OpKernel { } }; +template +class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + + auto* out = context.Output("Out"); + auto* out_scales = context.Output("OutScales"); + T* out_scales_data = out_scales->mutable_data(context.GetPlace()); + out->mutable_data(context.GetPlace()); + + int bit_length = context.Attr("bit_length"); + int bin_cnt = std::pow(2, bit_length - 1) - 1; + + auto& dev_ctx = context.template device_context(); + auto find_abs_max = FindAbsMaxFunctor(); + for (int64_t i = 0; i < in->dims()[0]; i++) { + framework::Tensor one_channel = in->Slice(i, i + 1); + const T* one_channel_data = one_channel.data(); + find_abs_max(dev_ctx, one_channel_data, one_channel.numel(), + &out_scales_data[i]); + } + auto clip_quant = ClipAndFakeQuantFunctor(); + for (int64_t i = 0; i < in->dims()[0]; i++) { + framework::Tensor one_channel_in = in->Slice(i, i + 1); + framework::Tensor one_channel_out = out->Slice(i, i + 1); + framework::Tensor one_channel_scale = out_scales->Slice(i, i + 1); + clip_quant(dev_ctx, one_channel_in, one_channel_scale, bin_cnt, + &one_channel_out); + } + } +}; + template class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel { public: diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py index 4582b2a0ee..90a90112bd 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py @@ -35,6 +35,30 @@ class TestFakeQuantizeOp(OpTest): self.check_output() +class TestFakeChannelWiseQuantizeOp(OpTest): + 
def setUp(self): + self.op_type = "fake_channel_wise_quantize_abs_max" + self.attrs = {'bit_length': 8} + self.inputs = { + 'X': np.random.random((4, 3, 64, 64)).astype("float32"), + } + scales = [] + for i in range(self.inputs['X'].shape[0]): + scales.append(np.max(np.abs(self.inputs['X'][i])).astype("float32")) + outputs = self.inputs['X'].copy() + for i, scale in enumerate(scales): + outputs[i] = np.round(outputs[i] / scale * ( + (1 << (self.attrs['bit_length'] - 1)) - 1)) + + self.outputs = { + 'Out': outputs, + 'OutScales': np.array(scales).astype("float32"), + } + + def test_check_output(self): + self.check_output() + + class TestFakeQuantizeRangeAbsMaxOp(OpTest): def setUp(self): self.op_type = "fake_quantize_range_abs_max" From 89dee160d18d699075c2bfbfce6d7311dfa4f59f Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Tue, 5 Mar 2019 16:41:46 +0800 Subject: [PATCH 07/64] add channel wise dequantize op. --- paddle/fluid/operators/fake_dequantize_op.cc | 72 +++++++++++++++++++ paddle/fluid/operators/fake_dequantize_op.cu | 4 ++ paddle/fluid/operators/fake_dequantize_op.h | 51 +++++++++++++ paddle/fluid/operators/fake_quantize_op.cc | 7 +- .../unittests/test_fake_dequantize_op.py | 71 ++++++++++++++++++ 5 files changed, 201 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc index 5d6488c67e..73ffaae6a5 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cc +++ b/paddle/fluid/operators/fake_dequantize_op.cc @@ -76,6 +76,70 @@ $$Out = \frac{scale*X}{ max_range }$$ } }; +class FakeChannelWiseDequantizeMaxAbsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("X"), + "Input(X) of FakeChannelWiseDequantizeMaxAbsOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("WeightScales"), + "Input(WeightScales) of FakeChannelWiseDequantizeMaxAbsOp " + "should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FakeChannelWiseDequantizeMaxAbsOp should not be null."); + + ctx->ShareDim("X", /*->*/ "Out"); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class FakeChannelWiseDequantizeMaxAbsOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor) The input with float-32/64 type is the " + "low precision tensor."); + AddInput("ActivationScale", + "(float) The activation scale in quantization stage.") + .AsDispensable(); + AddInput("WeightScales", + "(float array) The weight scales in quantization stage."); + AddOutput("Out", + "(Tensor) The output is the dequantized high " + "precision tensor."); + AddAttr("activation_bits", "Quantization bit number for activation.") + .SetDefault(8) + .AddCustomChecker([](const int& bit_length) { + PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, + "'activation_bits' should be between 1 and 16."); + }); + AddAttr("weight_bits", "Quantization bit number for weights.") + .SetDefault(8) + .AddCustomChecker([](const int& bit_length) { + PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, + "'weight_bits' should be between 1 and 16."); + }); + + AddComment(R"DOC( +FakeChannelWiseDequantizeMaxAbsOp operator. 
+ +This calculation is an opposite operation of FakeChannelWiseQuantizeMaxAbsOp: + +$$Out_c = \frac{ActivationScale*WeightScale_c*X_c}{(2^{weight\_bits-1}-1)*(2^{activation\_bits-1}-1)}$$ + +In the above formula, the range value of c is as follow: +$$0 \leq c \lt \ the\ channel\ number\ of\ X$$ + +Notes: Tha per-channel quantization is only applied to weights(channel size scale). +And the activations use per-layer quantization(only one scale). +)DOC"); + } +}; + } // namespace operators } // namespace paddle @@ -88,3 +152,11 @@ REGISTER_OPERATOR(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsOp, REGISTER_OP_CPU_KERNEL(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsKernel, ops::FakeDequantizeMaxAbsKernel); + +REGISTER_OPERATOR(fake_channel_wise_dequantize_max_abs, + ops::FakeChannelWiseDequantizeMaxAbsOp, + ops::FakeChannelWiseDequantizeMaxAbsOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(fake_channel_wise_dequantize_max_abs, + ops::FakeChannelWiseDequantizeMaxAbsKernel, + ops::FakeChannelWiseDequantizeMaxAbsKernel); diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu index 225bcc45bc..35dcc69279 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu +++ b/paddle/fluid/operators/fake_dequantize_op.cu @@ -55,3 +55,7 @@ using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsKernel, ops::FakeDequantizeMaxAbsKernel); +REGISTER_OP_CUDA_KERNEL( + fake_channel_wise_dequantize_max_abs, + ops::FakeChannelWiseDequantizeMaxAbsKernel, + ops::FakeChannelWiseDequantizeMaxAbsKernel); diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h index d9923a10da..c26dfa8332 100644 --- a/paddle/fluid/operators/fake_dequantize_op.h +++ b/paddle/fluid/operators/fake_dequantize_op.h @@ -45,5 +45,56 @@ class FakeDequantizeMaxAbsKernel : public framework::OpKernel { } }; +template +class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& ctx) const { + auto* in = ctx.Input("X"); + auto* weight_scales = ctx.Input("WeightScales"); + auto* out = ctx.Output("Out"); + + PADDLE_ENFORCE_EQ(weight_scales->numel(), in->dims()[0], + "The weight uses the per-channel quantization type, so " + "the number of weight scale values must be the same with " + "first dimension value of Input(X)."); + + int ativation_bits = ctx.Attr("activation_bits"); + int weight_bits = ctx.Attr("weight_bits"); + int range = std::pow(2, weight_bits - 1) - 1; + + auto& dev_ctx = ctx.template device_context(); + out->mutable_data(dev_ctx.GetPlace()); + + auto dequant = DequantizeFunctor(); + if (ctx.HasInput("ActivationScale")) { + auto* activation_scale = ctx.Input("ActivationScale"); + PADDLE_ENFORCE_EQ(activation_scale->numel(), 1, + "The activation uses per-layer quantization type, so " + "it must have only one value."); + framework::Tensor cpu_weigth_scales; + framework::TensorCopy(*weight_scales, platform::CPUPlace(), + &cpu_weigth_scales); + dev_ctx.Wait(); + const T* weight_scales_data = cpu_weigth_scales.data(); + range *= (std::pow(2, ativation_bits - 1) - 1); + for (int64_t i = 0; i < in->dims()[0]; i++) { + framework::Tensor one_channel_in = in->Slice(i, i + 1); + framework::Tensor one_channel_out = out->Slice(i, i + 1); + auto max_range = range / weight_scales_data[i]; + dequant(dev_ctx, &one_channel_in, activation_scale, + static_cast(max_range), 
&one_channel_out); + } + } else { + for (int64_t i = 0; i < in->dims()[0]; i++) { + framework::Tensor one_channel_in = in->Slice(i, i + 1); + framework::Tensor one_channel_out = out->Slice(i, i + 1); + framework::Tensor one_channel_scale = weight_scales->Slice(i, i + 1); + dequant(dev_ctx, &one_channel_in, &one_channel_scale, + static_cast(range), &one_channel_out); + } + } + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index c873ee6718..70186e5efa 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -180,11 +180,10 @@ The scale of FakeChannelWiseQuantize operator is a vector. In detail, each channel of the input X has a scale value. $$scale_c = max(abs(X_c))$$ -$$range = 2^{bit_length - 1} - 1$$ -$$Out_c = round(X_c / scale_c * range)$$ - +$$range = 2^{bit\_length - 1} - 1$$ +$$Out_c = round(\frac{X_c * range} {scale_c})$$ In above three formulas, the range value of c is as follow: -$$0 \leq c \leq \ the\ channel\ number\ of\ X$$ +$$0 \leq c \lt \ the\ channel\ number\ of\ X$$ )DOC"); } }; diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py index 1bb4662e8d..bd8dad4d59 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py @@ -31,6 +31,77 @@ def dequantize_max_abs(x, scale, max_range): return y +def channel_wise_quantize_max_abs(x, max_range): + scales = [] + for i in range(x.shape[0]): + scales.append(np.max(np.abs(x[i])).astype("float32")) + + y = x.copy() + for i, scale in enumerate(scales): + y[i] = np.round(y[i] / scale * max_range) + return y, scales + + +def channel_wise_dequantize_max_abs(x, scales, max_range): + y = x.copy() + for i in range(x.shape[0]): + y[i] = (scales[i] / max_range) * y[i] + return y + + +class TestFakeChannelWiseDequantizeMaxAbsOp(OpTest): + def set_args(self): + self.weight_bits = 8 + self.activation_bits = 2 + self.data_type = "float32" + + def setUp(self): + self.set_args() + self.op_type = "fake_channel_wise_dequantize_max_abs" + x = np.random.randn(4, 3, 64, 64).astype(self.data_type) + max_range = math.pow(2, self.weight_bits - 1) - 1 + yq, scales = channel_wise_quantize_max_abs(x, max_range) + ydq = channel_wise_dequantize_max_abs(yq, scales, max_range) + + self.inputs = { + 'X': yq, + 'ActivationScale': np.array(1.0).astype(self.data_type), + 'WeightScales': np.array(scales).astype(self.data_type) + } + self.attrs = { + 'weight_bits': self.weight_bits, + 'activation_bits': self.activation_bits + } + self.outputs = {'Out': ydq} + + def test_check_output(self): + self.check_output() + + +class TestFakeChannelWiseDequantizeMaxAbsOpNoActivationScale(OpTest): + def set_args(self): + self.weight_bits = 8 + self.data_type = "float32" + + def setUp(self): + self.set_args() + self.op_type = "fake_channel_wise_dequantize_max_abs" + x = np.random.randn(4, 3, 64, 64).astype(self.data_type) + max_range = math.pow(2, self.weight_bits - 1) - 1 + yq, scales = channel_wise_quantize_max_abs(x, max_range) + ydq = channel_wise_dequantize_max_abs(yq, scales, max_range) + + self.inputs = { + 'X': yq, + 'WeightScales': np.array(scales).astype(self.data_type) + } + self.attrs = {'weight_bits': self.weight_bits} + self.outputs = {'Out': ydq} + + def test_check_output(self): + self.check_output() + + class 
TestFakeDequantizeMaxAbsOp(OpTest): def set_args(self): self.num_bits = 8 From 597dc65e76669e381cf4c52dfeeb671d5eecc4e6 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 5 Mar 2019 10:11:10 +0000 Subject: [PATCH 08/64] enhance gc test=develop --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 3 +- .../framework/details/computation_op_handle.h | 2 + .../details/eager_deletion_op_handle.cc | 5 +- .../framework/details/eager_deletion_pass.cc | 165 ++++++---- .../framework/details/eager_deletion_pass.h | 32 -- .../details/while_op_eager_deletion_pass.cc | 62 ++++ paddle/fluid/framework/executor.cc | 8 +- .../operators/controlflow/CMakeLists.txt | 1 + .../fluid/operators/controlflow/while_op.cc | 9 +- .../operators/controlflow/while_op_helper.cc | 292 ++++++++++++++++++ .../operators/controlflow/while_op_helper.h | 43 +++ .../unittests/test_eager_deletion_while_op.py | 153 +++++++++ ...test_partial_eager_deletion_transformer.py | 26 ++ 14 files changed, 692 insertions(+), 111 deletions(-) delete mode 100644 paddle/fluid/framework/details/eager_deletion_pass.h create mode 100644 paddle/fluid/framework/details/while_op_eager_deletion_pass.cc create mode 100644 paddle/fluid/operators/controlflow/while_op_helper.cc create mode 100644 paddle/fluid/operators/controlflow/while_op_helper.h create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index b9491c953f..ad19d729eb 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -174,7 +174,7 @@ else() cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() -target_link_libraries(executor garbage_collector) +target_link_libraries(executor garbage_collector while_op_helper) cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index dc308fd259..9f06455ea5 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -61,7 +61,8 @@ cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_ cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) -cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) +cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle) +cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass) cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper) cc_library(sequential_execution_pass SRCS 
sequential_execution_pass.cc DEPS graph graph_helper pass) diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 1e3dbb1e44..67f7cb738f 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -31,6 +31,8 @@ class ComputationOpHandle : public OpHandleBase { ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place, size_t scope_idx); + OperatorBase *GetOp() { return op_.get(); } + std::string Name() const override; const Scope *GetScope() const { return scope_; } diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 9faef8a186..e58e501e6d 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -25,8 +25,6 @@ namespace paddle { namespace framework { namespace details { -static const std::string kEagerDeletionOpName{"eager_deletion"}; // NOLINT - EagerDeletionOpHandle::EagerDeletionOpHandle( ir::Node *node, const Scope *scope, const platform::Place &place, const std::unordered_set &var_names, GarbageCollector *gc, @@ -61,10 +59,9 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { #endif } -std::string EagerDeletionOpHandle::Name() const { return kEagerDeletionOpName; } +std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } void EagerDeletionOpHandle::RunImpl() { - platform::RecordEvent event(kEagerDeletionOpName, nullptr); Scope *exec_scope = nullptr; std::deque> garbages; for (auto &name : var_names_) { diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index 6c8cb66b10..566bc15c17 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -21,35 +21,42 @@ #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" -#include "paddle/fluid/framework/details/eager_deletion_pass.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" -DEFINE_double(fraction_of_eager_deletion, 1.0, "Fraction of eager deletion"); -DEFINE_bool(eager_delete_tensor_only, false, ""); +DEFINE_double(memory_fraction_of_eager_deletion, 1.0, + "Fraction of eager deletion"); namespace paddle { namespace framework { namespace details { -namespace { // NOLINT +// op -> variables which can be deleted after op runs using OpToVarNameSetMap = std::unordered_map>; -} // NOLINT +// Check whether the variable is LoDTensor based on static VarDesc info static bool IsLoDTensor(VarDesc *var) { return var->Proto()->type().type() == proto::VarType::LOD_TENSOR; } -static int64_t GetNumel(const GraphVars &vars, const std::string &var_name, - size_t scope_idx) { - auto *var_desc = TryGetLatestVarDesc(vars[scope_idx].at(var_name)); +// Get memory size of LoDTensor +static int64_t GetMemorySize( + const std::unordered_map> &vars, + const std::string &var_name) { + auto *var_desc = TryGetLatestVarDesc(vars.at(var_name)); + PADDLE_ENFORCE_NOT_NULL(var_desc); PADDLE_ENFORCE(IsLoDTensor(var_desc)); auto dims = var_desc->GetShape(); - return std::accumulate(dims.begin(), dims.end(), static_cast(1), + return SizeOfType(var_desc->GetDataType()) * + std::accumulate(dims.begin(), dims.end(), static_cast(1), 
std::multiplies()); } +// Split all variables in the graph into LoDTensor and Non-LoDTensor (e.g. +// SelectedRows, LoDTensorArray) +// Since partial GC is based on static analysis of memory size of each variable +// So we should skip SelectedRows and LoDTensorArray here static void SplitIntoLoDTensorAndNonLoDTensorVars( const OpToVarNameSetMap &m, const GraphVars &vars, OpToVarNameSetMap *lod_tensors, OpToVarNameSetMap *other_vars) { @@ -69,76 +76,106 @@ static void SplitIntoLoDTensorAndNonLoDTensorVars( } } -static OpToVarNameSetMap ShrinkGCVars(const OpToVarNameSetMap &m, - const GraphVars &vars, - double fraction_of_memory_size, - bool delete_lod_tensor_only = false) { - // Do not perform gc +struct GCVarInfo { + GCVarInfo(const std::string &name, int64_t memory_size, + ComputationOpHandle *op, size_t scope_idx) + : name_(name), + memory_size_(memory_size), + op_(op), + scope_idx_(scope_idx) {} + + std::string name_; // variable name + int64_t memory_size_; // memory size + ComputationOpHandle *op_; // op after which the variable could be deleted + size_t scope_idx_; // scope index where the variable locates + + int64_t AbsMemorySize() const { return std::abs(memory_size_); } +}; + +// Delete delete_lod_tensor_only is not used currently +static OpToVarNameSetMap ShrinkGCVars( + const OpToVarNameSetMap &m, const GraphVars &vars, + const std::vector &places, double fraction_of_memory_size, + bool delete_lod_tensor_only = false) { + // Do not perform gc when fraction_of_memory_size = 0 if (fraction_of_memory_size <= 0.0) return {}; - // Perform complete gc + /** + * Step 1: Split all variables into LoDTensor and Non-LoDTensor. + * We can only calculate memory size of LoDTensors + */ + OpToVarNameSetMap lod_tensors, other_vars; + SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars); + + // Perform complete gc when fraction_of_memory_size >= 1 if (fraction_of_memory_size >= 1.0) { - if (delete_lod_tensor_only) { - OpToVarNameSetMap lod_tensors, other_vars; - SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars); - return lod_tensors; - } else { - return m; - } + return delete_lod_tensor_only ? 
lod_tensors : m; } - // Perform partial gc - OpToVarNameSetMap lod_tensors, other_vars; - SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars); + /** + * Step 2: build GCVarInfos, and calculate total memory sizes of each device + */ - using TupleType = std::tuple; + // place -> variable info (name, memory size, place, scope_idx) + std::map> place_to_vars; - std::unordered_map> place_to_vars; - std::unordered_map total_memory_size; + // place -> total memory sizes + std::map place_to_size; for (auto &op_vars_pair : lod_tensors) { - auto scope_idx = op_vars_pair.first->GetScopeIdx(); - int64_t size = 0; - for (auto &var_name : op_vars_pair.second) { - auto var_size = GetNumel(vars, var_name, scope_idx); - size += std::abs(var_size); - place_to_vars[scope_idx].emplace_back(var_name, op_vars_pair.first, - var_size); + auto *op = op_vars_pair.first; + auto &var_names = op_vars_pair.second; + auto scope_idx = op->GetScopeIdx(); + auto &place = places[scope_idx]; + + for (auto &var_name : var_names) { + auto var_size = GetMemorySize(vars[scope_idx], var_name); + GCVarInfo var_info(var_name, var_size, op, scope_idx); + place_to_size[place] += var_info.AbsMemorySize(); + place_to_vars[place].emplace_back(std::move(var_info)); } - total_memory_size.emplace(scope_idx, size); } - for (auto &pair : place_to_vars) { - std::sort(pair.second.begin(), pair.second.end(), - [](const TupleType &t1, const TupleType &t2) { - return std::abs(std::get<2>(t1)) > std::abs(std::get<2>(t2)); + /** + * Step 3: sort GCVarInfos, and only delete the largest variables. + */ + OpToVarNameSetMap partial_vars; + for (auto &place_to_var_pair : place_to_vars) { + auto &place = place_to_var_pair.first; + auto &gc_vars = place_to_var_pair.second; + std::sort(gc_vars.begin(), gc_vars.end(), + [](const GCVarInfo &var1, const GCVarInfo &var2) { + return var1.AbsMemorySize() > var2.AbsMemorySize(); }); - } - OpToVarNameSetMap ret; - for (auto &pair : place_to_vars) { - auto desired_delete_size = static_cast( - fraction_of_memory_size * total_memory_size.at(pair.first)); - int64_t cur_size = 0; - for (size_t i = 0; i < pair.second.size() && cur_size < desired_delete_size; + int64_t accumulated_size = 0; + int64_t size_threshold = + static_cast(fraction_of_memory_size * place_to_size[place]); + for (size_t i = 0; i < gc_vars.size() && accumulated_size < size_threshold; ++i) { - auto &var_name = std::get<0>(pair.second[i]); - auto *op = std::get<1>(pair.second[i]); - cur_size += std::get<2>(pair.second[i]); - ret[op].insert(var_name); + partial_vars[gc_vars[i].op_].insert(gc_vars[i].name_); + accumulated_size += gc_vars[i].AbsMemorySize(); } } + /** + * Step 4: Combine other vars (SelectedRows, LoDTensorArray) + */ if (!delete_lod_tensor_only) { for (auto &op_vars_pair : other_vars) { - for (auto &var_name : op_vars_pair.second) { - ret[op_vars_pair.first].insert(var_name); - } + partial_vars[op_vars_pair.first].insert(op_vars_pair.second.begin(), + op_vars_pair.second.end()); } } - return ret; + return partial_vars; } +class EagerDeletionPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + std::unique_ptr EagerDeletionPass::ApplyImpl( std::unique_ptr graph) const { auto &ref_cnts = @@ -166,9 +203,8 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( } } - op_vars_map = - ShrinkGCVars(op_vars_map, vars, FLAGS_fraction_of_eager_deletion, - FLAGS_eager_delete_tensor_only); + op_vars_map = ShrinkGCVars(op_vars_map, vars, places, + 
FLAGS_memory_fraction_of_eager_deletion); for (auto &pair : op_vars_map) { auto *op = pair.first; @@ -200,12 +236,13 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( eager_deletion_op->AddOutput(dummy_leaf); } - VLOG(10) << "FLAGS_fraction_of_eager_deletion = " - << FLAGS_fraction_of_eager_deletion; - VLOG(10) << "FLAGS_eager_delete_tensor_only = " - << FLAGS_eager_delete_tensor_only; + VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = " + << FLAGS_memory_fraction_of_eager_deletion; VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)"; - return graph; + + auto while_op_eager_deletion_pass = + ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass"); + return while_op_eager_deletion_pass->Apply(std::move(graph)); } } // namespace details @@ -218,3 +255,5 @@ REGISTER_PASS(eager_deletion_pass, .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars) .RequirePassAttr(paddle::framework::details::kAllPlaces) .RequirePassAttr(paddle::framework::details::kGarbageCollector); + +USE_PASS(while_op_eager_deletion_pass); diff --git a/paddle/fluid/framework/details/eager_deletion_pass.h b/paddle/fluid/framework/details/eager_deletion_pass.h deleted file mode 100644 index d7a7a9709d..0000000000 --- a/paddle/fluid/framework/details/eager_deletion_pass.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" - -namespace paddle { -namespace framework { -namespace details { - -class EagerDeletionPass : public ir::Pass { - protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc b/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc new file mode 100644 index 0000000000..fd6b6dd227 --- /dev/null +++ b/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
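For intuition about the per-place selection policy implemented by ShrinkGCVars above, here is a minimal standalone Python sketch (illustrative only, not part of the patch; the real pass works on GCVarInfo records built from GraphVars, and the variable names below are made up): per place, variables are sorted by absolute memory size and greedily collected until the accumulated size reaches fraction_of_memory_size * total, so a fraction of 1.0 makes everything eligible and 0.0 nothing.

# Illustrative sketch of the greedy per-place policy in ShrinkGCVars (assumed inputs).
def pick_gc_vars(var_sizes, fraction):
    # var_sizes: {var_name: memory size in bytes} for a single place
    total = sum(abs(s) for s in var_sizes.values())
    threshold = fraction * total
    picked, accumulated = [], 0
    for name, size in sorted(var_sizes.items(), key=lambda kv: -abs(kv[1])):
        if accumulated >= threshold:
            break
        picked.append(name)
        accumulated += abs(size)
    return picked

# With fraction 0.5, only the two largest variables are selected for eager deletion.
print(pick_gc_vars({'fc_0.w_0': 400, 'relu_1.tmp_0': 300, 'tmp_2': 200, 'tmp_3': 100}, 0.5))
# -> ['fc_0.w_0', 'relu_1.tmp_0']
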
+ +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/operators/controlflow/while_op_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +class WhileOpEagerDeletionPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override { + auto all_ops = ir::FilterByNodeWrapper(*graph); + + // Find all while_op and while_grad_op + std::unordered_map, + std::vector>> + target_ops; + for (auto *op : all_ops) { + auto compute_op = dynamic_cast(op); + if (compute_op == nullptr) continue; + + if (compute_op->Name() == "while") { + target_ops[compute_op->GetScopeIdx()].first.emplace_back( + compute_op->GetOp()); + } else if (compute_op->Name() == "while_grad") { + target_ops[compute_op->GetScopeIdx()].second.emplace_back( + compute_op->GetOp()); + } + } + + for (auto &ops_pair : target_ops) { + auto &while_ops = ops_pair.second.first; + auto &while_grad_ops = ops_pair.second.second; + operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + while_ops, while_grad_ops); + } + return graph; + } +}; + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(while_op_eager_deletion_pass, + paddle::framework::details::WhileOpEagerDeletionPass); diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index c31d0beec3..5555679412 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" @@ -409,8 +410,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr gc; - // skip while_op and while_grad_op temporarily - if (max_memory_size >= 0 && !keep_kids) { + if (max_memory_size >= 0) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { @@ -428,6 +428,10 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, #ifdef PADDLE_WITH_CUDA } #endif + if (gc) { + operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(ctx->block_id_, + ctx->ops_); + } } for (auto& op : ctx->ops_) { diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index b614e9b035..7aa1c44eaa 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -1,4 +1,5 @@ include(operators) register_operators(DEPS naive_executor) +cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator) file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 0360cf5273..8352ba4f2b 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/op_registry.h" #include 
"paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/operators/detail/safe_ref.h" namespace paddle { @@ -26,14 +27,6 @@ namespace operators { using StepScopeVar = std::vector; using LoDTensor = framework::LoDTensor; -static constexpr char kStepBlock[] = "sub_block"; -static constexpr char kCondition[] = "Condition"; -static constexpr char kStepScopes[] = "StepScopes"; -static constexpr char kX[] = "X"; -static constexpr char kXGRAD[] = "X@GRAD"; -static constexpr char kOutputs[] = "Out"; -static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; - namespace { // NOLINT static std::string GetSkipEagerDeletionVarsDebugString( const std::vector &vars) { diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc new file mode 100644 index 0000000000..0324a1586a --- /dev/null +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -0,0 +1,292 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/controlflow/while_op_helper.h" +#include +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace operators { + +// OpVariant is a wrapper class of OpDesc and OperatorBase +// So that API would be the same. 
+class OpVariant { + struct InputsVisitor + : public boost::static_visitor { + template + const framework::VariableNameMap *operator()(const OpType *op) const { + return &(op->Inputs()); + } + }; + + struct OutputsVisitor + : public boost::static_visitor { + template + const framework::VariableNameMap *operator()(const OpType *op) const { + return &(op->Outputs()); + } + }; + + struct AttributeMapVisitor + : public boost::static_visitor { + const framework::AttributeMap *operator()( + const framework::OpDesc *op) const { + return &(op->GetAttrMap()); + } + + const framework::AttributeMap *operator()( + const framework::OperatorBase *op) const { + return &(op->Attrs()); + } + }; + + struct RawPointerVisitor : public boost::static_visitor { + template + const void *operator()(const OpType *op) const { + return op; + } + }; + + public: + OpVariant(const framework::OperatorBase *op) : op_(op) {} // NOLINT + + OpVariant(const framework::OpDesc *op) : op_(op) {} // NOLINT + + const framework::VariableNameMap &Inputs() const { + return *boost::apply_visitor(InputsVisitor(), op_); + } + + const framework::VariableNameMap &Outputs() const { + return *boost::apply_visitor(OutputsVisitor(), op_); + } + + const framework::AttributeMap &Attrs() const { + return *boost::apply_visitor(AttributeMapVisitor(), op_); + } + + template + const AttrType &Attr(const std::string &name) const { + auto &attrs = Attrs(); + auto it = attrs.find(name); + PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name); + return boost::get(it->second); + } + + bool operator==(const OpVariant &other) const { + return RawPointer() == other.RawPointer(); + } + + const void *RawPointer() const { + return boost::apply_visitor(RawPointerVisitor(), op_); + } + + int which() const { return static_cast(op_.which()); } + + struct Hasher { + size_t operator()(const OpVariant &op) const { + return reinterpret_cast(op.RawPointer()); + } + }; + + private: + const boost::variant + op_; +}; + +static std::string GetDebugString(const std::vector &names) { + if (names.empty()) return ""; + std::string ret = names[0]; + for (size_t i = 1; i < names.size(); ++i) { + ret += (" " + names[i]); + } + return ret; +} + +// Set skip variables of while_op and while_grad_op +// These variables should be skipped when eager deletion enables. +// It is because: +// 1. while_grad_op needs some variables defined in while_op. +// 2. while_grad_op needs variables from the previous time step. +static void SetSkipVars(const OpVariant &op, std::vector attr) { + auto &attrs = const_cast(op.Attrs()); + VLOG(2) << "Prepare to skip " << attr.size() + << " var(s): " << GetDebugString(attr); + attrs[kSkipEagerDeletionVars] = std::move(attr); +} + +// Check whether the forward while_op and while_grad_op match +// The program may have many while_ops. +static bool IsMatchedWhileOpAndWhileGradOp(const OpVariant &fwd_op, + const OpVariant &grad_op) { + return fwd_op.Inputs().at(kX) == grad_op.Inputs().at(kX) && + fwd_op.Outputs().at(kOutputs) == grad_op.Inputs().at(kOutputs); +} + +// Test whether the variable is skippable in forward while_op +// The variable is skippable in while_op when the variable used in while_grad +// is not from grad_block. 
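Before the helper below, a small illustration of the skip-variable test described in the comment above. The names and the assumed value of framework::kEmptyVarName are only for demonstration:

    # A variable referenced by ops inside the grad block but not created there
    # must come from the forward while_op, so the forward op must not delete it.
    def is_skippable(name, grad_block_vars, empty_var_name='@EMPTY@'):
        # empty_var_name stands in for framework::kEmptyVarName (assumed value)
        return name != empty_var_name and name not in grad_block_vars

    grad_block_vars = {'tmp_0'}                       # vars created inside the grad block
    used_in_grad_block = ['x', 'tmp_0', 'mem_array']  # inputs/outputs of its ops
    forward_skip_vars = {v for v in used_in_grad_block
                         if is_skippable(v, grad_block_vars)}
    # -> {'x', 'mem_array'}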
+static bool IsSkippableVar(const std::string &name, + framework::BlockDesc *grad_block) { + return name != framework::kEmptyVarName && !grad_block->HasVar(name); +} + +static void ModifyWhileOpAndWhileGradOpAttr(const OpVariant &fwd_op, + const OpVariant &bwd_op) { + auto *grad_block = bwd_op.Attr(kStepBlock); + + // Find all skippable variables in forward while_op + std::unordered_set forward_skip_vars; + for (auto *op_desc : grad_block->AllOps()) { + for (auto &in_arg_name : op_desc->InputArgumentNames()) { + if (IsSkippableVar(in_arg_name, grad_block)) { + forward_skip_vars.insert(in_arg_name); + } + } + + for (auto &out_arg_name : op_desc->OutputArgumentNames()) { + if (IsSkippableVar(out_arg_name, grad_block)) { + forward_skip_vars.insert(out_arg_name); + } + } + } + + SetSkipVars(fwd_op, std::vector(forward_skip_vars.begin(), + forward_skip_vars.end())); + + // Find all skippable variables in while_grad_op + // The skipped variables are those which would be used across time steps. + auto &fwd_input = fwd_op.Inputs().at(kX); + auto &in_grads = bwd_op.Outputs().at(framework::GradVarName(kX)); + PADDLE_ENFORCE_EQ( + fwd_input.size(), in_grads.size(), + "Backward input gradient number does not match forward input number."); + + std::unordered_set backward_skip_vars; + for (size_t i = 0; i < in_grads.size(); ++i) { + if (in_grads[i] == framework::kEmptyVarName) { + continue; + } + backward_skip_vars.insert(in_grads[i]); + backward_skip_vars.insert(framework::GradVarName(fwd_input[i])); + } + + SetSkipVars(bwd_op, std::vector(backward_skip_vars.begin(), + backward_skip_vars.end())); +} + +// Find all while_ops and while_grad_ops in the graph or program +// The while_grad_op and while_op may located in different blocks +// So we should traverse all blocks in the program and find them out. 
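A compact sketch of the backward skip list computed in ModifyWhileOpAndWhileGradOpAttr above, assuming GradVarName simply appends "@GRAD"; the variable names are invented:

    # For every forward input that has a gradient output, both the produced
    # gradient and input_name@GRAD are used across time steps, so while_grad
    # must not delete them eagerly.
    K_EMPTY = '@EMPTY@'               # assumed value of framework::kEmptyVarName
    fwd_input = ['x', 'w']            # Inputs("X") of the forward while_op
    in_grads = ['x@GRAD', K_EMPTY]    # Outputs("X@GRAD") of while_grad
    backward_skip_vars = set()
    for x_name, g_name in zip(fwd_input, in_grads):
        if g_name == K_EMPTY:
            continue
        backward_skip_vars.update({g_name, x_name + '@GRAD'})
    # -> {'x@GRAD'}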
+static void FindAllWhileAndWhileGradOp(std::vector *while_ops, + std::vector *while_grad_ops) { + PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size()); + + if (while_ops->empty()) return; + + const auto *program = + while_ops->front().Attr(kStepBlock)->Program(); + for (size_t i = 1; i < program->Size(); ++i) { + auto &block = program->Block(i); + for (size_t j = 0; j < block.OpSize(); ++j) { + auto *op = block.Op(j); + if (op->Type() == "while") { + while_ops->emplace_back(op); + } else if (op->Type() == "while_grad") { + while_grad_ops->emplace_back(op); + } + } + } + + PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size(), + "There are extra while_grad ops in the graph or program"); +} + +static void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl( + std::vector *while_ops, std::vector *while_grad_ops) { + FindAllWhileAndWhileGradOp(while_ops, while_grad_ops); + + VLOG(2) << "Found while op num: " << while_ops->size() + << ", while grad op num: " << while_grad_ops->size(); + + if (while_grad_ops->empty()) { + return; + } + + std::unordered_set while_op_set( + while_ops->begin(), while_ops->end()); + + for (auto &bwd_op : *while_grad_ops) { + const OpVariant *matched_fwd_op = nullptr; + for (auto &fwd_op : while_op_set) { + if (IsMatchedWhileOpAndWhileGradOp(fwd_op, bwd_op)) { + PADDLE_ENFORCE(matched_fwd_op == nullptr, + "Found multiple matched while ops"); + matched_fwd_op = &fwd_op; + } + } + PADDLE_ENFORCE_NOT_NULL(matched_fwd_op, + "Cannot find matched forward while op."); + ModifyWhileOpAndWhileGradOpAttr(*matched_fwd_op, bwd_op); + while_op_set.erase(*matched_fwd_op); + } + + PADDLE_ENFORCE(while_op_set.empty(), + "There are not matched while_grad op in graph."); +} + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + int block_id, + const std::vector> &all_ops) { + // If block_id is not 0, returns + // This is because all while_ops and while_grad_ops in the whole program + // would be processed when block_id is 0 (i.e. when Executor::Run() or + // ParallelExecutor constructs). + + // What's more, all while_ops and while_grad_ops must be processed when + // block_id is zero. If not, while_op may run first and erase variables + // used in while_grad_op, and in this moment, while_grad_ops may be not + // constructed yet. + if (block_id != 0) return; + + std::vector fwd_ops, bwd_ops; + for (auto &op : all_ops) { + if (op->Type() == "while") { + fwd_ops.emplace_back(op.get()); + } else if (op->Type() == "while_grad") { + bwd_ops.emplace_back(op.get()); + } + } + PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops); +} + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + const std::vector &while_ops, + const std::vector &while_grad_ops) { + std::vector fwd_ops, bwd_ops; + fwd_ops.reserve(while_ops.size()); + for (auto *op : while_ops) { + fwd_ops.emplace_back(op); + } + + bwd_ops.reserve(while_grad_ops.size()); + for (auto *op : while_grad_ops) { + bwd_ops.emplace_back(op); + } + + PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h new file mode 100644 index 0000000000..456ba8642b --- /dev/null +++ b/paddle/fluid/operators/controlflow/while_op_helper.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace operators { + +static constexpr char kStepBlock[] = "sub_block"; +static constexpr char kCondition[] = "Condition"; +static constexpr char kStepScopes[] = "StepScopes"; +static constexpr char kX[] = "X"; +static constexpr char kXGRAD[] = "X@GRAD"; +static constexpr char kOutputs[] = "Out"; +static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + int block_id, + const std::vector> &all_ops); + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + const std::vector &while_ops, + const std::vector &while_grad_ops); + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py new file mode 100644 index 0000000000..7fa1636579 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py @@ -0,0 +1,153 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import os +os.environ['CPU_NUM'] = '2' +os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0' +os.environ['FLAGS_fast_eager_deletion_mode'] = '1' + +import unittest +import paddle.fluid as fluid +import paddle.fluid.layers as layers +from paddle.fluid.executor import Executor +import paddle.fluid.core as core +from paddle.fluid.backward import append_backward +import paddle.fluid.compiler as compiler +import numpy +import multiprocessing + + +class TestEagerDeletionWhileOpBase(unittest.TestCase): + def test_main(self): + places = [core.CPUPlace(), ] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for p in places: + for with_data_parallel in [False, True]: + with fluid.program_guard(fluid.Program(), fluid.Program()): + with fluid.scope_guard(fluid.Scope()): + self.run_main(p, with_data_parallel) + + def run_main(self, place, with_data_parallel): + self.place = place + self.with_data_parallel = with_data_parallel + + if not core.is_compiled_with_cuda() and isinstance(self.place, + core.CUDPlace): + return + + if isinstance(self.place, core.CUDAPlace): + device_cnt = core.get_cuda_device_count( + ) if self.with_data_parallel else 1 + else: + device_cnt = int( + os.environ['CPU_NUM'], + multiprocessing.cpu_count()) if self.with_data_parallel else 1 + + d0 = layers.data( + "d0", shape=[10], append_batch_size=False, dtype='float32') + d1 = layers.data( + "d1", shape=[10], append_batch_size=False, dtype='float32') + d2 = layers.data( + "d2", shape=[10], append_batch_size=False, dtype='float32') + + i = layers.zeros(shape=[1], dtype='int64') + i.stop_gradient = True + + init = layers.zeros(shape=[10], dtype='float32') + mem_array = layers.array_write(x=init, i=i) + data_array = layers.array_write(x=d0, i=i) + + i = layers.increment(i) + layers.array_write(d1, i, array=data_array) + + i = layers.increment(i) + layers.array_write(d2, i, array=data_array) + + i = layers.zeros(shape=[1], dtype='int64') + i.stop_gradient = True + + array_len = layers.fill_constant(shape=[1], dtype='int64', value=1) + array_len.stop_gradient = True + cond = layers.less_than(x=i, y=array_len) + + j = layers.fill_constant(shape=[1], dtype='int64', value=1) + j.stop_gradient = True + + array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3) + array_len2.stop_gradient = True + cond2 = layers.less_than(x=j, y=array_len2) + + while_op = layers.While(cond=cond) + while_op2 = layers.While(cond=cond2) + with while_op.block(): + d = layers.array_read(array=data_array, i=i) + prev = layers.array_read(array=mem_array, i=i) + d = layers.reshape(d, shape=[10]) + prev = layers.reshape(prev, shape=[10]) + result = layers.sums(input=[d, prev]) + + i = layers.increment(x=i, in_place=True) + layers.array_write(result, i=i, array=mem_array) + layers.less_than(x=i, y=array_len, cond=cond) + with while_op2.block(): + d2 = layers.array_read(array=data_array, i=j) + prev2 = layers.array_read(array=mem_array, i=j) + d2 = layers.reshape(d2, shape=[10]) + prev2 = layers.reshape(prev2, shape=[10]) + result2 = layers.sums(input=[d2, prev2]) + + j = layers.increment(x=j, in_place=True) + layers.array_write(result2, i=j, array=mem_array) + layers.less_than(x=j, y=array_len2, cond=cond2) + + sum_result = layers.array_read(array=mem_array, i=j) + sum_result.persistable = True + tmp = layers.unsqueeze(sum_result, axes=[0]) + tmp = layers.expand(tmp, expand_times=[10, 1]) + fc = layers.fc(tmp, size=256) + loss = layers.mean(sum_result) + + optim = 
fluid.optimizer.Adam(learning_rate=1e-3) + optim.minimize(loss) + + exe = Executor(self.place) + exe.run(fluid.default_startup_program()) + + prog = compiler.CompiledProgram(fluid.default_main_program()) + if self.with_data_parallel: + prog = prog.with_data_parallel() + + for _ in range(5): + d = [] + for i in range(3): + tmp = numpy.random.random(size=[10]).astype('float32') + if not self.with_data_parallel: + d.append(tmp) + else: + d.append(numpy.array([tmp] * device_cnt)) + + outs = exe.run(program=prog, + feed={'d0': d[0], + 'd1': d[1], + 'd2': d[2]}, + fetch_list=[sum_result]) + self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py new file mode 100644 index 0000000000..d44e4627d8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py @@ -0,0 +1,26 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" +os.environ['FLAGS_memory_fraction_of_eager_deletion'] = "0.55" + +os.environ[ + 'RECORDIO_FILENAME'] = '/tmp/eager_deletion_transformer.wmt16.recordio' + +from test_parallel_executor_transformer import TestTransformer + +if __name__ == '__main__': + unittest.main() From 806832e09163500fa01b8e9eabb871424dc26dbd Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Tue, 5 Mar 2019 20:15:41 +0800 Subject: [PATCH 09/64] update the input format of channel wise dequantize op. --- paddle/fluid/operators/fake_dequantize_op.cc | 42 ++++++++----------- paddle/fluid/operators/fake_dequantize_op.h | 38 +++++++---------- .../unittests/test_fake_dequantize_op.py | 27 ++++++------ 3 files changed, 46 insertions(+), 61 deletions(-) diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc index 73ffaae6a5..68c7227e5a 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cc +++ b/paddle/fluid/operators/fake_dequantize_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fake_dequantize_op.h" #include +#include namespace paddle { namespace operators { @@ -84,8 +85,8 @@ class FakeChannelWiseDequantizeMaxAbsOp : public framework::OperatorWithKernel { PADDLE_ENFORCE( ctx->HasInput("X"), "Input(X) of FakeChannelWiseDequantizeMaxAbsOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("WeightScales"), - "Input(WeightScales) of FakeChannelWiseDequantizeMaxAbsOp " + PADDLE_ENFORCE(ctx->HasInputs("Scales"), + "Input(Scales) of FakeChannelWiseDequantizeMaxAbsOp " "should not be null."); PADDLE_ENFORCE( ctx->HasOutput("Out"), @@ -103,39 +104,32 @@ class FakeChannelWiseDequantizeMaxAbsOpMaker AddInput("X", "(Tensor) The input with float-32/64 type is the " "low precision tensor."); - AddInput("ActivationScale", - "(float) The activation scale in quantization stage.") - .AsDispensable(); - AddInput("WeightScales", - "(float array) The weight scales in quantization stage."); + AddInput("Scales", + "(Tensors) The scales in quantization stage. " + "Now, `Scales` is a vector with at most two tensors. " + "If Scales has two elements, the second tensor should only have " + "one value.") + .AsDuplicable(); AddOutput("Out", "(Tensor) The output is the dequantized high " "precision tensor."); - AddAttr("activation_bits", "Quantization bit number for activation.") - .SetDefault(8) - .AddCustomChecker([](const int& bit_length) { - PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, - "'activation_bits' should be between 1 and 16."); - }); - AddAttr("weight_bits", "Quantization bit number for weights.") - .SetDefault(8) - .AddCustomChecker([](const int& bit_length) { - PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, - "'weight_bits' should be between 1 and 16."); - }); + AddAttr>( + "quant_bits", + "Quantization bit numbers in quantization stage. " + "The size of `quant_bits` should be equal to the size of `Scales`.") + .SetDefault({8}); AddComment(R"DOC( FakeChannelWiseDequantizeMaxAbsOp operator. This calculation is an opposite operation of FakeChannelWiseQuantizeMaxAbsOp: -$$Out_c = \frac{ActivationScale*WeightScale_c*X_c}{(2^{weight\_bits-1}-1)*(2^{activation\_bits-1}-1)}$$ +$$Out_c = \frac{X_c\prod_{i=1}^{n}Scales_{ic}}{\prod_{i=1}^{n}(2^{quant\_bits_i-1}-1)}$$ -In the above formula, the range value of c is as follow: -$$0 \leq c \lt \ the\ channel\ number\ of\ X$$ +In the above formula, the range value of $c$ can be represented as $0 \leq c \lt \ the\ channel\ number\ of\ X$. +Besides, the size of $quant\_bits$ should be equal to the size of $Scales$, and it is called $n$ in the formula. -Notes: Tha per-channel quantization is only applied to weights(channel size scale). -And the activations use per-layer quantization(only one scale). +Notes: In general, the per-channel quantization is only applied to weights and the activations use per-layer quantization. )DOC"); } }; diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h index c26dfa8332..549f5039f4 100644 --- a/paddle/fluid/operators/fake_dequantize_op.h +++ b/paddle/fluid/operators/fake_dequantize_op.h @@ -14,6 +14,7 @@ limitations under the License. 
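A quick numeric reading of the dequantization formula above, following the same convention as the Python reference in the updated unit test; the input, scales, and bit widths below are toy values for the two-scale case:

    import numpy as np
    quant_bits = [8, 8]
    x = np.array([[127., -64.], [32., 16.]])   # quantized input, 2 "channels"
    channel_scales = np.array([0.5, 2.0])      # Scales[0], one value per channel
    activation_scale = 0.7861                  # Scales[1], a single value
    out = x.copy()
    for c in range(x.shape[0]):
        out[c] = x[c] * channel_scales[c] / (2 ** (quant_bits[0] - 1) - 1)
    out *= activation_scale / (2 ** (quant_bits[1] - 1) - 1)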
*/ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -50,47 +51,40 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& ctx) const { auto* in = ctx.Input("X"); - auto* weight_scales = ctx.Input("WeightScales"); + auto scales = ctx.MultiInput("Scales"); auto* out = ctx.Output("Out"); - PADDLE_ENFORCE_EQ(weight_scales->numel(), in->dims()[0], - "The weight uses the per-channel quantization type, so " - "the number of weight scale values must be the same with " + PADDLE_ENFORCE_EQ(scales[0]->numel(), in->dims()[0], + "The number of first scale values must be the same with " "first dimension value of Input(X)."); - int ativation_bits = ctx.Attr("activation_bits"); - int weight_bits = ctx.Attr("weight_bits"); - int range = std::pow(2, weight_bits - 1) - 1; + auto quant_bits = ctx.Attr>("quant_bits"); + int max_range = std::pow(2, quant_bits[0] - 1) - 1; auto& dev_ctx = ctx.template device_context(); out->mutable_data(dev_ctx.GetPlace()); auto dequant = DequantizeFunctor(); - if (ctx.HasInput("ActivationScale")) { - auto* activation_scale = ctx.Input("ActivationScale"); - PADDLE_ENFORCE_EQ(activation_scale->numel(), 1, - "The activation uses per-layer quantization type, so " - "it must have only one value."); - framework::Tensor cpu_weigth_scales; - framework::TensorCopy(*weight_scales, platform::CPUPlace(), - &cpu_weigth_scales); - dev_ctx.Wait(); - const T* weight_scales_data = cpu_weigth_scales.data(); - range *= (std::pow(2, ativation_bits - 1) - 1); + if (scales.size() == 2) { + PADDLE_ENFORCE_EQ( + scales[1]->numel(), 1, + "The second scale tensor should only have one value at now."); for (int64_t i = 0; i < in->dims()[0]; i++) { framework::Tensor one_channel_in = in->Slice(i, i + 1); framework::Tensor one_channel_out = out->Slice(i, i + 1); - auto max_range = range / weight_scales_data[i]; - dequant(dev_ctx, &one_channel_in, activation_scale, + framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1); + max_range *= (std::pow(2, quant_bits[1] - 1) - 1); + dequant(dev_ctx, &one_channel_in, &one_channel_scale, static_cast(max_range), &one_channel_out); } + dequant(dev_ctx, out, scales[1], static_cast(1), out); } else { for (int64_t i = 0; i < in->dims()[0]; i++) { framework::Tensor one_channel_in = in->Slice(i, i + 1); framework::Tensor one_channel_out = out->Slice(i, i + 1); - framework::Tensor one_channel_scale = weight_scales->Slice(i, i + 1); + framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1); dequant(dev_ctx, &one_channel_in, &one_channel_scale, - static_cast(range), &one_channel_out); + static_cast(max_range), &one_channel_out); } } } diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py index bd8dad4d59..8d91d8fd1d 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py @@ -49,53 +49,50 @@ def channel_wise_dequantize_max_abs(x, scales, max_range): return y -class TestFakeChannelWiseDequantizeMaxAbsOp(OpTest): +class TestFakeChannelWiseDequantizeMaxAbsOpTwoScales(OpTest): def set_args(self): - self.weight_bits = 8 - self.activation_bits = 2 + self.quant_bits = [8, 2] self.data_type = "float32" def setUp(self): self.set_args() self.op_type = "fake_channel_wise_dequantize_max_abs" x = np.random.randn(4, 3, 64, 64).astype(self.data_type) - 
max_range = math.pow(2, self.weight_bits - 1) - 1 + max_range = math.pow(2, self.quant_bits[0] - 1) - 1 + max_range *= (math.pow(2, self.quant_bits[1] - 1) - 1) yq, scales = channel_wise_quantize_max_abs(x, max_range) ydq = channel_wise_dequantize_max_abs(yq, scales, max_range) self.inputs = { 'X': yq, - 'ActivationScale': np.array(1.0).astype(self.data_type), - 'WeightScales': np.array(scales).astype(self.data_type) - } - self.attrs = { - 'weight_bits': self.weight_bits, - 'activation_bits': self.activation_bits + 'Scales': [("scales0", np.array(scales).astype(self.data_type)), + ("scales1", np.array([1.0]).astype(self.data_type))] } + self.attrs = {'quant_bits': self.quant_bits} self.outputs = {'Out': ydq} def test_check_output(self): self.check_output() -class TestFakeChannelWiseDequantizeMaxAbsOpNoActivationScale(OpTest): +class TestFakeChannelWiseDequantizeMaxAbsOpOneScale(OpTest): def set_args(self): - self.weight_bits = 8 + self.quant_bits = [8] self.data_type = "float32" def setUp(self): self.set_args() self.op_type = "fake_channel_wise_dequantize_max_abs" x = np.random.randn(4, 3, 64, 64).astype(self.data_type) - max_range = math.pow(2, self.weight_bits - 1) - 1 + max_range = math.pow(2, self.quant_bits[0] - 1) - 1 yq, scales = channel_wise_quantize_max_abs(x, max_range) ydq = channel_wise_dequantize_max_abs(yq, scales, max_range) self.inputs = { 'X': yq, - 'WeightScales': np.array(scales).astype(self.data_type) + 'Scales': [("scales0", np.array(scales).astype(self.data_type))] } - self.attrs = {'weight_bits': self.weight_bits} + self.attrs = {'quant_bits': self.quant_bits} self.outputs = {'Out': ydq} def test_check_output(self): From 8063b31e2d485b665303a2010e63909ba53d1664 Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Tue, 5 Mar 2019 22:54:22 +0800 Subject: [PATCH 10/64] Reduce redundant code for channel wise dequant op. 
test=develop --- paddle/fluid/operators/fake_dequantize_op.h | 27 +++++++---------- .../unittests/test_fake_dequantize_op.py | 30 +++++++++++-------- 2 files changed, 28 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h index 549f5039f4..d05f203853 100644 --- a/paddle/fluid/operators/fake_dequantize_op.h +++ b/paddle/fluid/operators/fake_dequantize_op.h @@ -65,27 +65,20 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel { out->mutable_data(dev_ctx.GetPlace()); auto dequant = DequantizeFunctor(); + for (int64_t i = 0; i < in->dims()[0]; i++) { + framework::Tensor one_channel_in = in->Slice(i, i + 1); + framework::Tensor one_channel_out = out->Slice(i, i + 1); + framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1); + dequant(dev_ctx, &one_channel_in, &one_channel_scale, + static_cast(max_range), &one_channel_out); + } + if (scales.size() == 2) { PADDLE_ENFORCE_EQ( scales[1]->numel(), 1, "The second scale tensor should only have one value at now."); - for (int64_t i = 0; i < in->dims()[0]; i++) { - framework::Tensor one_channel_in = in->Slice(i, i + 1); - framework::Tensor one_channel_out = out->Slice(i, i + 1); - framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1); - max_range *= (std::pow(2, quant_bits[1] - 1) - 1); - dequant(dev_ctx, &one_channel_in, &one_channel_scale, - static_cast(max_range), &one_channel_out); - } - dequant(dev_ctx, out, scales[1], static_cast(1), out); - } else { - for (int64_t i = 0; i < in->dims()[0]; i++) { - framework::Tensor one_channel_in = in->Slice(i, i + 1); - framework::Tensor one_channel_out = out->Slice(i, i + 1); - framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1); - dequant(dev_ctx, &one_channel_in, &one_channel_scale, - static_cast(max_range), &one_channel_out); - } + max_range = std::pow(2, quant_bits[1] - 1) - 1; + dequant(dev_ctx, out, scales[1], static_cast(max_range), out); } } }; diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py index 8d91d8fd1d..32cb23cbfa 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py @@ -31,42 +31,49 @@ def dequantize_max_abs(x, scale, max_range): return y -def channel_wise_quantize_max_abs(x, max_range): +def channel_wise_quantize_max_abs(x, quant_bit=8): scales = [] for i in range(x.shape[0]): scales.append(np.max(np.abs(x[i])).astype("float32")) y = x.copy() + max_range = math.pow(2, quant_bit - 1) - 1 for i, scale in enumerate(scales): y[i] = np.round(y[i] / scale * max_range) return y, scales -def channel_wise_dequantize_max_abs(x, scales, max_range): +def channel_wise_dequantize_max_abs(x, + scales, + quant_bits, + activation_scale=None): y = x.copy() for i in range(x.shape[0]): - y[i] = (scales[i] / max_range) * y[i] + y[i] = (scales[i] / (math.pow(2, quant_bits[0] - 1) - 1)) * y[i] + if activation_scale is not None: + y *= activation_scale / (math.pow(2, quant_bits[1] - 1) - 1) return y class TestFakeChannelWiseDequantizeMaxAbsOpTwoScales(OpTest): def set_args(self): - self.quant_bits = [8, 2] + self.quant_bits = [8, 8] self.data_type = "float32" + self.activation_scale = 0.7861 def setUp(self): self.set_args() self.op_type = "fake_channel_wise_dequantize_max_abs" x = np.random.randn(4, 3, 64, 64).astype(self.data_type) - max_range = math.pow(2, self.quant_bits[0] - 1) - 1 - max_range *= 
(math.pow(2, self.quant_bits[1] - 1) - 1) - yq, scales = channel_wise_quantize_max_abs(x, max_range) - ydq = channel_wise_dequantize_max_abs(yq, scales, max_range) + yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0]) + ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits, + self.activation_scale) self.inputs = { 'X': yq, 'Scales': [("scales0", np.array(scales).astype(self.data_type)), - ("scales1", np.array([1.0]).astype(self.data_type))] + ("scales1", np.array( + [self.activation_scale]).astype(self.data_type))] } self.attrs = {'quant_bits': self.quant_bits} self.outputs = {'Out': ydq} @@ -84,9 +91,8 @@ class TestFakeChannelWiseDequantizeMaxAbsOpOneScale(OpTest): self.set_args() self.op_type = "fake_channel_wise_dequantize_max_abs" x = np.random.randn(4, 3, 64, 64).astype(self.data_type) - max_range = math.pow(2, self.quant_bits[0] - 1) - 1 - yq, scales = channel_wise_quantize_max_abs(x, max_range) - ydq = channel_wise_dequantize_max_abs(yq, scales, max_range) + yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0]) + ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits) self.inputs = { 'X': yq, From 33138a421d72912d0b51ddaf9d1e32f0c8b808d5 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 5 Mar 2019 11:32:23 +0000 Subject: [PATCH 11/64] remove match check test=develop --- paddle/fluid/operators/controlflow/while_op_helper.cc | 3 --- .../fluid/tests/unittests/test_eager_deletion_while_op.py | 6 +++--- .../unittests/test_partial_eager_deletion_transformer.py | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc index 0324a1586a..848ff5e8f1 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -241,9 +241,6 @@ static void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl( ModifyWhileOpAndWhileGradOpAttr(*matched_fwd_op, bwd_op); while_op_set.erase(*matched_fwd_op); } - - PADDLE_ENFORCE(while_op_set.empty(), - "There are not matched while_grad op in graph."); } void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py index 7fa1636579..898d04ebe1 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py @@ -47,7 +47,7 @@ class TestEagerDeletionWhileOpBase(unittest.TestCase): self.with_data_parallel = with_data_parallel if not core.is_compiled_with_cuda() and isinstance(self.place, - core.CUDPlace): + core.CUDAPlace): return if isinstance(self.place, core.CUDAPlace): @@ -55,8 +55,8 @@ class TestEagerDeletionWhileOpBase(unittest.TestCase): ) if self.with_data_parallel else 1 else: device_cnt = int( - os.environ['CPU_NUM'], - multiprocessing.cpu_count()) if self.with_data_parallel else 1 + os.environ.get('CPU_NUM', multiprocessing.cpu_count( + ))) if self.with_data_parallel else 1 d0 = layers.data( "d0", shape=[10], append_batch_size=False, dtype='float32') diff --git a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py index d44e4627d8..ba3b275c7e 100644 --- a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py +++ 
b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py @@ -18,7 +18,7 @@ os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" os.environ['FLAGS_memory_fraction_of_eager_deletion'] = "0.55" os.environ[ - 'RECORDIO_FILENAME'] = '/tmp/eager_deletion_transformer.wmt16.recordio' + 'RECORDIO_FILENAME'] = '/tmp/partial_eager_deletion_transformer.wmt16.recordio' from test_parallel_executor_transformer import TestTransformer From 7b608396feb0106733295b6ff00cef8d87e505d1 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 6 Mar 2019 07:11:34 +0000 Subject: [PATCH 12/64] fix travis-ci format check test=develop --- paddle/fluid/framework/details/computation_op_handle.h | 1 + paddle/fluid/framework/details/eager_deletion_op_handle.cc | 5 ++++- paddle/fluid/framework/details/eager_deletion_pass.cc | 5 ++++- paddle/fluid/framework/details/reference_count_pass.cc | 4 ++++ .../fluid/framework/details/reference_count_pass_helper.h | 1 + paddle/fluid/framework/executor.cc | 6 +++++- paddle/fluid/operators/controlflow/while_op_helper.cc | 2 ++ .../tests/unittests/test_eager_deletion_transformer.py | 3 +-- .../unittests/test_partial_eager_deletion_transformer.py | 2 +- 9 files changed, 23 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 67f7cb738f..e98b16e6b3 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index e58e501e6d..dbc90737f2 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -12,11 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include +#include + #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/platform/profiler.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" #endif diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index 566bc15c17..377bb915e0 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -25,7 +25,10 @@ #include "paddle/fluid/framework/ir/graph_helper.h" DEFINE_double(memory_fraction_of_eager_deletion, 1.0, - "Fraction of eager deletion"); + "Fraction of eager deletion. If less than 1.0, all variables in " + "the program would be sorted according to its memory size, and " + "only the FLAGS_memory_fraction_of_eager_deletion of the largest " + "variables would be deleted."); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 892f638f1f..6092143449 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -12,9 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
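To make the new flag description concrete, a rough sketch of the selection it implies; the variable names and sizes are invented:

    # Sort variables by memory size and eagerly delete the largest ones until
    # the chosen fraction of the total size is covered.
    fraction = 0.55
    var_sizes = {'fc_0.tmp_1': 400, 'embedding_0.tmp_0': 300,
                 'softmax_0.tmp_0': 200, 'mean_0.tmp_0': 100}
    budget = fraction * sum(var_sizes.values())   # 550.0
    gc_vars, covered = [], 0
    for name, size in sorted(var_sizes.items(), key=lambda kv: -kv[1]):
        if covered >= budget:
            break
        gc_vars.append(name)
        covered += size
    # gc_vars == ['fc_0.tmp_1', 'embedding_0.tmp_0']  (400 + 300 >= 550)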
+#include #include #include #include +#include +#include +#include #include #include "paddle/fluid/framework/details/computation_op_handle.h" diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h index d9e8776d7e..ce700119c5 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.h +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 5555679412..7eef9ec504 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -14,6 +14,10 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" #include +#include +#include +#include +#include #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/lod_rank_table.h" @@ -428,7 +432,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, #ifdef PADDLE_WITH_CUDA } #endif - if (gc) { + if (gc && keep_kids) { operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(ctx->block_id_, ctx->ops_); } diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc index 848ff5e8f1..2cbd94a061 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/operators/controlflow/while_op_helper.h" #include +#include +#include #include "paddle/fluid/framework/program_desc.h" namespace paddle { diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py index 603c8e7488..05cc41b96f 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py @@ -16,8 +16,7 @@ import os import unittest os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" -os.environ[ - 'RECORDIO_FILENAME'] = '/tmp/eager_deletion_transformer.wmt16.recordio' +os.environ['RECORDIO_FILENAME'] = './eager_deletion_transformer.wmt16.recordio' from test_parallel_executor_transformer import TestTransformer diff --git a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py index ba3b275c7e..fc1d762ec9 100644 --- a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py @@ -18,7 +18,7 @@ os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" os.environ['FLAGS_memory_fraction_of_eager_deletion'] = "0.55" os.environ[ - 'RECORDIO_FILENAME'] = '/tmp/partial_eager_deletion_transformer.wmt16.recordio' + 'RECORDIO_FILENAME'] = './partial_eager_deletion_transformer.wmt16.recordio' from test_parallel_executor_transformer import TestTransformer From f49d9b393c6b333d741b2df722fa8b41ab923d5e Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 7 Mar 2019 10:15:49 +0800 Subject: [PATCH 13/64] Transfer GRU unit test=develop --- python/paddle/fluid/imperative/nn.py | 137 ++++++++++++++++++++++++++- 1 file changed, 136 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py index 6c5961cc63..bad2c325be 100644 --- 
a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/imperative/nn.py @@ -22,7 +22,7 @@ from . import layers from ..framework import Variable, OpProtoHolder from ..param_attr import ParamAttr from ..initializer import Normal, Constant -__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding'] +__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit'] class Conv2D(layers.Layer): @@ -496,3 +496,138 @@ class Embedding(layers.Layer): }) return out + + +class GRUUnit(layers.Layer): + """ + **GRU unit layer** + + if origin_mode is True, then the equation of a gru step is from paper + `Learning Phrase Representations using RNN Encoder-Decoder for Statistical + Machine Translation `_ + + .. math:: + u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u) + + r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r) + + m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m) + + h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t) + + if origin_mode is False, then the equation of a gru step is from paper + `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence + Modeling `_ + + .. math:: + u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u) + + r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r) + + m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m) + + h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t) + + + The inputs of gru unit includes :math:`z_t`, :math:`h_{t-1}`. In terms + of the equation above, the :math:`z_t` is split into 3 parts - + :math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to + implement a full GRU unit operator for an input, a fully + connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`. + + The terms :math:`u_t` and :math:`r_t` represent the update and reset gates + of the GRU cell. Unlike LSTM, GRU has one lesser gate. However, there is + an intermediate candidate hidden output, which is denoted by :math:`m_t`. + This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t-1})` + and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`. + + Args: + input (Variable): The fc transformed input value of current step. + hidden (Variable): The hidden value of gru unit from previous step. + size (integer): The input dimension value. + param_attr(ParamAttr|None): The parameter attribute for the learnable + hidden-hidden weight matrix. Note: + + - The shape of the weight matrix is :math:`(T \\times 3D)`, where + :math:`D` is the hidden size. + - All elements in the weight matrix can be divided into two parts. + The first part are weights of the update gate and reset gate with + shape :math:`(D \\times 2D)`, and the second part are weights for + candidate hidden state with shape :math:`(D \\times D)`. + + If it is set to None or one attribute of ParamAttr, gru_unit will + create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias + of GRU.Note that the bias with :math:`(1 \\times 3D)` concatenates + the bias in the update gate, reset gate and candidate calculations. + If it is set to False, no bias will be applied to the update gate, + reset gate and candidate calculations. If it is set to None or one + attribute of ParamAttr, gru_unit will create ParamAttr as + bias_attr. If the Initializer of the bias_attr is not set, the bias + is initialized zero. Default: None. + activation (string): The activation type for cell (actNode). 
+ Default: 'tanh' + gate_activation (string): The activation type for gates (actGate). + Default: 'sigmoid' + + Returns: + tuple: The hidden value, reset-hidden value and gate values. + """ + + def __init__(self, + hidden, + size, + param_attr=None, + bias_attr=None, + activation='tanh', + gate_activation='sigmoid', + origin_mode=False, + dtype='float32'): + + super(GRUUnit, self).__init__() + activation_dict = dict( + identity=0, + sigmoid=1, + tanh=2, + relu=3, ) + activation = activation_dict[activation] + gate_activation = activation_dict[gate_activation] + + helper = LayerHelper('gru_unit', **locals()) + dtype = helper.input_dtype() + size = size // 3 + + # create weight + weight = helper.create_parameter( + attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype) + + gate = helper.create_variable_for_type_inference(dtype) + reset_hidden_pre = helper.create_variable_for_type_inference(dtype) + updated_hidden = helper.create_variable_for_type_inference(dtype) + inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': weight} + # create bias + if helper.bias_attr: + bias_size = [1, 3 * size] + bias = helper.create_parameter( + attr=helper.bias_attr, + shape=bias_size, + dtype=dtype, + is_bias=True) + inputs['Bias'] = bias + + def forward(self, input): + self._helper.append_op( + type='gru_unit', + inputs=inputs, + outputs={ + 'Gate': gate, + 'ResetHiddenPrev': reset_hidden_pre, + 'Hidden': updated_hidden, + }, + attrs={ + 'activation': 2, # tanh + 'gate_activation': 1, # sigmoid + }) + + return updated_hidden, reset_hidden_pre, gate From fad06cb92868a3a3bf78ae00d7bcc499dad7cbce Mon Sep 17 00:00:00 2001 From: luotao1 Date: Thu, 7 Mar 2019 11:57:44 +0800 Subject: [PATCH 14/64] unify ZeroCopy in analysis_test --- paddle/fluid/inference/api/helper.h | 15 +- .../tests/api/analyzer_pyramid_dnn_tester.cc | 3 + .../fluid/inference/tests/api/tester_helper.h | 147 ++++++++++-------- 3 files changed, 96 insertions(+), 69 deletions(-) diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index b92781e4f2..8114754a27 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -127,9 +127,8 @@ static void TensorAssignData(PaddleTensor *tensor, } template -static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, - const std::vector> &data) { - int size{0}; +static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, + const std::vector> &data) { auto *ptr = tensor->mutable_data(PaddlePlace::kCPU); int c = 0; for (const auto &f : data) { @@ -137,7 +136,15 @@ static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, ptr[c++] = v; } } - return size; +} + +template +static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, + const PaddleBuf &data) { + auto *ptr = tensor->mutable_data(PaddlePlace::kCPU); + for (size_t i = 0; i < data.length() / sizeof(T); i++) { + ptr[i] = *(reinterpret_cast(data.data()) + i); + } } static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) { diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc index 3f6c933f2b..df834e75df 100644 --- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc @@ -107,6 +107,9 @@ void SetConfig(AnalysisConfig *cfg) { cfg->DisableGpu(); cfg->SwitchSpecifyInputNames(); cfg->SwitchIrOptim(); + if (FLAGS_zero_copy) { + cfg->SwitchUseFeedFetchOps(false); + } } void SetInput(std::vector> 
*inputs) { diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 2e53fddfe7..3becb4bf68 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -51,6 +51,7 @@ DEFINE_bool(use_analysis, true, DEFINE_bool(record_benchmark, false, "Record benchmark after profiling the model"); DEFINE_double(accuracy, 1e-3, "Result Accuracy."); +DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch."); DECLARE_bool(profile); DECLARE_int32(paddle_num_threads); @@ -198,61 +199,104 @@ void GetInputPerBatch(const std::vector> &in, } } -void TestOneThreadPrediction( - const PaddlePredictor::Config *config, - const std::vector> &inputs, - std::vector *outputs, bool use_analysis = true) { - int batch_size = FLAGS_batch_size; - int num_times = FLAGS_repeat; - auto predictor = CreateTestPredictor(config, use_analysis); +void ConvertPaddleTensorToZeroCopyTensor( + PaddlePredictor *predictor, const std::vector &inputs) { + for (size_t i = 0; i < inputs.size(); i++) { + auto input = inputs[i]; + auto tensor = predictor->GetInputTensor(input.name); + tensor->Reshape(input.shape); + tensor->SetLoD({input.lod}); + if (input.dtype == PaddleDType::INT64) { + ZeroCopyTensorAssignData(tensor.get(), input.data); + } else if (input.dtype == PaddleDType::FLOAT32) { + ZeroCopyTensorAssignData(tensor.get(), input.data); + } else { + LOG(ERROR) << "unsupported feed type " << input.dtype; + } + } +} - // warmup run - LOG(INFO) << "Warm up run..."; - { - Timer warmup_timer; - warmup_timer.tic(); +void PredictionWarmUp(PaddlePredictor *predictor, + const std::vector> &inputs, + std::vector *outputs, int num_threads, + int tid) { + int batch_size = FLAGS_batch_size; + LOG(INFO) << "Running thread " << tid << ", warm up run..."; + if (FLAGS_zero_copy) { + ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]); + } + Timer warmup_timer; + warmup_timer.tic(); + if (!FLAGS_zero_copy) { predictor->Run(inputs[0], outputs, batch_size); - PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1); - if (FLAGS_profile) { - paddle::platform::ResetProfiler(); - } + } else { + predictor->ZeroCopyRun(); } + PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1); + if (FLAGS_profile) { + paddle::platform::ResetProfiler(); + } +} - LOG(INFO) << "Run " << num_times << " times..."; - { - Timer run_timer; - run_timer.tic(); +void PredictionRun(PaddlePredictor *predictor, + const std::vector> &inputs, + std::vector *outputs, int num_threads, + int tid) { + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; + LOG(INFO) << "Thread " << tid << " run " << num_times << " times..."; + Timer run_timer; + double elapsed_time = 0; #ifdef WITH_GPERFTOOLS - ProfilerStart("paddle_inference.prof"); + ProfilerStart("paddle_inference.prof"); #endif - for (int i = 0; i < num_times; i++) { - for (size_t j = 0; j < inputs.size(); j++) { - predictor->Run(inputs[j], outputs, batch_size); + if (!FLAGS_zero_copy) { + run_timer.tic(); + for (size_t i = 0; i < inputs.size(); i++) { + for (int j = 0; j < num_times; j++) { + predictor->Run(inputs[i], outputs, batch_size); } } + elapsed_time = run_timer.toc(); + } else { + for (size_t i = 0; i < inputs.size(); i++) { + ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]); + run_timer.tic(); + for (int j = 0; j < num_times; j++) { + predictor->ZeroCopyRun(); + } + elapsed_time += run_timer.toc(); + } + } #ifdef WITH_GPERFTOOLS - ProfilerStop(); + ProfilerStop(); 
#endif - double latency = run_timer.toc() / (num_times > 1 ? num_times : 1); - PrintTime(batch_size, num_times, 1, 0, latency, inputs.size()); - if (FLAGS_record_benchmark) { - Benchmark benchmark; - benchmark.SetName(FLAGS_model_name); - benchmark.SetBatchSize(batch_size); - benchmark.SetLatency(latency); - benchmark.PersistToFile("benchmark_record.txt"); - } + PrintTime(batch_size, num_times, num_threads, tid, elapsed_time / num_times, + inputs.size()); + if (FLAGS_record_benchmark) { + Benchmark benchmark; + benchmark.SetName(FLAGS_model_name); + benchmark.SetBatchSize(batch_size); + benchmark.SetLatency(elapsed_time / num_times); + benchmark.PersistToFile("benchmark_record.txt"); } } +void TestOneThreadPrediction( + const PaddlePredictor::Config *config, + const std::vector> &inputs, + std::vector *outputs, bool use_analysis = true) { + auto predictor = CreateTestPredictor(config, use_analysis); + PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0); + PredictionRun(predictor.get(), inputs, outputs, 1, 0); +} + void TestMultiThreadPrediction( const PaddlePredictor::Config *config, const std::vector> &inputs, std::vector *outputs, int num_threads, bool use_analysis = true) { - int batch_size = FLAGS_batch_size; - int num_times = FLAGS_repeat; std::vector threads; std::vector> predictors; predictors.emplace_back(CreateTestPredictor(config, use_analysis)); @@ -260,7 +304,6 @@ void TestMultiThreadPrediction( predictors.emplace_back(predictors.front()->Clone()); } - size_t total_time{0}; for (int tid = 0; tid < num_threads; ++tid) { threads.emplace_back([&, tid]() { // Each thread should have local inputs and outputs. @@ -273,34 +316,8 @@ void TestMultiThreadPrediction( ->SetMkldnnThreadID(static_cast(tid) + 1); } #endif - - // warmup run - LOG(INFO) << "Running thread " << tid << ", warm up run..."; - { - Timer warmup_timer; - warmup_timer.tic(); - predictor->Run(inputs[0], outputs, batch_size); - PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1); - if (FLAGS_profile) { - paddle::platform::ResetProfiler(); - } - } - - LOG(INFO) << "Thread " << tid << " run " << num_times << " times..."; - { - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - for (const auto &input : inputs) { - ASSERT_TRUE(predictor->Run(input, &outputs_tid)); - } - } - - auto time = timer.toc(); - total_time += time; - PrintTime(batch_size, num_times, num_threads, tid, time / num_times, - inputs.size()); - } + PredictionWarmUp(predictor.get(), inputs, outputs, num_threads, tid); + PredictionRun(predictor.get(), inputs, outputs, num_threads, tid); }); } for (int i = 0; i < num_threads; ++i) { From d0f8d94ca4c81d765ba0bc47235e25d3456381b1 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 7 Mar 2019 05:09:16 +0000 Subject: [PATCH 15/64] try to fix unittest test=develop --- .../tests/unittests/test_partial_eager_deletion_transformer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py index fc1d762ec9..7607189454 100644 --- a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py @@ -17,8 +17,7 @@ import unittest os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" os.environ['FLAGS_memory_fraction_of_eager_deletion'] = "0.55" -os.environ[ - 'RECORDIO_FILENAME'] = 
'./partial_eager_deletion_transformer.wmt16.recordio' +os.environ['RECORDIO_FILENAME'] = './p_gc_transformer.wmt16.recordio' from test_parallel_executor_transformer import TestTransformer From 802f362ac44e99a06e3c499893f3c8279a367f0d Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 7 Mar 2019 07:21:35 +0000 Subject: [PATCH 16/64] unify the kernelfuncs cache and add unit test test=develop --- paddle/fluid/operators/crf_decoding_op.h | 5 +- .../mkldnn/elementwise_mul_mkldnn_op.cc | 6 ++- .../fused/fused_embedding_seq_pool_op.h | 12 +++-- paddle/fluid/operators/fused/fusion_gru_op.cc | 49 +++++++++-------- .../fluid/operators/fused/fusion_lstm_op.cc | 54 ++++++++++--------- .../fused/fusion_repeated_fc_relu_op.cc | 10 ++-- .../fused/fusion_seqpool_concat_op.cc | 6 +-- .../fused/fusion_squared_mat_sub_op.cc | 32 ++++++----- paddle/fluid/operators/jit/CMakeLists.txt | 2 +- paddle/fluid/operators/jit/benchmark.cc | 2 +- paddle/fluid/operators/jit/helper.h | 34 ++++++++---- paddle/fluid/operators/jit/test.cc | 30 ++++++++--- paddle/fluid/operators/layer_norm_op.h | 6 +-- .../fluid/operators/math/sequence_pooling.cc | 6 +-- paddle/fluid/operators/optimizers/sgd_op.h | 10 ++-- 15 files changed, 158 insertions(+), 106 deletions(-) diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index 72774a878d..3d98790a4d 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -82,8 +82,9 @@ class CRFDecodingOpKernel : public framework::OpKernel { Tensor track; int* track_value = track.mutable_data(emission_dims, platform::CPUPlace()); - auto ker = jit::Get, - platform::CPUPlace>(tag_num); + auto ker = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(tag_num); ker(static_cast(seq_len), x, w, alpha_value, track_value, tag_num); T max_score = -std::numeric_limits::max(); int max_i = 0; diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index 04e8800bbc..e37bbd2837 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -110,8 +110,10 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { constexpr int simd_width = 16; int C = c / simd_width; - auto multiply = jit::Get, - platform::CPUPlace>(0); + auto multiply = + jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(0); #pragma omp parallel for collapse(2) for (int ni = 0; ni < n; ni++) { for (int ci = 0; ci < C; ci++) { diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index f13c020386..fe43545e60 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -52,8 +52,10 @@ struct EmbeddingVSumFunctor { out_width, jit::SeqPoolType::kSum); for (size_t i = 0; i != ids_lod.size() - 1; ++i) { attr.index_height = ids_lod[i + 1] - ids_lod[i]; - auto emb_seqpool = jit::Get, - platform::CPUPlace>(attr); + auto emb_seqpool = + jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr); emb_seqpool(table, ids + ids_lod[i] * idx_width, output + i * out_width, &attr); } @@ -135,8 +137,10 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { T *d_table_data = d_table_value->mutable_data(context.GetPlace()); const T *d_output_data = d_output->data(); - auto vbroadcast = 
jit::Get, - platform::CPUPlace>(out_width); + auto vbroadcast = + jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(out_width); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { int64_t h = static_cast(lod[i + 1] - lod[i]); const T *src = d_output_data + i * out_width; diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 66acba49e5..cd8a6a55d4 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -182,29 +182,32 @@ class FusionGRUKernel : public framework::OpKernel { const int total_T = x_dims[0]; \ const int D3 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - auto* h0 = ctx.Input("H0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* bias = ctx.Input("Bias"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const jit::gru_attr_t attr( \ - D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ - jit::to_kerneltype(ctx.Attr("activation"))); \ - jit::gru_t one_step; \ - auto ComputeH1 = \ - jit::Get, platform::CPUPlace>(attr); \ - auto ComputeHtPart1 = \ - jit::Get, platform::CPUPlace>(attr); \ - auto ComputeHtPart2 = \ - jit::Get, platform::CPUPlace>(attr); \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - auto place = ctx.GetPlace(); \ +#define INIT_OTHER_DEFINES \ + auto* h0 = ctx.Input("H0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* bias = ctx.Input("Bias"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const jit::gru_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("activation"))); \ + jit::gru_t one_step; \ + auto ComputeH1 = jit::KernelFuncs, \ + platform::CPUPlace>::Cache() \ + .At(attr); \ + auto ComputeHtPart1 = jit::KernelFuncs, \ + platform::CPUPlace>::Cache() \ + .At(attr); \ + auto ComputeHtPart2 = jit::KernelFuncs, \ + platform::CPUPlace>::Cache() \ + .At(attr); \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + auto place = ctx.GetPlace(); \ T* xx_data = xx->mutable_data(place) void SeqCompute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index b11b7c11bf..d7d12df4bf 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -235,32 +235,34 @@ class FuisonLSTMKernel : public framework::OpKernel { const int D = wh_dims[0]; \ const int D4 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wp_data = bias->data() + D4; \ - /* for peephole only*/ \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - auto* checked_cell = ctx.Output("CheckedCell"); \ - checked_cell_data = checked_cell->mutable_data(place); \ - } \ - const jit::lstm_attr_t attr( \ - D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ - jit::to_kerneltype(ctx.Attr("candidate_activation")), \ - jit::to_kerneltype(ctx.Attr("cell_activation")), \ - use_peepholes); \ - jit::lstm_t 
one_step; \ - one_step.wp = wp_data; \ - one_step.checked = checked_cell_data; \ - auto ComputeC1H1 = \ - jit::Get, platform::CPUPlace>(attr); \ - auto ComputeCtHt = \ - jit::Get, platform::CPUPlace>(attr) +#define INIT_OTHER_DEFINES \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wp_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ + } \ + const jit::lstm_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("candidate_activation")), \ + jit::to_kerneltype(ctx.Attr("cell_activation")), \ + use_peepholes); \ + jit::lstm_t one_step; \ + one_step.wp = wp_data; \ + one_step.checked = checked_cell_data; \ + auto ComputeC1H1 = jit::KernelFuncs, \ + platform::CPUPlace>::Cache() \ + .At(attr); \ + auto ComputeCtHt = jit::KernelFuncs, \ + platform::CPUPlace>::Cache() \ + .At(attr) // Wh GEMM #define GEMM_WH_ADDON(bs, prev, out) \ diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc index 8ecdf2ed9d..e057724b5a 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc @@ -81,10 +81,12 @@ void FusionRepeatedFCReluOpMaker::Make() { template static void fc_relu(const T* x, const T* w, const T* b, T* y, const jit::matmul_attr_t& attr) { - auto matmul = - jit::Get, platform::CPUPlace>(attr); - auto addbias_relu = - jit::Get, platform::CPUPlace>(attr.n); + auto matmul = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr); + auto addbias_relu = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr.n); matmul(x, w, y, &attr); T* dst = y; for (int i = 0; i < attr.m; ++i) { diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc index d48bdafe0a..7aeeabc512 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc @@ -97,9 +97,9 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel { } else if (pooltype == "SQRT") { attr.type = jit::SeqPoolType::kSqrt; } - auto seqpool = - jit::Get, platform::CPUPlace>( - attr); + auto seqpool = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr); size_t n = ins.size(); size_t dst_step_size = n * w; for (size_t i = 0; i < n; ++i) { diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc index 8493f4468f..9382bf0ebb 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc @@ -93,20 +93,24 @@ class FusionSquaredMatSubKernel : public framework::OpKernel { attr.n = y_dims[1]; int o_numel = attr.m * attr.n; - auto vsquare_x = - jit::Get, platform::CPUPlace>(attr.m * - attr.k); - auto vsquare_y = - jit::Get, platform::CPUPlace>(attr.k * - attr.n); - auto vsquare_xy = - jit::Get, platform::CPUPlace>(o_numel); - auto vsub = - jit::Get, platform::CPUPlace>(o_numel); - auto vscal = - jit::Get, platform::CPUPlace>(o_numel); - auto matmul = - jit::Get, platform::CPUPlace>(attr); + auto vsquare_x = 
jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr.m * attr.k); + auto vsquare_y = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr.k * attr.n); + auto vsquare_xy = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(o_numel); + auto vsub = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(o_numel); + auto vscal = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(o_numel); + auto matmul = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr); const T* x_data = x->data(); const T* y_data = y->data(); diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt index 35775d7ec9..47d6c83f2a 100644 --- a/paddle/fluid/operators/jit/CMakeLists.txt +++ b/paddle/fluid/operators/jit/CMakeLists.txt @@ -5,7 +5,7 @@ file(APPEND ${jit_file} "\#pragma once\n") file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n") file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n") -set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place) +set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place xxhash) file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 3088280bb9..deb96ee6cd 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -142,7 +142,7 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { } } // Test result from Get function - auto tgt = jit::Get(attr); + auto tgt = jit::KernelFuncs::Cache().At(attr); if (!tgt) { LOG(FATAL) << "Target can not be empty!"; } diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index d85c719c1c..1af1add3ee 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -14,6 +14,9 @@ #pragma once +extern "C" { +#include +} #include #include #include @@ -127,23 +130,36 @@ class KernelFuncs { return g_func_cache; } - bool Has(int key) const { return funcs_.find(key) != funcs_.end(); } - - void Insert(int key, typename KernelTuples::func_type func) { - funcs_.emplace(key, func); - } - - typename KernelTuples::func_type At(int key) { + // the exposed interface to use + typename KernelTuples::func_type At( + const typename KernelTuples::attr_type& attr) { + // XXH64: 13.8 GB/s + int64_t key = XXH64(&attr, sizeof(typename KernelTuples::attr_type), 0); if (Has(key)) { return funcs_.at(key); } - auto func = Get(key); + // If do not have this attr in cache, + // then could run some runtime benchmark of this attr and save the best one. + // Here just get the offline benchmarked best one. 
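    // A minimal sketch of the intended behavior (template arguments are
    // abbreviated here; the exact spelling follows the class declaration
    // above): repeated lookups with an equal attr return the same function,
    //   auto f1 = KernelFuncs<...>::Cache().At(attr);  // miss: Get() then Insert()
    //   auto f2 = KernelFuncs<...>::Cache().At(attr);  // hit: read from funcs_
    //   // f1 == f2, since equal attrs hash to the same XXH64 key
    // which is what the new TEST(JITKernel, kernel_func) in test.cc checks.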
+ auto func = Get(attr); Insert(key, func); return func; } + typename KernelTuples::func_type operator[]( + const typename KernelTuples::attr_type& attr) { + return At(attr); + } + + protected: + bool Has(int64_t key) const { return funcs_.find(key) != funcs_.end(); } + + void Insert(int64_t key, typename KernelTuples::func_type func) { + funcs_.emplace(key, func); + } + private: - std::unordered_map funcs_; + std::unordered_map funcs_; DISABLE_COPY_AND_ASSIGN(KernelFuncs); }; diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index cdec14dc43..18f8c09f14 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -462,7 +462,7 @@ void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { } // test result from Get function // VLOG(10) << "Test Get function "; - auto tgt = jit::Get(attr); + auto tgt = jit::KernelFuncs::Cache().At(attr); test(tgt, args...); } @@ -845,7 +845,9 @@ void TestKernelNCHW16CMulNCTuples() { T* zjit_data = zjit.data(); constexpr int simd_width = ZMM_FLOAT_BLOCK; int C = c / simd_width; - auto tgt = jit::Get, PlaceType>(0); + auto tgt = + jit::KernelFuncs, PlaceType>::Cache().At( + 0); auto jitcode = jit::GetJitCode, PlaceType>(0); EXPECT_TRUE(tgt != nullptr); @@ -967,10 +969,10 @@ void TestKernelVBroadcastTuples() { } } -#define TEST_CPU_KERNEL(test_tuple, kernel_type) \ - TEST(JITKernel, kernel_type) { \ - TestKernel##test_tuple(); \ - TestKernel##test_tuple(); \ +#define TEST_CPU_KERNEL(test_tuple, kernel_type) \ + TEST(JITKernel, kernel_type) { \ + TestKernel##test_tuple(); \ + TestKernel##test_tuple(); \ } TEST_CPU_KERNEL(XYZNTuples, kVMul); @@ -1041,4 +1043,18 @@ TEST(JITKernel_key, gru) { EXPECT_TRUE(key2 == key3); EXPECT_TRUE(key3 != key4); } -// TODO(TJ): add more test about key and pool + +TEST(JITKernel, kernel_func) { + auto f1 = + jit::KernelFuncs, CPUPlace>::Cache() + .At(3); + auto f2 = jit::KernelFuncs, + CPUPlace>::Cache()[3]; + EXPECT_TRUE(f1 == f2); + + f1 = jit::KernelFuncs, CPUPlace>::Cache() + .At(3); + f2 = jit::KernelFuncs, CPUPlace>::Cache() + .At(4); + EXPECT_TRUE(f1 != f2); +} diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h index f564a10396..f0c3064d41 100644 --- a/paddle/fluid/operators/layer_norm_op.h +++ b/paddle/fluid/operators/layer_norm_op.h @@ -229,9 +229,9 @@ class LayerNormKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(scale->numel(), right); PADDLE_ENFORCE_EQ(bias->numel(), right); - auto ker = - jit::Get, platform::CPUPlace>( - right); + auto ker = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(right); ker(x.data(), out.data(), mean->data(), var->data(), scale->data(), bias->data(), static_cast(left), static_cast(epsilon), right); diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 2a47502614..db103e5fab 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -255,9 +255,9 @@ class SequencePoolFunctor { jit::seq_pool_attr_t attr( static_cast(input.numel() / input.dims()[0]), jit::SeqPoolType::kSum); - auto seqpool = - jit::Get, platform::CPUPlace>( - attr); + auto seqpool = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { attr.h = static_cast(lod[i + 1] - lod[i]); seqpool(src, dst, &attr); diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h 
index c9c9f530fe..0425a3d194 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -47,8 +47,9 @@ class SGDOpKernel : public framework::OpKernel { int64_t rows_idx = 0; T *out_data = param_out->mutable_data(ctx.GetPlace()); - auto sgd = - jit::Get, platform::CPUPlace>(attr); + auto sgd = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr); sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr); } else if (grad_var->IsType()) { // TODO(qijun): In Sparse SGD operator, in-place update is enforced. @@ -81,8 +82,9 @@ class SGDOpKernel : public framework::OpKernel { attr.selected_rows_size = grad_rows.size(); PADDLE_ENFORCE_EQ(attr.grad_width, attr.param_width); - auto sgd = - jit::Get, platform::CPUPlace>(attr); + auto sgd = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr); sgd(lr, param_data, grad_data, rows_data, out_data, &attr); } else { PADDLE_THROW("Unsupported Variable Type of Grad"); From 14a764c930c4ff895168d482db51c21b6338f283 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 8 Mar 2019 08:04:04 +0000 Subject: [PATCH 17/64] simplify the jitkernel templates and tests test=develop --- paddle/fluid/operators/crf_decoding_op.h | 6 +- .../mkldnn/elementwise_mul_mkldnn_op.cc | 7 +- .../fused/fused_embedding_seq_pool_op.h | 6 +- paddle/fluid/operators/fused/fusion_gru_op.cc | 52 +- .../fluid/operators/fused/fusion_lstm_op.cc | 56 +- .../fused/fusion_repeated_fc_relu_op.cc | 12 +- .../fused/fusion_seqpool_concat_op.cc | 6 +- .../fused/fusion_squared_mat_sub_op.cc | 36 +- paddle/fluid/operators/jit/benchmark.cc | 269 ++-- paddle/fluid/operators/jit/helper.h | 71 +- paddle/fluid/operators/jit/kernel_base.h | 93 +- .../jit/more/intrinsic/crf_decoding.h | 5 +- .../operators/jit/more/intrinsic/layer_norm.h | 4 +- paddle/fluid/operators/jit/more/mix/mix.cc | 68 +- paddle/fluid/operators/jit/more/mix/mix.h | 28 +- paddle/fluid/operators/jit/more/mkl/mkl.cc | 32 +- paddle/fluid/operators/jit/more/mkl/mkl.h | 49 +- paddle/fluid/operators/jit/refer/refer.cc | 80 +- paddle/fluid/operators/jit/refer/refer.h | 80 +- paddle/fluid/operators/jit/test.cc | 1282 ++++++++--------- paddle/fluid/operators/layer_norm_op.h | 6 +- paddle/fluid/operators/math/fc_compute.h | 11 +- .../fluid/operators/math/sequence_pooling.cc | 6 +- paddle/fluid/operators/math/softmax_impl.h | 3 +- paddle/fluid/operators/optimizers/sgd_op.h | 12 +- 25 files changed, 1135 insertions(+), 1145 deletions(-) diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index 3d98790a4d..d6b54038ec 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -82,9 +82,9 @@ class CRFDecodingOpKernel : public framework::OpKernel { Tensor track; int* track_value = track.mutable_data(emission_dims, platform::CPUPlace()); - auto ker = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(tag_num); + auto ker = + jit::KernelFuncs, platform::CPUPlace>::Cache() + .At(tag_num); ker(static_cast(seq_len), x, w, alpha_value, track_value, tag_num); T max_score = -std::numeric_limits::max(); int max_i = 0; diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index e37bbd2837..f2f4d3fee0 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -110,10 +110,9 @@ class 
ElementwiseMulMKLDNNKernel : public framework::OpKernel { constexpr int simd_width = 16; int C = c / simd_width; - auto multiply = - jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(0); + auto multiply = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(0); #pragma omp parallel for collapse(2) for (int ni = 0; ni < n; ni++) { for (int ci = 0; ci < C; ci++) { diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index fe43545e60..5e2e336e71 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -53,8 +53,7 @@ struct EmbeddingVSumFunctor { for (size_t i = 0; i != ids_lod.size() - 1; ++i) { attr.index_height = ids_lod[i + 1] - ids_lod[i]; auto emb_seqpool = - jit::KernelFuncs, - platform::CPUPlace>::Cache() + jit::KernelFuncs, platform::CPUPlace>::Cache() .At(attr); emb_seqpool(table, ids + ids_lod[i] * idx_width, output + i * out_width, &attr); @@ -138,8 +137,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { const T *d_output_data = d_output->data(); auto vbroadcast = - jit::KernelFuncs, - platform::CPUPlace>::Cache() + jit::KernelFuncs, platform::CPUPlace>::Cache() .At(out_width); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { int64_t h = static_cast(lod[i + 1] - lod[i]); diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index cd8a6a55d4..ba5f0747c4 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -182,32 +182,32 @@ class FusionGRUKernel : public framework::OpKernel { const int total_T = x_dims[0]; \ const int D3 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - auto* h0 = ctx.Input("H0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* bias = ctx.Input("Bias"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const jit::gru_attr_t attr( \ - D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ - jit::to_kerneltype(ctx.Attr("activation"))); \ - jit::gru_t one_step; \ - auto ComputeH1 = jit::KernelFuncs, \ - platform::CPUPlace>::Cache() \ - .At(attr); \ - auto ComputeHtPart1 = jit::KernelFuncs, \ - platform::CPUPlace>::Cache() \ - .At(attr); \ - auto ComputeHtPart2 = jit::KernelFuncs, \ - platform::CPUPlace>::Cache() \ - .At(attr); \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - auto place = ctx.GetPlace(); \ +#define INIT_OTHER_DEFINES \ + auto* h0 = ctx.Input("H0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* bias = ctx.Input("Bias"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const jit::gru_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("activation"))); \ + jit::gru_t one_step; \ + auto ComputeH1 = \ + jit::KernelFuncs, platform::CPUPlace>::Cache().At( \ + attr); \ + auto ComputeHtPart1 = \ + jit::KernelFuncs, platform::CPUPlace>::Cache() \ + .At(attr); \ + auto ComputeHtPart2 = \ + jit::KernelFuncs, platform::CPUPlace>::Cache() \ + .At(attr); \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + auto place = ctx.GetPlace(); \ T* xx_data = 
xx->mutable_data(place) void SeqCompute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index d7d12df4bf..c8c07bd126 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -235,34 +235,34 @@ class FuisonLSTMKernel : public framework::OpKernel { const int D = wh_dims[0]; \ const int D4 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wp_data = bias->data() + D4; \ - /* for peephole only*/ \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - auto* checked_cell = ctx.Output("CheckedCell"); \ - checked_cell_data = checked_cell->mutable_data(place); \ - } \ - const jit::lstm_attr_t attr( \ - D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ - jit::to_kerneltype(ctx.Attr("candidate_activation")), \ - jit::to_kerneltype(ctx.Attr("cell_activation")), \ - use_peepholes); \ - jit::lstm_t one_step; \ - one_step.wp = wp_data; \ - one_step.checked = checked_cell_data; \ - auto ComputeC1H1 = jit::KernelFuncs, \ - platform::CPUPlace>::Cache() \ - .At(attr); \ - auto ComputeCtHt = jit::KernelFuncs, \ - platform::CPUPlace>::Cache() \ - .At(attr) +#define INIT_OTHER_DEFINES \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wp_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ + } \ + const jit::lstm_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("candidate_activation")), \ + jit::to_kerneltype(ctx.Attr("cell_activation")), \ + use_peepholes); \ + jit::lstm_t one_step; \ + one_step.wp = wp_data; \ + one_step.checked = checked_cell_data; \ + auto ComputeC1H1 = \ + jit::KernelFuncs, platform::CPUPlace>::Cache().At( \ + attr); \ + auto ComputeCtHt = \ + jit::KernelFuncs, platform::CPUPlace>::Cache().At( \ + attr) // Wh GEMM #define GEMM_WH_ADDON(bs, prev, out) \ diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc index e057724b5a..6be35de65f 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc @@ -81,12 +81,12 @@ void FusionRepeatedFCReluOpMaker::Make() { template static void fc_relu(const T* x, const T* w, const T* b, T* y, const jit::matmul_attr_t& attr) { - auto matmul = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(attr); - auto addbias_relu = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(attr.n); + auto matmul = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); + auto addbias_relu = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr.n); matmul(x, w, y, &attr); T* dst = y; for (int i = 0; i < attr.m; ++i) { diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc index 7aeeabc512..25916768c0 100644 --- 
a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc @@ -97,9 +97,9 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel { } else if (pooltype == "SQRT") { attr.type = jit::SeqPoolType::kSqrt; } - auto seqpool = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(attr); + auto seqpool = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); size_t n = ins.size(); size_t dst_step_size = n * w; for (size_t i = 0; i < n; ++i) { diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc index 9382bf0ebb..53679ebdde 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc @@ -93,24 +93,24 @@ class FusionSquaredMatSubKernel : public framework::OpKernel { attr.n = y_dims[1]; int o_numel = attr.m * attr.n; - auto vsquare_x = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(attr.m * attr.k); - auto vsquare_y = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(attr.k * attr.n); - auto vsquare_xy = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(o_numel); - auto vsub = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(o_numel); - auto vscal = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(o_numel); - auto matmul = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(attr); + auto vsquare_x = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr.m * attr.k); + auto vsquare_y = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr.k * attr.n); + auto vsquare_xy = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + o_numel); + auto vsub = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + o_numel); + auto vscal = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + o_numel); + auto matmul = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); const T* x_data = x->data(); const T* y_data = y->data(); diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index deb96ee6cd..773cf38eb9 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -59,8 +59,6 @@ BenchJITKernel* InsertBenchmark(BenchJITKernel* b) { InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_()); \ void BenchJITKernel_##name##_##dtype##_##place##_::Run() -#define BENCH_FP32_CPU(name) BENCH_JITKERNEL(name, FP32, CPU) - void RUN_ALL_BENCHMARK() { for (auto p : g_all_benchmarks) { if (!FLAGS_filter.empty() && FLAGS_filter != p->Name()) { @@ -90,11 +88,11 @@ std::vector TestSizes() { return s; } -template +template struct BenchFunc { // return this function avg time // TODO(TJ): clear cache every time - double operator()(const typename KernelTuples::func_type tgt, Args... args) { + double operator()(const typename KernelTuple::func_type tgt, Args... args) { for (int i = 0; i < FLAGS_burning; ++i) { tgt(args...); } @@ -109,31 +107,30 @@ struct BenchFunc { namespace jit = paddle::operators::jit; -template -void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { - BenchFunc benchmark; +template +void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... 
args) { + BenchFunc benchmark; std::vector> infos; // test refer - auto refer = jit::GetRefer(); + auto refer = jit::GetRefer(); if (!refer) { LOG(FATAL) << "Refer can not be empty!"; } infos.push_back(std::make_pair("Refer", benchmark(refer, args...))); // test jitcode - auto jitcode = jit::GetJitCode(attr); + auto jitcode = jit::GetJitCode(attr); if (jitcode) { infos.push_back(std::make_pair("JitCode", benchmark(jitcode, args...))); } // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); + jit::KernelKey kkey(KernelTuple::kernel_type, PlaceType()); auto& pool = jit::KernelPool().Instance().AllKernels(); auto iter = pool.find(kkey); if (iter != pool.end()) { auto& impls = iter->second; for (auto& impl : impls) { - auto i = dynamic_cast*>(impl.get()); + auto i = dynamic_cast*>(impl.get()); if (i && i->UseMe(attr)) { auto more = i->GetFunc(); infos.push_back( @@ -142,7 +139,7 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { } } // Test result from Get function - auto tgt = jit::KernelFuncs::Cache().At(attr); + auto tgt = jit::KernelFuncs::Cache().At(attr); if (!tgt) { LOG(FATAL) << "Target can not be empty!"; } @@ -150,7 +147,8 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { // print std::ostringstream loginfos; - loginfos << "Kernel Type " << jit::to_string(KT) << ": " << attr << ": "; + loginfos << "Kernel Type " << jit::to_string(KernelTuple::kernel_type) << ": " + << attr << ": "; for (auto pair : infos) { loginfos << pair.first << " takes " << pair.second << " us; "; } @@ -159,8 +157,9 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { using Tensor = paddle::framework::Tensor; -template -void BenchXYZNKernel() { +template +void BenchKernelXYZN() { + using T = typename KernelTuple::data_type; for (int d : TestSizes()) { Tensor x, y, z; x.Resize({d}); @@ -171,16 +170,16 @@ void BenchXYZNKernel() { T* z_data = z.mutable_data(PlaceType()); RandomVec(d, x_data); RandomVec(d, y_data); - BenchAllImpls, PlaceType>(d, x.data(), - y.data(), z_data, d); + BenchAllImpls(d, x.data(), y.data(), z_data, + d); // test inplace - BenchAllImpls, PlaceType>(d, x.data(), z_data, - z_data, d); + BenchAllImpls(d, x.data(), z_data, z_data, d); } } -template -void BenchAXYNKernel() { +template +void BenchKernelAXYN() { + using T = typename KernelTuple::data_type; for (int d : TestSizes()) { const T a = static_cast(3); Tensor x, y; @@ -189,26 +188,26 @@ void BenchAXYNKernel() { T* x_data = x.mutable_data(PlaceType()); T* y_data = y.mutable_data(PlaceType()); RandomVec(d, x_data); - BenchAllImpls, PlaceType>(d, &a, x.data(), y_data, - d); + BenchAllImpls(d, &a, x.data(), y_data, d); // test inplace - BenchAllImpls, PlaceType>(d, &a, x.data(), x_data, - d); + BenchAllImpls(d, &a, x.data(), x_data, d); } } -template -void BenchXRNKernel() { +template +void BenchKernelXRN() { + using T = typename KernelTuple::data_type; for (int d : TestSizes()) { Tensor x; RandomVec(d, x.mutable_data({d}, PlaceType())); T res; - BenchAllImpls, PlaceType>(d, x.data(), &res, d); + BenchAllImpls(d, x.data(), &res, d); } } -template -void BenchXYNKernel() { +template +void BenchKernelXYN() { + using T = typename KernelTuple::data_type; for (int d : TestSizes()) { Tensor x, y; x.Resize({d}); @@ -216,12 +215,13 @@ void BenchXYNKernel() { T* x_data = x.mutable_data(PlaceType()); T* y_data = y.mutable_data(PlaceType()); RandomVec(d, x_data); - BenchAllImpls, PlaceType>(d, x.data(), y_data, d); + BenchAllImpls(d, x.data(), 
y_data, d); } } -template -void BenchLSTMKernel() { +template +void BenchKernelLSTM() { + using T = typename KernelTuple::data_type; for (bool use_peephole : {true, false}) { for (int d : TestSizes()) { const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh, @@ -252,13 +252,14 @@ void BenchLSTMKernel() { step.wp = wp_data; step.checked = checked_data; } - BenchAllImpls, PlaceType>(attr, &step, &attr); + BenchAllImpls(attr, &step, &attr); } } } -template -void BenchGRUKernel() { +template +void BenchKernelGRU() { + using T = typename KernelTuple::data_type; for (int d : TestSizes()) { const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh); auto place = PlaceType(); @@ -275,12 +276,13 @@ void BenchGRUKernel() { step.gates = x_data; step.ht_1 = ht_1_data; step.ht = ht_data; - BenchAllImpls, PlaceType>(attr, &step, &attr); + BenchAllImpls(attr, &step, &attr); } } -template -void BenchSeqPoolKernel() { +template +void BenchKernelSeqPool() { + using T = typename KernelTuple::data_type; std::vector pool_types = { jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; for (auto type : pool_types) { @@ -294,15 +296,15 @@ void BenchSeqPoolKernel() { RandomVec(h * w, x.mutable_data(PlaceType()), -2.f, 2.f); const T* x_data = x.data(); T* y_data = y.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>(attr, x_data, - y_data, &attr); + BenchAllImpls(attr, x_data, y_data, &attr); } } } } -template -void BenchEmbSeqPoolKernel() { +template +void BenchKernelEmbSeqPool() { + using T = typename KernelTuple::data_type; std::vector pool_types = {jit::SeqPoolType::kSum}; int64_t tbl_h = 1e4; for (int tbl_w : {10, 16, 256}) { @@ -324,16 +326,17 @@ void BenchEmbSeqPoolKernel() { tbl_h - 1); const int64_t* idx_data = idx.data(); T* o_data = out.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>( - attr, table_data, idx_data, o_data, &attr); + BenchAllImpls(attr, table_data, idx_data, + o_data, &attr); } } } } } -template -void BenchSgdKernel() { +template +void BenchKernelSgd() { + using T = typename KernelTuple::data_type; const T lr = 0.1; auto UnDuplicatedRandomVec = [](int n, const int64_t lower, const int64_t upper) -> std::vector { @@ -364,15 +367,16 @@ void BenchSgdKernel() { const T* grad_data = grad.data(); const int64_t* rows_data = rows.data(); jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size); - BenchAllImpls, PlaceType>( - attr, &lr, param_data, grad_data, rows_data, param_data, &attr); + BenchAllImpls(attr, &lr, param_data, grad_data, + rows_data, param_data, &attr); } } } } -template -void BenchMatMulKernel() { +template +void BenchKernelMatMul() { + using T = typename KernelTuple::data_type; for (int m : {1, 2, 3, 4}) { for (int n : TestSizes()) { for (int k : TestSizes()) { @@ -386,15 +390,16 @@ void BenchMatMulKernel() { const T* b_data = b.data(); T* c_data = c.mutable_data(PlaceType()); const jit::matmul_attr_t attr{m, n, k}; - BenchAllImpls, PlaceType>(attr, a_data, b_data, - c_data, &attr); + BenchAllImpls(attr, a_data, b_data, c_data, + &attr); } } } } -template -void BenchSoftmaxKernel() { +template +void BenchKernelSoftmax() { + using T = typename KernelTuple::data_type; for (int bs : {1, 2, 10}) { for (int n : TestSizes()) { Tensor x, y; @@ -403,14 +408,14 @@ void BenchSoftmaxKernel() { RandomVec(bs * n, x.mutable_data(PlaceType()), -2.f, 2.f); const T* x_data = x.data(); T* y_data = y.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>(n, x_data, y_data, n, - bs); + BenchAllImpls(n, x_data, y_data, n, bs); } } } 
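// The per-kernel benchmark templates above are instantiated once per tuple by
// the BENCH_FP32_CPU macro defined further down; a minimal sketch of what
// BENCH_FP32_CPU(Softmax) roughly expands to (float tuple assumed):
//   BenchKernelSoftmax<jit::SoftmaxTuple<float>, CPUPlace>();
// Benchmarking another data type would only need a second macro that picks a
// different tuple, e.g. jit::SoftmaxTuple<double>.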
-template -void BenchLayerNormKernel() { +template +void BenchKernelLayerNorm() { + using T = typename KernelTuple::data_type; const T epsilon = 9.99999975e-06; for (int n : {1, 2, 10}) { for (int x_dim_0 : {1, 9, 17, 50}) { @@ -439,16 +444,17 @@ void BenchLayerNormKernel() { T* var_data = var.data(); T* out_data = out.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>( - right, x_data, out_data, mean_data, var_data, scale_data, bias_data, - left, epsilon, right); + BenchAllImpls(right, x_data, out_data, + mean_data, var_data, scale_data, + bias_data, left, epsilon, right); } } } } -template -void BenchCRFDecodingKernel() { +template +void BenchKernelCRFDecoding() { + using T = typename KernelTuple::data_type; constexpr int state_trans_base_idx = 2; for (int seq_len : {1, 11, 17, 50}) { for (int tag_num : TestSizes()) { @@ -468,14 +474,15 @@ void BenchCRFDecodingKernel() { T* alpha_data = alpha.mutable_data(PlaceType()); int* track_data = track.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>( - tag_num, seq_len, x_data, w_data, alpha_data, track_data, tag_num); + BenchAllImpls(tag_num, seq_len, x_data, w_data, + alpha_data, track_data, tag_num); } } } -template -void BenchVBroadcastKernel() { +template +void BenchKernelVBroadcast() { + using T = typename KernelTuple::data_type; for (int64_t w : {1, 16, 64, 100, 256}) { Tensor x; x.Resize({w}); @@ -485,78 +492,86 @@ void BenchVBroadcastKernel() { Tensor y; y.Resize({h * w}); T* y_data = y.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>( - w, x_data, y_data, static_cast(h), w); + BenchAllImpls(w, x_data, y_data, + static_cast(h), w); } } } -using T = float; -using CPUPlace = paddle::platform::CPUPlace; +#define BenchKernelVMul BenchKernelXYZN +#define BenchKernelVAdd BenchKernelXYZN +#define BenchKernelVAddRelu BenchKernelXYZN +#define BenchKernelVSub BenchKernelXYZN -// xyzn -BENCH_FP32_CPU(kVMul) { BenchXYZNKernel(); } -BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel(); } -BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel(); } -BENCH_FP32_CPU(kVSub) { BenchXYZNKernel(); } +#define BenchKernelVScal BenchKernelAXYN +#define BenchKernelVAddBias BenchKernelAXYN -// axyn -BENCH_FP32_CPU(kVScal) { BenchAXYNKernel(); } -BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel(); } +#define BenchKernelVRelu BenchKernelXYN +#define BenchKernelVIdentity BenchKernelXYN +#define BenchKernelVSquare BenchKernelXYN +#define BenchKernelVExp BenchKernelXYN +#define BenchKernelVSigmoid BenchKernelXYN +#define BenchKernelVTanh BenchKernelXYN +#define BenchKernelVCopy BenchKernelXYN -// xrn -BENCH_FP32_CPU(kHSum) { BenchXRNKernel(); } -BENCH_FP32_CPU(kHMax) { BenchXRNKernel(); } +#define BenchKernelHMax BenchKernelXRN +#define BenchKernelHSum BenchKernelXRN -// xyn -BENCH_FP32_CPU(kVRelu) { BenchXYNKernel(); } -BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel(); } -BENCH_FP32_CPU(kVSquare) { BenchXYNKernel(); } -BENCH_FP32_CPU(kVExp) { BenchXYNKernel(); } -BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel(); } -BENCH_FP32_CPU(kVTanh) { BenchXYNKernel(); } -BENCH_FP32_CPU(kVCopy) { BenchXYNKernel(); } - -// lstm and peephole -BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel(); } -BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel(); } - -// gru functions -BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel(); } -BENCH_FP32_CPU(kGRUHtPart1) { BenchGRUKernel(); } -BENCH_FP32_CPU(kGRUHtPart2) { BenchGRUKernel(); } - -// seq pool function -BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel(); } - -// embedding seq pool function -BENCH_FP32_CPU(kEmbSeqPool) { - BenchEmbSeqPoolKernel(); -} +#define 
BenchKernelLSTMCtHt BenchKernelLSTM +#define BenchKernelLSTMC1H1 BenchKernelLSTM -// sgd function -BENCH_FP32_CPU(kSgd) { BenchSgdKernel(); } +#define BenchKernelGRUH1 BenchKernelGRU +#define BenchKernelGRUHtPart1 BenchKernelGRU +#define BenchKernelGRUHtPart2 BenchKernelGRU -// matmul -BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel(); } +using CPUPlace = paddle::platform::CPUPlace; -// softmax -BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel(); } +#define BENCH_FP32_CPU(name) \ + BENCH_JITKERNEL(name, FP32, CPU) { \ + BenchKernel##name, CPUPlace>(); \ + } -// layernorm -BENCH_FP32_CPU(kLayerNorm) { - BenchLayerNormKernel(); -} +// xyzn +BENCH_FP32_CPU(VMul); +BENCH_FP32_CPU(VAdd); +BENCH_FP32_CPU(VAddRelu); +BENCH_FP32_CPU(VSub); -// crfdecoding -BENCH_FP32_CPU(kCRFDecoding) { - BenchCRFDecodingKernel(); -} +// axyn +BENCH_FP32_CPU(VScal); +BENCH_FP32_CPU(VAddBias); -// vbroadcast function -BENCH_FP32_CPU(kVBroadcast) { - BenchVBroadcastKernel(); -} +// xyn +BENCH_FP32_CPU(VRelu); +BENCH_FP32_CPU(VIdentity); +BENCH_FP32_CPU(VSquare); +BENCH_FP32_CPU(VExp); +BENCH_FP32_CPU(VSigmoid); +BENCH_FP32_CPU(VTanh); +BENCH_FP32_CPU(VCopy); + +// xrn +BENCH_FP32_CPU(HMax); +BENCH_FP32_CPU(HSum); + +// LSTM +BENCH_FP32_CPU(LSTMCtHt); +BENCH_FP32_CPU(LSTMC1H1); + +// GRU +BENCH_FP32_CPU(GRUH1); +BENCH_FP32_CPU(GRUHtPart1); +BENCH_FP32_CPU(GRUHtPart2); + +BENCH_FP32_CPU(LayerNorm); +BENCH_FP32_CPU(CRFDecoding); + +BENCH_FP32_CPU(SeqPool); +BENCH_FP32_CPU(EmbSeqPool); +BENCH_FP32_CPU(MatMul); +BENCH_FP32_CPU(Softmax); +BENCH_FP32_CPU(Sgd); +BENCH_FP32_CPU(VBroadcast); // Benchmark all jit kernels including jitcode, mkl and refer. // To use this tool, run command: ./benchmark [options...] diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 1af1add3ee..85f4072dd3 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -19,6 +19,8 @@ extern "C" { } #include #include +#include +#include // for std::move #include #include "paddle/fluid/operators/jit/gen_base.h" #include "paddle/fluid/operators/jit/kernel_base.h" @@ -30,22 +32,22 @@ namespace paddle { namespace operators { namespace jit { -template +template inline typename std::enable_if< - std::is_same::value && + std::is_same::value && std::is_same::value, - typename KernelTuples::func_type>::type -GetJitCode(const typename KernelTuples::attr_type& attr) { - using Func = typename KernelTuples::func_type; - using Attr = typename KernelTuples::attr_type; + typename KernelTuple::func_type>::type +GetJitCode(const typename KernelTuple::attr_type& attr) { + using Func = typename KernelTuple::func_type; + using Attr = typename KernelTuple::attr_type; size_t key = JitCodeKey(attr); - auto& codes = JitCodePool().Instance(); + auto& codes = JitCodePool().Instance(); if (codes.Has(key)) { return codes.AllKernels().at(key)->template getCode(); } // creator is not related with attr, so can use KernelKey as key - KernelKey kkey(KT, PlaceType()); + KernelKey kkey(KernelTuple::kernel_type, PlaceType()); // pool: (KernelKey(type, place), vector) auto& creator_map = JitCodeCreatorPool().Instance().AllCreators(); auto iter = creator_map.find(kkey); @@ -66,27 +68,27 @@ GetJitCode(const typename KernelTuples::attr_type& attr) { return nullptr; } -template +template inline typename std::enable_if< - !std::is_same::value || + !std::is_same::value || !std::is_same::value, - typename KernelTuples::func_type>::type -GetJitCode(const typename KernelTuples::attr_type& attr) { + typename 
KernelTuple::func_type>::type +GetJitCode(const typename KernelTuple::attr_type& attr) { return nullptr; } // Refer code do not related with attr, which is just for cast // Refer is always on CPUPlace -template -inline typename KernelTuples::func_type GetRefer() { +template +inline typename KernelTuple::func_type GetRefer() { auto& ref_pool = ReferKernelPool().Instance().AllKernels(); - KernelKey kkey(KT, platform::CPUPlace()); + KernelKey kkey(KernelTuple::kernel_type, platform::CPUPlace()); auto ref_iter = ref_pool.find(kkey); PADDLE_ENFORCE(ref_iter != ref_pool.end(), "Every Kernel should have reference function."); auto& ref_impls = ref_iter->second; for (auto& impl : ref_impls) { - auto i = dynamic_cast*>(impl.get()); + auto i = dynamic_cast*>(impl.get()); if (i) { return i->GetFunc(); } @@ -94,23 +96,22 @@ inline typename KernelTuples::func_type GetRefer() { return nullptr; } -template -typename KernelTuples::func_type Get( - const typename KernelTuples::attr_type& attr) { - auto jitfunc = GetJitCode(attr); +template +typename KernelTuple::func_type Get( + const typename KernelTuple::attr_type& attr) { + auto jitfunc = GetJitCode(attr); if (jitfunc) { return jitfunc; } // pool: (KernelKey(type, place), vector) - KernelKey kkey(KT, PlaceType()); + KernelKey kkey(KernelTuple::kernel_type, PlaceType()); auto& pool = KernelPool().Instance().AllKernels(); auto iter = pool.find(kkey); if (iter != pool.end()) { auto& impls = iter->second; for (auto& impl : impls) { - auto i = dynamic_cast*>(impl.get()); + auto i = dynamic_cast*>(impl.get()); if (i && i->UseMe(attr)) { return i->GetFunc(); } @@ -118,48 +119,50 @@ typename KernelTuples::func_type Get( } // The last implementation should be reference function on CPUPlace. - return GetRefer(); + return GetRefer(); } -template +template class KernelFuncs { public: KernelFuncs() = default; static KernelFuncs& Cache() { - static thread_local KernelFuncs g_func_cache; + static thread_local KernelFuncs g_func_cache; return g_func_cache; } // the exposed interface to use - typename KernelTuples::func_type At( - const typename KernelTuples::attr_type& attr) { + typename KernelTuple::func_type At( + const typename KernelTuple::attr_type& attr) { // XXH64: 13.8 GB/s - int64_t key = XXH64(&attr, sizeof(typename KernelTuples::attr_type), 0); + // TODO(TJ): change me, maybe not all attr change need one key, should be + // attrkey + int64_t key = XXH64(&attr, sizeof(typename KernelTuple::attr_type), 0); if (Has(key)) { return funcs_.at(key); } // If do not have this attr in cache, // then could run some runtime benchmark of this attr and save the best one. // Here just get the offline benchmarked best one. 
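    // With the simplified per-kernel tuples from kernel_base.h, a cached
    // kernel is fetched and called like this (a minimal sketch; float data
    // type and a size-n VExp kernel assumed):
    //   auto vexp =
    //       KernelFuncs<VExpTuple<float>, platform::CPUPlace>::Cache().At(n);
    //   vexp(x, y, n);
    // operator[] below is shorthand for the same At() lookup.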
- auto func = Get(attr); + auto func = Get(attr); Insert(key, func); return func; } - typename KernelTuples::func_type operator[]( - const typename KernelTuples::attr_type& attr) { + typename KernelTuple::func_type operator[]( + const typename KernelTuple::attr_type& attr) { return At(attr); } protected: bool Has(int64_t key) const { return funcs_.find(key) != funcs_.end(); } - void Insert(int64_t key, typename KernelTuples::func_type func) { + void Insert(int64_t key, typename KernelTuple::func_type func) { funcs_.emplace(key, func); } private: - std::unordered_map funcs_; + std::unordered_map funcs_; DISABLE_COPY_AND_ASSIGN(KernelFuncs); }; diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 96e162a21b..e8dbcced4f 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -62,26 +62,55 @@ typedef enum { kSqrt, } SeqPoolType; +// x, y, z, n template -struct XYZNTuples { +struct XYZNTuple { typedef T data_type; typedef int attr_type; typedef void (*func_type)(const T*, const T*, T*, int); }; +// a, x, y, n template -struct AXYNTuples : public XYZNTuples {}; +struct AXYNTuple : public XYZNTuple {}; +// x, y, n template -struct XYNTuples { +struct XYNTuple { typedef T data_type; typedef int attr_type; typedef void (*func_type)(const T*, T*, int); }; -// x, return and int +// x, returned value, n template -struct XRNTuples : public XYNTuples {}; +struct XRNTuple : public XYNTuple {}; + +#define DECLARE_KERNELTUPLE(kernel_tuple, type) \ + template \ + struct type##Tuple : public kernel_tuple { \ + static constexpr KernelType kernel_type = k##type; \ + } + +// Tuple should be corresponding to the KernelType +DECLARE_KERNELTUPLE(XYZNTuple, VMul); +DECLARE_KERNELTUPLE(XYZNTuple, VAdd); +DECLARE_KERNELTUPLE(XYZNTuple, VAddRelu); +DECLARE_KERNELTUPLE(XYZNTuple, VSub); + +DECLARE_KERNELTUPLE(AXYNTuple, VScal); +DECLARE_KERNELTUPLE(AXYNTuple, VAddBias); + +DECLARE_KERNELTUPLE(XYNTuple, VRelu); +DECLARE_KERNELTUPLE(XYNTuple, VIdentity); +DECLARE_KERNELTUPLE(XYNTuple, VSquare); +DECLARE_KERNELTUPLE(XYNTuple, VExp); +DECLARE_KERNELTUPLE(XYNTuple, VSigmoid); +DECLARE_KERNELTUPLE(XYNTuple, VTanh); +DECLARE_KERNELTUPLE(XYNTuple, VCopy); + +DECLARE_KERNELTUPLE(XRNTuple, HMax); +DECLARE_KERNELTUPLE(XRNTuple, HSum); typedef struct { void* gates; // gates: x_ch, x_ih, x_fh, x_oh @@ -122,21 +151,31 @@ typedef struct rnn_attr_s gru_attr_t; typedef struct lstm_attr_s lstm_attr_t; template -struct LSTMTuples { +struct LSTMTuple { typedef T data_type; typedef lstm_attr_t attr_type; typedef void (*func_type)(lstm_t*, const lstm_attr_t*); }; template -struct GRUTuples { +struct GRUTuple { typedef T data_type; typedef gru_attr_t attr_type; typedef void (*func_type)(gru_t*, const gru_attr_t*); }; +DECLARE_KERNELTUPLE(LSTMTuple, LSTMCtHt); +DECLARE_KERNELTUPLE(LSTMTuple, LSTMC1H1); + +DECLARE_KERNELTUPLE(GRUTuple, GRUH1); +DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart1); +DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart2); + +#undef DECLARE_KERNELTUPLE + template -struct VBroadcastTuples { +struct VBroadcastTuple { + static constexpr KernelType kernel_type = kVBroadcast; typedef T data_type; typedef int64_t attr_type; typedef void (*func_type)(const T*, T*, int64_t, int64_t); @@ -151,7 +190,8 @@ typedef struct seq_pool_attr_s { } seq_pool_attr_t; template -struct SeqPoolTuples { +struct SeqPoolTuple { + static constexpr KernelType kernel_type = kSeqPool; typedef T data_type; typedef seq_pool_attr_t attr_type; typedef void (*func_type)(const 
T*, T*, const seq_pool_attr_t*); @@ -176,7 +216,8 @@ typedef struct emb_seq_pool_attr_s { } emb_seq_pool_attr_t; template -struct EmbSeqPoolTuples { +struct EmbSeqPoolTuple { + static constexpr KernelType kernel_type = kEmbSeqPool; typedef T data_type; typedef emb_seq_pool_attr_t attr_type; typedef void (*func_type)(const T*, const int64_t*, T*, @@ -198,7 +239,8 @@ typedef struct sgd_attr_s { } sgd_attr_t; template -struct SgdTuples { +struct SgdTuple { + static constexpr KernelType kernel_type = kSgd; typedef T data_type; typedef sgd_attr_t attr_type; typedef void (*func_type)(const T*, const T*, const T*, const int64_t*, T*, @@ -214,21 +256,24 @@ typedef struct matmul_attr_s { } matmul_attr_t; template -struct MatMulTuples { +struct MatMulTuple { + static constexpr KernelType kernel_type = kMatMul; typedef T data_type; typedef matmul_attr_t attr_type; typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*); }; template -struct CRFDecodingTuples { +struct CRFDecodingTuple { + static constexpr KernelType kernel_type = kCRFDecoding; typedef T data_type; typedef int attr_type; typedef void (*func_type)(const int, const T*, const T*, T*, int*, int); }; template -struct LayerNormTuples { +struct LayerNormTuple { + static constexpr KernelType kernel_type = kLayerNorm; typedef T data_type; typedef int attr_type; typedef void (*func_type)(T*, T*, T*, T*, const T*, const T*, int, @@ -236,7 +281,8 @@ struct LayerNormTuples { }; template -struct SoftmaxTuples { +struct SoftmaxTuple { + static constexpr KernelType kernel_type = kSoftmax; typedef T data_type; typedef int attr_type; typedef void (*func_type)(const T*, T*, int, int); @@ -244,7 +290,8 @@ struct SoftmaxTuples { // nChw16c = nChw16c .* NC template -struct NCHW16CMulNCTuples { +struct NCHW16CMulNCTuple { + static constexpr KernelType kernel_type = kNCHW16CMulNC; typedef T data_type; typedef int attr_type; typedef void (*func_type)(const T*, const T*, T*, int, int); @@ -258,12 +305,12 @@ class Kernel { DISABLE_COPY_AND_ASSIGN(Kernel); }; -template +template class KernelMore : public Kernel { public: - using T = typename KernelTuples::data_type; - using Func = typename KernelTuples::func_type; - using Attr = typename KernelTuples::attr_type; + using T = typename KernelTuple::data_type; + using Func = typename KernelTuple::func_type; + using Attr = typename KernelTuple::attr_type; virtual Func GetFunc() const { return func; } virtual bool UseMe(const Attr& attr) const = 0; virtual const char* ImplType() const = 0; @@ -272,11 +319,11 @@ class KernelMore : public Kernel { Func func{nullptr}; }; -template -class ReferKernel : public KernelMore { +template +class ReferKernel : public KernelMore { public: // Refer code can always be used - bool UseMe(const typename KernelTuples::attr_type& attr) const override { + bool UseMe(const typename KernelTuple::attr_type& attr) const override { return true; } const char* ImplType() const override { return "Refer"; } diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h index 24179d90dd..f4187bd3ba 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h +++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h @@ -26,11 +26,10 @@ namespace intrinsic { void CRFDecoding(const int seq_len, const float* x, const float* w, float* alpha, int* track, int tag_num); -class CRFDecodingKernel : public KernelMore> { +class CRFDecodingKernel : public KernelMore> { public: CRFDecodingKernel() { this->func = 
CRFDecoding; } - bool UseMe( - const typename CRFDecodingTuples::attr_type&) const override; + bool UseMe(const typename CRFDecodingTuple::attr_type&) const override; const char* ImplType() const override { return "Intrinsic"; } }; diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h index 89da2940f4..dfa4c2f072 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h +++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h @@ -27,10 +27,10 @@ void LayerNorm(float* x, float* out, float* mean, float* var, const float* scale, const float* bias, int height, const float epsilon, int right); -class LayerNormKernel : public KernelMore> { +class LayerNormKernel : public KernelMore> { public: LayerNormKernel() { this->func = LayerNorm; } - bool UseMe(const typename LayerNormTuples::attr_type&) const override; + bool UseMe(const typename LayerNormTuple::attr_type&) const override; const char* ImplType() const override { return "Intrinsic"; } }; diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index 0036d1c238..9ee1032e95 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -23,6 +23,8 @@ namespace jit { namespace more { namespace mix { +using CPUPlace = platform::CPUPlace; + void VSigmoid(const T* x, T* y, int n) { const float min = SIGMOID_THRESHOLD_MIN; const float max = SIGMOID_THRESHOLD_MAX; @@ -30,7 +32,7 @@ void VSigmoid(const T* x, T* y, int n) { y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); y[i] = static_cast(0) - y[i]; } - auto compute = Get, platform::CPUPlace>(n); + auto compute = KernelFuncs, CPUPlace>::Cache().At(n); compute(y, y, n); for (int i = 0; i < n; ++i) { y[i] = static_cast(1) / (static_cast(1) + y[i]); @@ -39,9 +41,9 @@ void VSigmoid(const T* x, T* y, int n) { void VTanh(const T* x, T* y, int n) { const T a = 2, b = -1; - auto compute_scal = Get, platform::CPUPlace>(n); - auto compute_addbias = Get, platform::CPUPlace>(n); - auto compute_sigmoid = Get, platform::CPUPlace>(n); + auto compute_scal = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_addbias = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_sigmoid = KernelFuncs, CPUPlace>::Cache().At(n); compute_scal(&a, x, y, n); compute_sigmoid(y, y, n); compute_scal(&a, y, y, n); @@ -49,16 +51,12 @@ void VTanh(const T* x, T* y, int n) { } void Softmax(const T* x, T* y, int n, int bs) { - auto compute_hmax = - KernelFuncs, platform::CPUPlace>::Cache().At(n); - auto compute_hsum = - KernelFuncs, platform::CPUPlace>::Cache().At(n); - auto compute_vscal = - KernelFuncs, platform::CPUPlace>::Cache().At(n); + auto compute_hmax = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_hsum = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_vscal = KernelFuncs, CPUPlace>::Cache().At(n); auto compute_vaddbias = - KernelFuncs, platform::CPUPlace>::Cache().At(n); - auto compute_vexp = - KernelFuncs, platform::CPUPlace>::Cache().At(n); + KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_vexp = KernelFuncs, CPUPlace>::Cache().At(n); for (int i = 0; i < bs; ++i) { T scalar; @@ -76,13 +74,13 @@ void Softmax(const T* x, T* y, int n, int bs) { void (*getActFunc(KernelType type, int d))(const T*, T*, int) { // NOLINT if (type == kVSigmoid) { - return Get, platform::CPUPlace>(d); + return KernelFuncs, CPUPlace>::Cache().At(d); } else if (type == kVRelu) { - return Get, platform::CPUPlace>(d); + return KernelFuncs, 
CPUPlace>::Cache().At(d); } else if (type == kVTanh) { - return Get, platform::CPUPlace>(d); + return KernelFuncs, CPUPlace>::Cache().At(d); } else if (type == kVIdentity) { - return Get, platform::CPUPlace>(d); + return KernelFuncs, CPUPlace>::Cache().At(d); } PADDLE_THROW("Not support type: %s", type); return nullptr; @@ -98,9 +96,9 @@ void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) { const int d = attr->d; const int d2 = d * 2; const int d3 = d * 3; - auto vmul_d = Get, platform::CPUPlace>(d); - auto vadd_d = Get, platform::CPUPlace>(d); - auto vadd_d2 = Get, platform::CPUPlace>(d2); + auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); + auto vadd_d = KernelFuncs, CPUPlace>::Cache().At(d); + auto vadd_d2 = KernelFuncs, CPUPlace>::Cache().At(d2); auto act_gate_d = getActFunc(attr->act_gate, d); auto act_gate_d2 = getActFunc(attr->act_gate, d2); auto act_gate_d3 = getActFunc(attr->act_gate, d3); @@ -140,8 +138,8 @@ void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { int d = attr->d; int d2 = d * 2; int d3 = d * 3; - auto vmul_d = Get, platform::CPUPlace>(d); - auto vadd_d = Get, platform::CPUPlace>(d); + auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); + auto vadd_d = KernelFuncs, CPUPlace>::Cache().At(d); auto act_gate_d = getActFunc(attr->act_gate, d); auto act_cand_d = getActFunc(attr->act_cand, d); auto act_cell_d = getActFunc(attr->act_cell, d); @@ -169,7 +167,7 @@ void GRUH1(gru_t* step, const gru_attr_t* attr) { int d2 = d * 2; auto act_gate = getActFunc(attr->act_gate, d); auto act_cand = getActFunc(attr->act_cand, d); - auto vmul_d = Get, platform::CPUPlace>(d); + auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); act_gate(gates, gates, d); act_cand(gates + d2, gates + d2, d); vmul_d(gates, gates + d2, ht, d); @@ -182,7 +180,7 @@ void GRUHtPart1(gru_t* step, const gru_attr_t* attr) { T* ht = reinterpret_cast(step->ht); const T* ht_1 = reinterpret_cast(step->ht_1); auto act_gate = getActFunc(attr->act_gate, attr->d); - auto vmul_d = Get, platform::CPUPlace>(attr->d); + auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(attr->d); act_gate(gates + attr->d, gates + attr->d, attr->d); vmul_d(ht_1, gates + attr->d, ht, attr->d); } @@ -230,16 +228,16 @@ bool GRUHtPart2Kernel::UseMe(const gru_attr_t& attr) const { return true; } namespace mix = paddle::operators::jit::more::mix; -#define REGISTER_MORE_KERNEL(key, func) \ - REGISTER_JITKERNEL_MORE(key, mix, mix::func##Kernel) - -REGISTER_MORE_KERNEL(kVSigmoid, VSigmoid); -REGISTER_MORE_KERNEL(kVTanh, VTanh); -REGISTER_MORE_KERNEL(kSoftmax, Softmax); -REGISTER_MORE_KERNEL(kLSTMCtHt, LSTMCtHt); -REGISTER_MORE_KERNEL(kLSTMC1H1, LSTMC1H1); -REGISTER_MORE_KERNEL(kGRUH1, GRUH1); -REGISTER_MORE_KERNEL(kGRUHtPart1, GRUHtPart1); -REGISTER_MORE_KERNEL(kGRUHtPart2, GRUHtPart2); +#define REGISTER_MORE_KERNEL(func) \ + REGISTER_JITKERNEL_MORE(k##func, mix, mix::func##Kernel) + +REGISTER_MORE_KERNEL(VSigmoid); +REGISTER_MORE_KERNEL(VTanh); +REGISTER_MORE_KERNEL(Softmax); +REGISTER_MORE_KERNEL(LSTMCtHt); +REGISTER_MORE_KERNEL(LSTMC1H1); +REGISTER_MORE_KERNEL(GRUH1); +REGISTER_MORE_KERNEL(GRUHtPart1); +REGISTER_MORE_KERNEL(GRUHtPart2); #undef REGISTER_MORE_KERNEL diff --git a/paddle/fluid/operators/jit/more/mix/mix.h b/paddle/fluid/operators/jit/more/mix/mix.h index d64af19219..17eb96462f 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.h +++ b/paddle/fluid/operators/jit/more/mix/mix.h @@ -34,27 +34,27 @@ void GRUH1(gru_t* step, const gru_attr_t* attr); void GRUHtPart1(gru_t* step, const gru_attr_t* attr); void GRUHtPart2(gru_t* step, 
const gru_attr_t* attr); -#define DECLARE_MORE_KERNEL(name, tuples) \ - class name##Kernel : public KernelMore> { \ - public: \ - name##Kernel() { this->func = name; } \ - bool UseMe(const typename tuples::attr_type&) const override; \ - const char* ImplType() const override { return "Mixed"; } \ +#define DECLARE_MORE_KERNEL(name) \ + class name##Kernel : public KernelMore> { \ + public: \ + name##Kernel() { this->func = name; } \ + bool UseMe(const typename name##Tuple::attr_type&) const override; \ + const char* ImplType() const override { return "Mixed"; } \ } // XYN -DECLARE_MORE_KERNEL(VSigmoid, XYNTuples); -DECLARE_MORE_KERNEL(VTanh, XYNTuples); +DECLARE_MORE_KERNEL(VSigmoid); +DECLARE_MORE_KERNEL(VTanh); // XRN -DECLARE_MORE_KERNEL(Softmax, SoftmaxTuples); +DECLARE_MORE_KERNEL(Softmax); -DECLARE_MORE_KERNEL(LSTMCtHt, LSTMTuples); -DECLARE_MORE_KERNEL(LSTMC1H1, LSTMTuples); +DECLARE_MORE_KERNEL(LSTMCtHt); +DECLARE_MORE_KERNEL(LSTMC1H1); -DECLARE_MORE_KERNEL(GRUH1, GRUTuples); -DECLARE_MORE_KERNEL(GRUHtPart1, GRUTuples); -DECLARE_MORE_KERNEL(GRUHtPart2, GRUTuples); +DECLARE_MORE_KERNEL(GRUH1); +DECLARE_MORE_KERNEL(GRUHtPart1); +DECLARE_MORE_KERNEL(GRUHtPart2); #undef DECLARE_MORE_KERNEL diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index 4f51353bce..084ea571ce 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -250,23 +250,23 @@ AWALYS_USE_ME_WITH_DOUBLE(Softmax); namespace mkl = paddle::operators::jit::more::mkl; -#define REGISTER_MKL_KERNEL(key, func) \ - REGISTER_JITKERNEL_MORE(key, mkl, mkl::func##Kernel, \ +#define REGISTER_MKL_KERNEL(func) \ + REGISTER_JITKERNEL_MORE(k##func, mkl, mkl::func##Kernel, \ mkl::func##Kernel) -REGISTER_MKL_KERNEL(kMatMul, MatMul); -REGISTER_MKL_KERNEL(kVMul, VMul); -REGISTER_MKL_KERNEL(kVAdd, VAdd); -REGISTER_MKL_KERNEL(kVScal, VScal); -REGISTER_MKL_KERNEL(kVExp, VExp); -REGISTER_MKL_KERNEL(kVSquare, VSquare); -REGISTER_MKL_KERNEL(kVCopy, VCopy); -REGISTER_MKL_KERNEL(kVBroadcast, VBroadcast); -REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid); -REGISTER_MKL_KERNEL(kVTanh, VTanh); -REGISTER_MKL_KERNEL(kSeqPool, SeqPool); -REGISTER_MKL_KERNEL(kEmbSeqPool, EmbSeqPool); -REGISTER_MKL_KERNEL(kSoftmax, Softmax); -REGISTER_MKL_KERNEL(kSgd, Sgd); +REGISTER_MKL_KERNEL(MatMul); +REGISTER_MKL_KERNEL(VMul); +REGISTER_MKL_KERNEL(VAdd); +REGISTER_MKL_KERNEL(VScal); +REGISTER_MKL_KERNEL(VExp); +REGISTER_MKL_KERNEL(VSquare); +REGISTER_MKL_KERNEL(VCopy); +REGISTER_MKL_KERNEL(VBroadcast); +REGISTER_MKL_KERNEL(VSigmoid); +REGISTER_MKL_KERNEL(VTanh); +REGISTER_MKL_KERNEL(SeqPool); +REGISTER_MKL_KERNEL(EmbSeqPool); +REGISTER_MKL_KERNEL(Softmax); +REGISTER_MKL_KERNEL(Sgd); #undef REGISTER_MKL_KERNEL diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index db2d6faed4..8c1d8b57e0 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -175,41 +175,38 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, } } -#define DECLARE_MKL_KERNEL(name, tuples) \ - template \ - class name##Kernel : public KernelMore> { \ - public: \ - name##Kernel() { this->func = name; } \ - bool UseMe(const typename tuples::attr_type&) const override; \ - const char* ImplType() const override { return "MKL"; } \ +#define DECLARE_MKL_KERNEL(name) \ + template \ + class name##Kernel : public KernelMore> { \ + public: \ + name##Kernel() { this->func = name; } \ + bool UseMe(const 
typename name##Tuple::attr_type&) const override; \ + const char* ImplType() const override { return "MKL"; } \ } // ABCMNK -DECLARE_MKL_KERNEL(MatMul, MatMulTuples); +DECLARE_MKL_KERNEL(MatMul); // XYZN -DECLARE_MKL_KERNEL(VMul, XYZNTuples); -DECLARE_MKL_KERNEL(VAdd, XYZNTuples); +DECLARE_MKL_KERNEL(VMul); +DECLARE_MKL_KERNEL(VAdd); // AXYN -DECLARE_MKL_KERNEL(VScal, AXYNTuples); +DECLARE_MKL_KERNEL(VScal); // XYN -DECLARE_MKL_KERNEL(VExp, XYNTuples); -DECLARE_MKL_KERNEL(VSigmoid, XYNTuples); -DECLARE_MKL_KERNEL(VTanh, XYNTuples); -DECLARE_MKL_KERNEL(VSquare, XYNTuples); -DECLARE_MKL_KERNEL(VCopy, XYNTuples); - -DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples); - -DECLARE_MKL_KERNEL(EmbSeqPool, EmbSeqPoolTuples); - -DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples); - -DECLARE_MKL_KERNEL(Sgd, SgdTuples); - -DECLARE_MKL_KERNEL(VBroadcast, VBroadcastTuples); +DECLARE_MKL_KERNEL(VExp); +DECLARE_MKL_KERNEL(VSigmoid); +DECLARE_MKL_KERNEL(VTanh); +DECLARE_MKL_KERNEL(VSquare); +DECLARE_MKL_KERNEL(VCopy); + +// others +DECLARE_MKL_KERNEL(SeqPool); +DECLARE_MKL_KERNEL(EmbSeqPool); +DECLARE_MKL_KERNEL(Softmax); +DECLARE_MKL_KERNEL(Sgd); +DECLARE_MKL_KERNEL(VBroadcast); #undef DECLARE_MKL_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index c279d1b2ca..0d1c477090 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -17,51 +17,43 @@ namespace refer = paddle::operators::jit::refer; -#define REGISTER_REFER_KERNEL(key, func) \ - REGISTER_JITKERNEL_REFER(key, refer::func##Kernel, \ +#define REGISTER_REFER_KERNEL(func) \ + REGISTER_JITKERNEL_REFER(k##func, refer::func##Kernel, \ refer::func##Kernel) -REGISTER_REFER_KERNEL(kVMul, VMul); -REGISTER_REFER_KERNEL(kVAdd, VAdd); -REGISTER_REFER_KERNEL(kVAddRelu, VAddRelu); -REGISTER_REFER_KERNEL(kVSub, VSub); - -REGISTER_REFER_KERNEL(kVScal, VScal); -REGISTER_REFER_KERNEL(kVAddBias, VAddBias); - -REGISTER_REFER_KERNEL(kVRelu, VRelu); -REGISTER_REFER_KERNEL(kVCopy, VCopy); -REGISTER_REFER_KERNEL(kVIdentity, VIdentity); -REGISTER_REFER_KERNEL(kVSquare, VSquare); -REGISTER_REFER_KERNEL(kVExp, VExp); -REGISTER_REFER_KERNEL(kVSigmoid, VSigmoid); -REGISTER_REFER_KERNEL(kVTanh, VTanh); - -REGISTER_REFER_KERNEL(kLSTMCtHt, LSTMCtHt); -REGISTER_REFER_KERNEL(kLSTMC1H1, LSTMC1H1); - -REGISTER_REFER_KERNEL(kGRUH1, GRUH1); -REGISTER_REFER_KERNEL(kGRUHtPart1, GRUHtPart1); -REGISTER_REFER_KERNEL(kGRUHtPart2, GRUHtPart2); - -REGISTER_REFER_KERNEL(kCRFDecoding, CRFDecoding); -REGISTER_REFER_KERNEL(kLayerNorm, LayerNorm); - -REGISTER_REFER_KERNEL(kNCHW16CMulNC, NCHW16CMulNC); - -REGISTER_REFER_KERNEL(kSeqPool, SeqPool); - -REGISTER_REFER_KERNEL(kMatMul, MatMul); - -REGISTER_REFER_KERNEL(kHMax, HMax); -REGISTER_REFER_KERNEL(kHSum, HSum); - -REGISTER_REFER_KERNEL(kSoftmax, Softmax); - -REGISTER_REFER_KERNEL(kEmbSeqPool, EmbSeqPool); - -REGISTER_REFER_KERNEL(kSgd, Sgd); - -REGISTER_REFER_KERNEL(kVBroadcast, VBroadcast); +REGISTER_REFER_KERNEL(VMul); +REGISTER_REFER_KERNEL(VAdd); +REGISTER_REFER_KERNEL(VAddRelu); +REGISTER_REFER_KERNEL(VSub); + +REGISTER_REFER_KERNEL(VScal); +REGISTER_REFER_KERNEL(VAddBias); + +REGISTER_REFER_KERNEL(VRelu); +REGISTER_REFER_KERNEL(VCopy); +REGISTER_REFER_KERNEL(VIdentity); +REGISTER_REFER_KERNEL(VSquare); +REGISTER_REFER_KERNEL(VExp); +REGISTER_REFER_KERNEL(VSigmoid); +REGISTER_REFER_KERNEL(VTanh); + +REGISTER_REFER_KERNEL(LSTMCtHt); +REGISTER_REFER_KERNEL(LSTMC1H1); + +REGISTER_REFER_KERNEL(GRUH1); +REGISTER_REFER_KERNEL(GRUHtPart1); 
+REGISTER_REFER_KERNEL(GRUHtPart2); + +REGISTER_REFER_KERNEL(CRFDecoding); +REGISTER_REFER_KERNEL(LayerNorm); +REGISTER_REFER_KERNEL(NCHW16CMulNC); +REGISTER_REFER_KERNEL(SeqPool); +REGISTER_REFER_KERNEL(MatMul); +REGISTER_REFER_KERNEL(HMax); +REGISTER_REFER_KERNEL(HSum); +REGISTER_REFER_KERNEL(Softmax); +REGISTER_REFER_KERNEL(EmbSeqPool); +REGISTER_REFER_KERNEL(Sgd); +REGISTER_REFER_KERNEL(VBroadcast); #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index b3b2097828..cac705a484 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -490,60 +490,54 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, } } -#define DECLARE_REFER_KERNEL(name, tuples) \ - template \ - class name##Kernel : public ReferKernel> { \ - public: \ - name##Kernel() { this->func = name; } \ +#define DECLARE_REFER_KERNEL(name) \ + template \ + class name##Kernel : public ReferKernel> { \ + public: \ + name##Kernel() { this->func = name; } \ } // const T* x, const T* y, T* z, int n -DECLARE_REFER_KERNEL(VMul, XYZNTuples); -DECLARE_REFER_KERNEL(VAdd, XYZNTuples); -DECLARE_REFER_KERNEL(VAddRelu, XYZNTuples); -DECLARE_REFER_KERNEL(VSub, XYZNTuples); +DECLARE_REFER_KERNEL(VMul); +DECLARE_REFER_KERNEL(VAdd); +DECLARE_REFER_KERNEL(VAddRelu); +DECLARE_REFER_KERNEL(VSub); // const T* a, const T* x, T* y, int n -DECLARE_REFER_KERNEL(VScal, AXYNTuples); -DECLARE_REFER_KERNEL(VAddBias, AXYNTuples); +DECLARE_REFER_KERNEL(VScal); +DECLARE_REFER_KERNEL(VAddBias); // const T* x, T* y, int n -DECLARE_REFER_KERNEL(VRelu, XYNTuples); -DECLARE_REFER_KERNEL(VIdentity, XYNTuples); -DECLARE_REFER_KERNEL(VExp, XYNTuples); -DECLARE_REFER_KERNEL(VSigmoid, XYNTuples); -DECLARE_REFER_KERNEL(VTanh, XYNTuples); -DECLARE_REFER_KERNEL(VSquare, XYNTuples); -DECLARE_REFER_KERNEL(VCopy, XYNTuples); +DECLARE_REFER_KERNEL(VRelu); +DECLARE_REFER_KERNEL(VIdentity); +DECLARE_REFER_KERNEL(VExp); +DECLARE_REFER_KERNEL(VSigmoid); +DECLARE_REFER_KERNEL(VTanh); +DECLARE_REFER_KERNEL(VSquare); +DECLARE_REFER_KERNEL(VCopy); // lstm_t*, const lstm_attr_t* -DECLARE_REFER_KERNEL(LSTMCtHt, LSTMTuples); -DECLARE_REFER_KERNEL(LSTMC1H1, LSTMTuples); +DECLARE_REFER_KERNEL(LSTMCtHt); +DECLARE_REFER_KERNEL(LSTMC1H1); // gru_t*, const gru_attr_t* -DECLARE_REFER_KERNEL(GRUH1, GRUTuples); -DECLARE_REFER_KERNEL(GRUHtPart1, GRUTuples); -DECLARE_REFER_KERNEL(GRUHtPart2, GRUTuples); - -DECLARE_REFER_KERNEL(CRFDecoding, CRFDecodingTuples); -DECLARE_REFER_KERNEL(LayerNorm, LayerNormTuples); - -DECLARE_REFER_KERNEL(NCHW16CMulNC, NCHW16CMulNCTuples); - -DECLARE_REFER_KERNEL(SeqPool, SeqPoolTuples); - -DECLARE_REFER_KERNEL(MatMul, MatMulTuples); - -DECLARE_REFER_KERNEL(HMax, XRNTuples); -DECLARE_REFER_KERNEL(HSum, XRNTuples); - -DECLARE_REFER_KERNEL(Softmax, SoftmaxTuples); - -DECLARE_REFER_KERNEL(EmbSeqPool, EmbSeqPoolTuples); - -DECLARE_REFER_KERNEL(Sgd, SgdTuples); - -DECLARE_REFER_KERNEL(VBroadcast, VBroadcastTuples); +DECLARE_REFER_KERNEL(GRUH1); +DECLARE_REFER_KERNEL(GRUHtPart1); +DECLARE_REFER_KERNEL(GRUHtPart2); + +DECLARE_REFER_KERNEL(HMax); +DECLARE_REFER_KERNEL(HSum); + +// others +DECLARE_REFER_KERNEL(CRFDecoding); +DECLARE_REFER_KERNEL(LayerNorm); +DECLARE_REFER_KERNEL(NCHW16CMulNC); +DECLARE_REFER_KERNEL(SeqPool); +DECLARE_REFER_KERNEL(MatMul); +DECLARE_REFER_KERNEL(Softmax); +DECLARE_REFER_KERNEL(EmbSeqPool); +DECLARE_REFER_KERNEL(Sgd); +DECLARE_REFER_KERNEL(VBroadcast); #undef DECLARE_REFER_KERNEL diff --git 
a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 18f8c09f14..a574bf2079 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -64,413 +64,43 @@ std::vector TestSizes() { namespace jit = paddle::operators::jit; using CPUPlace = paddle::platform::CPUPlace; -template -struct TestFuncWithRefer { - void operator()(const typename KernelTuples::func_type tgt, Args... args) { - LOG(FATAL) << "Should specify this function."; - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - std::vector> { - void operator()(const typename jit::XYZNTuples::func_type tgt, - const std::vector& x, const std::vector& y, - const std::vector& zref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(zref.size(), x.size()); - EXPECT_EQ(zref.size(), y.size()); - const T* x_data = x.data(); - const T* y_data = y.data(); - const T* zref_data = zref.data(); - const int d = zref.size(); - - std::vector ztgt(d); - T* ztgt_data = ztgt.data(); - // test normal - tgt(x_data, y_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ztgt.begin()); - tgt(ztgt_data, y_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - // test inplace y - std::copy(y.begin(), y.end(), ztgt.begin()); - tgt(x_data, ztgt_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - } -}; - -template -struct TestFuncWithRefer, T, std::vector, - std::vector> { - void operator()(const typename jit::AXYNTuples::func_type tgt, const T a, - const std::vector& x, const std::vector& yref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - const int d = yref.size(); - std::vector ytgt(d); - T* ytgt_data = ytgt.data(); - // test normal - tgt(&a, x_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(&a, ytgt_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - int, int> { - void operator()(const typename jit::SoftmaxTuples::func_type tgt, - const std::vector& x, const std::vector& yref, int n, - int bs) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - EXPECT_EQ(x.size(), static_cast(n * bs)); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - std::vector ytgt(n * bs); - T* ytgt_data = ytgt.data(); - // test normal - tgt(x_data, ytgt_data, n, bs); - ExpectEQ(ytgt_data, yref_data, n * bs); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(ytgt_data, ytgt_data, n, bs); - ExpectEQ(ytgt_data, yref_data, n * bs); - } -}; - -template -struct TestFuncWithRefer, std::vector, T> { - void operator()(const typename jit::XRNTuples::func_type tgt, - const std::vector& x, const T ref_res) { - EXPECT_TRUE(tgt != nullptr); - T tgt_res; - tgt(x.data(), &tgt_res, x.size()); - ExpectEQ(&tgt_res, &ref_res, 1); - } -}; - -template -struct TestFuncWithRefer, std::vector, - std::vector, int64_t, - typename jit::VBroadcastTuples::attr_type> { - void operator()(const typename jit::VBroadcastTuples::func_type tgt, - const std::vector& x, const std::vector& yref, - int64_t h, - const typename jit::VBroadcastTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(x.size(), static_cast(attr)); - EXPECT_EQ(yref.size(), x.size() * h); - std::vector y(yref.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - T* 
y_data = y.data(); - tgt(x_data, y_data, h, attr); - ExpectEQ(y_data, yref_data, yref.size()); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector> { - void operator()(const typename jit::XYNTuples::func_type tgt, - const std::vector& x, const std::vector& yref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - const int d = yref.size(); - std::vector ytgt(d); - T* ytgt_data = ytgt.data(); - // test normal - tgt(x_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(ytgt_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - std::vector, std::vector, std::vector, - typename jit::LSTMTuples::attr_type> { - void operator()(const typename jit::LSTMTuples::func_type tgt, - const std::vector& xsrc, const std::vector& wp, - const std::vector& ct_1, const std::vector& ct_ref, - const std::vector& ht_ref, - const typename jit::LSTMTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(ct_ref.size(), ht_ref.size()); - EXPECT_EQ(ct_1.size(), ht_ref.size()); - EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); - EXPECT_EQ(wp.size(), 3 * ht_ref.size()); - - // x could be changed after compute, so copy to save src - int d = ht_ref.size(); - std::vector x(xsrc.size()), ct(ct_ref.size()), ht(ht_ref.size()); - std::vector checked(2 * d); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - - const T* ct_1_data = ct_1.data(); - const T* wp_data = wp.data(); - const T* ct_ref_data = ct_ref.data(); - const T* ht_ref_data = ht_ref.data(); - T* x_data = x.data(); - T* ct_data = ct.data(); - T* ht_data = ht.data(); - T* checked_data = checked.data(); - - jit::lstm_t step; - step.gates = x_data; - step.ct_1 = ct_1_data; - step.ct = ct_data; - step.ht = ht_data; - if (attr.use_peephole) { - step.wp = wp_data; - step.checked = checked_data; - } - - tgt(&step, &attr); - ExpectEQ(ct_data, ct_ref_data, d); - ExpectEQ(ht_data, ht_ref_data, d); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - std::vector, - typename jit::GRUTuples::attr_type> { - void operator()(const typename jit::GRUTuples::func_type tgt, - const std::vector& xsrc, const std::vector& ht_1, - const std::vector& ht_ref, - const typename jit::GRUTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(ht_1.size(), ht_ref.size()); - EXPECT_EQ(xsrc.size(), 3 * ht_ref.size()); - - // x could be changed after compute, so copy to save src - int d = ht_ref.size(); - std::vector x(xsrc.size()), ht(ht_ref.size()); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - const T* ht_1_data = ht_1.data(); - const T* ht_ref_data = ht_ref.data(); - T* x_data = x.data(); - T* ht_data = ht.data(); - jit::gru_t step; - step.gates = x_data; - step.ht_1 = ht_1_data; - step.ht = ht_data; - tgt(&step, &attr); - ExpectEQ(ht_data, ht_ref_data, d); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - typename jit::SeqPoolTuples::attr_type> { - void operator()(const typename jit::SeqPoolTuples::func_type tgt, - const std::vector& x, const std::vector& yref, - const typename jit::SeqPoolTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(x.size() % yref.size(), static_cast(0)); - int w = yref.size(); - std::vector y(w); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - T* y_data = y.data(); - tgt(x_data, y_data, &attr); - 
ExpectEQ(y_data, yref_data, w); - } -}; - -template -struct TestFuncWithRefer, std::vector, - std::vector, std::vector, - typename jit::EmbSeqPoolTuples::attr_type> { - void operator()(const typename jit::EmbSeqPoolTuples::func_type tgt, - const std::vector& table, const std::vector& idx, - const std::vector& oref, - const typename jit::EmbSeqPoolTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(table.size(), - static_cast(attr.table_height * attr.table_width)); - EXPECT_EQ(idx.size(), - static_cast(attr.index_height * attr.index_width)); - EXPECT_EQ(oref.size(), - static_cast(attr.table_width * attr.index_width)); - const T* table_data = table.data(); - const int64_t* idx_data = idx.data(); - const T* oref_data = oref.data(); - int o_w = oref.size(); - std::vector out(o_w); - T* o_data = out.data(); - tgt(table_data, idx_data, o_data, &attr); - ExpectEQ(o_data, oref_data, o_w); - } -}; - -template -struct TestFuncWithRefer, T, std::vector, std::vector, - std::vector, std::vector, - typename jit::SgdTuples::attr_type> { - void operator()(const typename jit::SgdTuples::func_type tgt, const T lr, - const std::vector& param, const std::vector& grad, - const std::vector& rows, const std::vector& oref, - const typename jit::SgdTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(param.size(), - static_cast(attr.param_height * attr.param_width)); - EXPECT_EQ(grad.size(), - static_cast(attr.grad_height * attr.grad_width)); - EXPECT_EQ(rows.size(), static_cast(attr.selected_rows_size)); - EXPECT_EQ(param.size(), oref.size()); - const T* param_data = param.data(); - const T* grad_data = grad.data(); - const int64_t* rows_data = rows.data(); - const T* oref_data = oref.data(); - - std::vector out(oref.size()); - T* o_data = out.data(); - tgt(&lr, param_data, grad_data, rows_data, o_data, &attr); - // only the selected rows should be equal - for (size_t i = 0; i < rows.size(); ++i) { - ExpectEQ(o_data + rows[i] * attr.grad_width, - oref_data + rows[i] * attr.grad_width, attr.grad_width); - } - - // inplace - std::copy(param.begin(), param.end(), out.begin()); - tgt(&lr, o_data, grad_data, rows_data, o_data, &attr); - for (size_t i = 0; i < rows.size(); ++i) { - ExpectEQ(o_data + rows[i] * attr.grad_width, - oref_data + rows[i] * attr.grad_width, attr.grad_width); - } - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - std::vector, - typename jit::MatMulTuples::attr_type> { - void operator()(const typename jit::MatMulTuples::func_type tgt, - const std::vector& a, const std::vector& b, - const std::vector& cref, - const typename jit::MatMulTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(a.size(), static_cast(attr.m * attr.k)); - EXPECT_EQ(b.size(), static_cast(attr.k * attr.n)); - EXPECT_EQ(cref.size(), static_cast(attr.m * attr.n)); - std::vector c(cref.size()); - const T* a_data = a.data(); - const T* b_data = b.data(); - const T* cref_data = cref.data(); - T* c_data = c.data(); - tgt(a_data, b_data, c_data, &attr); - ExpectEQ(c_data, cref_data, attr.m * attr.n); - } -}; - -template -struct TestFuncWithRefer, std::vector, - std::vector, std::vector, std::vector, - std::vector, std::vector, int, float, int> { - void operator()(const typename jit::LayerNormTuples::func_type tgt, - std::vector& x, std::vector& outref, // NOLINT - std::vector& mean, std::vector& var, // NOLINT - const std::vector& scale, const std::vector& bias, - int left, const float epsilon, int right) { - EXPECT_TRUE(tgt != nullptr); - 
EXPECT_EQ(x.size(), static_cast(left * right)); - EXPECT_EQ(outref.size(), static_cast(left * right)); - EXPECT_EQ(mean.size(), static_cast(left)); - EXPECT_EQ(var.size(), static_cast(left)); - EXPECT_EQ(scale.size(), static_cast(right)); - EXPECT_EQ(bias.size(), static_cast(right)); - std::vector outtgt(outref.size()); - const T* scale_data = scale.data(); - const T* bias_data = bias.data(); - T* x_data = x.data(); - T* mean_data = mean.data(); - T* var_data = var.data(); - T* outref_data = outref.data(); - T* outtgt_data = outtgt.data(); - - tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data, left, - epsilon, right); - ExpectEQ(outtgt_data, outref_data, left * right); - } -}; - -template -struct TestFuncWithRefer, int, std::vector, - std::vector, std::vector, std::vector, - int> { - void operator()(const typename jit::CRFDecodingTuples::func_type tgt, - const int seq_len, const std::vector& x, - const std::vector& w, std::vector& alpharef, // NOLINT - std::vector& trackref, int tag_num) { // NOLINT - constexpr int state_trans_base_idx = 2; - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(x.size(), static_cast(seq_len * tag_num)); - EXPECT_EQ(w.size(), - static_cast((tag_num + state_trans_base_idx) * tag_num)); - EXPECT_EQ(alpharef.size(), static_cast(seq_len * tag_num)); - EXPECT_EQ(trackref.size(), static_cast(seq_len * tag_num)); - std::vector alphatgt(alpharef.size()); - std::vector tracktgt(trackref.size()); - - memcpy(trackref.data(), tracktgt.data(), tag_num * sizeof(int)); - tgt(seq_len, (const T*)x.data(), (const T*)w.data(), alphatgt.data(), - tracktgt.data(), tag_num); - ExpectEQ(alpharef.data(), alphatgt.data(), seq_len * tag_num); - ExpectEQ(trackref.data(), tracktgt.data(), seq_len * tag_num); - } -}; - -template -void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { - TestFuncWithRefer test; +void TestAllImpls(const typename KernelTuple::attr_type& attr, + const Tester& verifier, const Args&... 
args) { // test jitcode - auto jitcode = jit::GetJitCode(attr); + auto jitcode = jit::GetJitCode(attr); if (jitcode) { VLOG(10) << "Test Jitcode Kernel "; - test(jitcode, args...); + verifier(jitcode, args...); } // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); + jit::KernelKey kkey(KernelTuple::kernel_type, PlaceType()); auto& pool = jit::KernelPool().Instance().AllKernels(); auto iter = pool.find(kkey); if (iter != pool.end()) { auto& impls = iter->second; for (auto& impl : impls) { - auto i = dynamic_cast*>(impl.get()); + auto i = dynamic_cast*>(impl.get()); if (i && i->UseMe(attr)) { auto more = i->GetFunc(); VLOG(10) << "Test More Kernel : " << i->ImplType(); - test(more, args...); + verifier(more, args...); } } } // test result from Get function - // VLOG(10) << "Test Get function "; - auto tgt = jit::KernelFuncs::Cache().At(attr); - test(tgt, args...); + VLOG(10) << "Test final get function "; + auto tgt = jit::KernelFuncs::Cache().At(attr); + verifier(tgt, args...); } -template -void TestKernelXYZNTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelXYZN() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetRefer(); EXPECT_TRUE(ref != nullptr); std::vector x(d), y(d), zref(d); @@ -494,16 +124,42 @@ void TestKernelXYZNTuples() { ExpectEQ(xinp_data, zref_data, d); ExpectEQ(yinp_data, zref_data, d); - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector>(d, x, y, zref); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& y, + const std::vector& zref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(zref.size(), x.size()); + EXPECT_EQ(zref.size(), y.size()); + const T* x_data = x.data(); + const T* y_data = y.data(); + const T* zref_data = zref.data(); + const int d = zref.size(); + + std::vector ztgt(d); + T* ztgt_data = ztgt.data(); + // test normal + tgt(x_data, y_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ztgt.begin()); + tgt(ztgt_data, y_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + // test inplace y + std::copy(y.begin(), y.end(), ztgt.begin()); + tgt(x_data, ztgt_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + }; + + TestAllImpls(d, verifier, x, y, zref); } } -template -void TestKernelAXYNTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelAXYN() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetRefer(); EXPECT_TRUE(ref != nullptr); const T a = static_cast(3); @@ -520,34 +176,33 @@ void TestKernelAXYNTuples() { ref(&a, xinp_data, xinp_data, d); ExpectEQ(xinp_data, yref_data, d); - TestAllImpls, PlaceType, T, std::vector, - std::vector>(d, a, x, yref); - } -} - -template -void TestKernelXRNTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - auto last_acc = FLAGS_acc; - FLAGS_acc = 1e-4; - for (int d : TestSizes()) { - auto ref = jit::GetRefer>(); - EXPECT_TRUE(ref != nullptr); - std::vector x(d); - RandomVec(d, x.data()); - T ref_res; - ref(x.data(), &ref_res, d); - TestAllImpls, PlaceType, std::vector, T>(d, x, - ref_res); + auto verifier = [](const typename KernelTuple::func_type tgt, const T a, 
+ const std::vector& x, const std::vector& yref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + const int d = yref.size(); + std::vector ytgt(d); + T* ytgt_data = ytgt.data(); + // test normal + tgt(&a, x_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(&a, ytgt_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + }; + TestAllImpls(d, verifier, a, x, yref); } - FLAGS_acc = last_acc; } -template -void TestKernelXYNTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelXYN() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetRefer(); EXPECT_TRUE(ref != nullptr); std::vector x(d), yref(d); @@ -562,15 +217,57 @@ void TestKernelXYNTuples() { ref(x_data, yref_data, d); ref(xinp_data, xinp_data, d); ExpectEQ(xinp_data, yref_data, d); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& yref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + const int d = yref.size(); + std::vector ytgt(d); + T* ytgt_data = ytgt.data(); + // test normal + tgt(x_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(ytgt_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + }; + TestAllImpls(d, verifier, x, yref); + } +} + +template +void TestKernelXRN() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + auto last_acc = FLAGS_acc; + FLAGS_acc = 1e-4; + for (int d : TestSizes()) { + auto ref = jit::GetRefer(); + EXPECT_TRUE(ref != nullptr); + std::vector x(d); + RandomVec(d, x.data()); + T ref_res; + ref(x.data(), &ref_res, d); - TestAllImpls, PlaceType, std::vector, - std::vector>(d, x, yref); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const T ref_res) { + EXPECT_TRUE(tgt != nullptr); + T tgt_res; + tgt(x.data(), &tgt_res, x.size()); + ExpectEQ(&tgt_res, &ref_res, 1); + }; + TestAllImpls(d, verifier, x, ref_res); } + FLAGS_acc = last_acc; } -template -void TestKernelLSTMTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelLSTM() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; auto test_sizes = TestSizes(); test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); @@ -582,7 +279,7 @@ void TestKernelLSTMTuples() { const jit::lstm_attr_t attr( d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand), jit::to_kerneltype(act_cell), use_peephole); - auto ref = jit::GetRefer>(); + auto ref = jit::GetRefer(); EXPECT_TRUE(ref != nullptr); std::vector xsrc(4 * d), wp(3 * d), ct_1(d); std::vector ct_ref(d), ht_ref(d), checked(2 * d); @@ -609,10 +306,51 @@ void TestKernelLSTMTuples() { } ref(&step, &attr); VLOG(10) << attr; - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector, std::vector, - std::vector>(attr, xsrc, wp, ct_1, ct_ref, ht_ref, - attr); + + auto verifier = []( + const typename 
KernelTuple::func_type tgt, + const std::vector& xsrc, const std::vector& wp, + const std::vector& ct_1, const std::vector& ct_ref, + const std::vector& ht_ref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(ct_ref.size(), ht_ref.size()); + EXPECT_EQ(ct_1.size(), ht_ref.size()); + EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); + EXPECT_EQ(wp.size(), 3 * ht_ref.size()); + + // x could be changed after compute, so copy to save src + int d = ht_ref.size(); + std::vector x(xsrc.size()), ct(ct_ref.size()), + ht(ht_ref.size()); + std::vector checked(2 * d); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + + const T* ct_1_data = ct_1.data(); + const T* wp_data = wp.data(); + const T* ct_ref_data = ct_ref.data(); + const T* ht_ref_data = ht_ref.data(); + T* x_data = x.data(); + T* ct_data = ct.data(); + T* ht_data = ht.data(); + T* checked_data = checked.data(); + + jit::lstm_t step; + step.gates = x_data; + step.ct_1 = ct_1_data; + step.ct = ct_data; + step.ht = ht_data; + if (attr.use_peephole) { + step.wp = wp_data; + step.checked = checked_data; + } + + tgt(&step, &attr); + ExpectEQ(ct_data, ct_ref_data, d); + ExpectEQ(ht_data, ht_ref_data, d); + }; + TestAllImpls(attr, verifier, xsrc, wp, ct_1, + ct_ref, ht_ref, attr); } } } @@ -620,9 +358,10 @@ void TestKernelLSTMTuples() { } } -template -void TestKernelGRUTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelGRU() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; auto test_sizes = TestSizes(); test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); @@ -631,7 +370,7 @@ void TestKernelGRUTuples() { for (auto& act_cand : all_acts) { const jit::gru_attr_t attr(d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand)); - auto ref = jit::GetRefer>(); + auto ref = jit::GetRefer(); EXPECT_TRUE(ref != nullptr); std::vector xsrc(3 * d), ht_1(d), ht_ref(d); RandomVec(3 * d, xsrc.data()); @@ -648,17 +387,216 @@ void TestKernelGRUTuples() { step.ht = ht_ref_data; ref(&step, &attr); VLOG(10) << attr; - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector>(attr, xsrc, ht_1, ht_ref, - attr); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& xsrc, + const std::vector& ht_1, + const std::vector& ht_ref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(ht_1.size(), ht_ref.size()); + EXPECT_EQ(xsrc.size(), 3 * ht_ref.size()); + + // x could be changed after compute, so copy to save src + int d = ht_ref.size(); + std::vector x(xsrc.size()), ht(ht_ref.size()); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + const T* ht_1_data = ht_1.data(); + const T* ht_ref_data = ht_ref.data(); + T* x_data = x.data(); + T* ht_data = ht.data(); + jit::gru_t step; + step.gates = x_data; + step.ht_1 = ht_1_data; + step.ht = ht_data; + tgt(&step, &attr); + ExpectEQ(ht_data, ht_ref_data, d); + }; + TestAllImpls(attr, verifier, xsrc, ht_1, ht_ref, + attr); } } } } -template -void TestKernelSeqPoolTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelNCHW16CMulNC() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + const int n = 3, c = 16 * 4, h = 10, w = 10; + auto ref = jit::GetRefer(); + EXPECT_TRUE(ref != nullptr); + int sz 
= n * c * h * w; + std::vector x(sz), y(n * c), zref(sz); + std::vector ztgt(sz), zjit(sz); + RandomVec(sz, x.data()); + RandomVec(n * c, y.data()); + + const T* x_data = x.data(); + const T* y_data = y.data(); + T* zref_data = zref.data(); + T* ztgt_data = ztgt.data(); + T* zjit_data = zjit.data(); + constexpr int simd_width = ZMM_FLOAT_BLOCK; + int C = c / simd_width; + auto tgt = jit::KernelFuncs::Cache().At(0); + auto jitcode = jit::GetJitCode(0); + EXPECT_TRUE(tgt != nullptr); + + if (std::is_same::value && + paddle::platform::MayIUse(paddle::platform::avx512f)) { + EXPECT_TRUE(jitcode != nullptr); + } + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < C; ci++) { + auto ptr_x = + x_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + auto ptr_y = y_data + ni * C * simd_width + ci * simd_width; + auto ptr_zref = + zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + auto ptr_ztgt = + ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + + ref(ptr_x, ptr_y, ptr_zref, h, w); + tgt(ptr_x, ptr_y, ptr_ztgt, h, w); + + if (jitcode) { + auto ptr_zjit = + zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + jitcode(ptr_x, ptr_y, ptr_zjit, h, w); + } + } + } + ExpectEQ(ztgt_data, zref_data, sz); + if (jitcode) { + ExpectEQ(zjit_data, zref_data, sz); + } +} + +template +void TestKernelLayerNorm() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + const T epsilon = 9.99999975e-06; + for (int n : {1, 2, 10}) { + for (int x_dim_0 : {1, 9, 17, 50}) { + int left = n * x_dim_0; + for (int x_dim_1 : TestSizes()) { + int right = x_dim_1; + auto ref = jit::GetRefer(); + EXPECT_TRUE(ref != nullptr); + int sz = left * right; + std::vector x(sz), mean(left), var(left), scale(right), bias(right), + outref(sz); + RandomVec(sz, x.data()); + RandomVec(left, mean.data()); + RandomVec(left, var.data()); + RandomVec(right, scale.data()); + RandomVec(right, bias.data()); + + const T* scale_data = scale.data(); + const T* bias_data = bias.data(); + T* x_data = x.data(); + T* mean_data = mean.data(); + T* var_data = var.data(); + T* outref_data = outref.data(); + + ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data, + left, epsilon, right); + + auto verifier = []( + const typename KernelTuple::func_type tgt, const std::vector& x_, + const std::vector& outref_, const std::vector& mean_, + const std::vector& var_, const std::vector& scale, + const std::vector& bias, const int& left, const float& epsilon, + const typename KernelTuple::attr_type& right) { + EXPECT_TRUE(tgt != nullptr); + std::vector outtgt(outref_.size()); + std::vector x(x_.size()); + std::vector mean(mean_.size()); + std::vector var(var_.size()); + std::vector outref(outref_.size()); + std::copy(x_.begin(), x_.end(), x.begin()); + std::copy(mean_.begin(), mean_.end(), mean.begin()); + std::copy(var_.begin(), var_.end(), var.begin()); + std::copy(outref_.begin(), outref_.end(), outref.begin()); + + EXPECT_EQ(x.size(), static_cast(left * right)); + EXPECT_EQ(outref.size(), static_cast(left * right)); + EXPECT_EQ(mean.size(), static_cast(left)); + EXPECT_EQ(var.size(), static_cast(left)); + EXPECT_EQ(scale.size(), static_cast(right)); + EXPECT_EQ(bias.size(), static_cast(right)); + + const T* scale_data = scale.data(); + const T* bias_data = bias.data(); + T* x_data = x.data(); + T* mean_data = mean.data(); + T* var_data = var.data(); + T* outref_data = outref.data(); + T* outtgt_data = 
outtgt.data(); + tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data, + left, epsilon, right); + ExpectEQ(outtgt_data, outref_data, left * right); + }; + TestAllImpls(right, verifier, x, outref, mean, + var, scale, bias, left, epsilon, + right); + } + } + } +} + +template +void TestKernelCRFDecoding() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + constexpr int state_trans_base_idx = 2; + auto test_sizes = TestSizes(); + test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000)); + for (int seq_len : {1, 11, 17, 50}) { + for (int tag_num : test_sizes) { + auto ref = jit::GetRefer(); + EXPECT_TRUE(ref != nullptr); + int x_sz = seq_len * tag_num; + int w_sz = (tag_num + state_trans_base_idx) * tag_num; + std::vector x(x_sz), w(w_sz), alpharef(x_sz); + std::vector trackref(x_sz); + RandomVec(x_sz, x.data()); + RandomVec(w_sz, w.data()); + + ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(), + trackref.data(), tag_num); + + auto verifier = []( + const typename KernelTuple::func_type tgt, const int& seq_len, + const std::vector& x, const std::vector& w, + const std::vector& alpharef, const std::vector& trackref, + const typename KernelTuple::attr_type& tag_num) { + constexpr int state_trans_base_idx = 2; + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(x.size(), static_cast(seq_len * tag_num)); + EXPECT_EQ(w.size(), static_cast( + (tag_num + state_trans_base_idx) * tag_num)); + EXPECT_EQ(alpharef.size(), static_cast(seq_len * tag_num)); + EXPECT_EQ(trackref.size(), static_cast(seq_len * tag_num)); + std::vector alphatgt(alpharef.size()); + std::vector tracktgt(trackref.size()); + memcpy(tracktgt.data(), trackref.data(), tag_num * sizeof(int)); + tgt(seq_len, (const T*)x.data(), (const T*)w.data(), alphatgt.data(), + tracktgt.data(), tag_num); + ExpectEQ(alpharef.data(), alphatgt.data(), seq_len * tag_num); + ExpectEQ(trackref.data(), tracktgt.data(), seq_len * tag_num); + }; + TestAllImpls(tag_num, verifier, seq_len, x, w, + alpharef, trackref, tag_num); + } + } +} + +template +void TestKernelSeqPool() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); std::vector pool_types = { jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; auto test_sizes = TestSizes(); @@ -668,7 +606,7 @@ void TestKernelSeqPoolTuples() { jit::seq_pool_attr_t attr(w, type); for (int h : test_sizes) { attr.h = h; - auto ref = jit::GetRefer>(); + auto ref = jit::GetRefer(); EXPECT_TRUE(ref != nullptr); std::vector x(h * w), yref(w); RandomVec(h * w, x.data()); @@ -676,16 +614,86 @@ void TestKernelSeqPoolTuples() { T* yref_data = yref.data(); ref(x_data, yref_data, &attr); VLOG(10) << attr; - TestAllImpls, PlaceType, std::vector, - std::vector>(attr, x, yref, attr); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& yref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(x.size() % yref.size(), static_cast(0)); + int w = yref.size(); + std::vector y(w); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + T* y_data = y.data(); + tgt(x_data, y_data, &attr); + ExpectEQ(y_data, yref_data, w); + }; + TestAllImpls(attr, verifier, x, yref, attr); + } + } + } +} + +template +void TestKernelEmbSeqPool() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << 
jit::to_string(KernelTuple::kernel_type); + int64_t tbl_h = 1e4; + std::vector pool_types = { + jit::SeqPoolType::kSum}; // only support sum yet + auto test_sizes = TestSizes(); + test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); + for (int tbl_w : test_sizes) { + std::vector table(tbl_h * tbl_w); + RandomVec(tbl_h * tbl_w, table.data()); + const T* table_data = table.data(); + for (auto type : pool_types) { + for (int idx_w : {1, 2, 10, 16}) { + for (int idx_h : {1, 2, 9, 13, 16}) { + auto ref = jit::GetRefer(); + EXPECT_TRUE(ref != nullptr); + std::vector idx(idx_h * idx_w); + RandomVec(idx_h * idx_w, idx.data(), 0, tbl_h - 1); + int64_t out_w = tbl_w * idx_w; + std::vector oref(out_w); + const int64_t* idx_data = idx.data(); + T* o_data = oref.data(); + jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w, + type); + ref(table_data, idx_data, o_data, &attr); + + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& table, + const std::vector& idx, + const std::vector& oref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(table.size(), static_cast(attr.table_height * + attr.table_width)); + EXPECT_EQ(idx.size(), static_cast(attr.index_height * + attr.index_width)); + EXPECT_EQ(oref.size(), + static_cast(attr.table_width * attr.index_width)); + const T* table_data = table.data(); + const int64_t* idx_data = idx.data(); + const T* oref_data = oref.data(); + int o_w = oref.size(); + std::vector out(o_w); + T* o_data = out.data(); + tgt(table_data, idx_data, o_data, &attr); + ExpectEQ(o_data, oref_data, o_w); + }; + TestAllImpls(attr, verifier, table, idx, oref, + attr); + } } } } } -template -void TestKernelMatMulTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelMatMul() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); auto last_acc = FLAGS_acc; // export MKL_CBWR=AVX would make MKL force to use AVX // export KMP_DETERMINISTIC_REDUCTION=yes would make the result deterministic @@ -693,7 +701,7 @@ void TestKernelMatMulTuples() { for (int m : {1, 2, 3, 4}) { for (int n : {1, 2, 3, 4}) { for (int k : TestSizes()) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetRefer(); EXPECT_TRUE(ref != nullptr); std::vector a(m * k), b(k * n), c(m * n); RandomVec(m * k, a.data()); @@ -703,20 +711,36 @@ void TestKernelMatMulTuples() { T* c_data = c.data(); const jit::matmul_attr_t attr{m, n, k}; ref(a_data, b_data, c_data, &attr); - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector>(attr, a, b, c, attr); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& a, const std::vector& b, + const std::vector& cref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(a.size(), static_cast(attr.m * attr.k)); + EXPECT_EQ(b.size(), static_cast(attr.k * attr.n)); + EXPECT_EQ(cref.size(), static_cast(attr.m * attr.n)); + std::vector c(cref.size()); + const T* a_data = a.data(); + const T* b_data = b.data(); + const T* cref_data = cref.data(); + T* c_data = c.data(); + tgt(a_data, b_data, c_data, &attr); + ExpectEQ(c_data, cref_data, attr.m * attr.n); + }; + TestAllImpls(attr, verifier, a, b, c, attr); } } } FLAGS_acc = last_acc; } -template -void TestKernelSoftmaxTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelSoftmax() { + using T = typename 
KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int bs : {1, 2, 10}) { for (int n : TestSizes()) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetRefer(); EXPECT_TRUE(ref != nullptr); std::vector x(bs * n), y(bs * n); RandomVec(bs * n, x.data()); @@ -730,51 +754,33 @@ void TestKernelSoftmaxTuples() { ref(xinp_data, xinp_data, n, bs); ExpectEQ(xinp_data, y_data, n * bs); - TestAllImpls, PlaceType, std::vector, - std::vector>(n, x, y, n, bs); - } - } -} - -template -void TestKernelEmbSeqPoolTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - int64_t tbl_h = 1e4; - std::vector pool_types = { - jit::SeqPoolType::kSum}; // only support sum yet - auto test_sizes = TestSizes(); - test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); - for (int tbl_w : test_sizes) { - std::vector table(tbl_h * tbl_w); - RandomVec(tbl_h * tbl_w, table.data()); - const T* table_data = table.data(); - for (auto type : pool_types) { - for (int idx_w : {1, 2, 10, 16}) { - for (int idx_h : {1, 2, 9, 13, 16}) { - auto ref = jit::GetRefer>(); - EXPECT_TRUE(ref != nullptr); - std::vector idx(idx_h * idx_w); - RandomVec(idx_h * idx_w, idx.data(), 0, tbl_h - 1); - int64_t out_w = tbl_w * idx_w; - std::vector oref(out_w); - const int64_t* idx_data = idx.data(); - T* o_data = oref.data(); - jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w, - type); - ref(table_data, idx_data, o_data, &attr); - - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector>(attr, table, idx, - oref, attr); - } - } + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& yref, + int n, int bs) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + EXPECT_EQ(x.size(), static_cast(n * bs)); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + std::vector ytgt(n * bs); + T* ytgt_data = ytgt.data(); + // test normal + tgt(x_data, ytgt_data, n, bs); + ExpectEQ(ytgt_data, yref_data, n * bs); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(ytgt_data, ytgt_data, n, bs); + ExpectEQ(ytgt_data, yref_data, n * bs); + }; + TestAllImpls(n, verifier, x, y, n, bs); } } } -template -void TestKernelSgdTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelSgd() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); const T lr = 0.1; auto UnDuplicatedRandomVec = [](int n, const int64_t lower, const int64_t upper) -> std::vector { @@ -802,7 +808,7 @@ void TestKernelSgdTuples() { RandomVec(rows_size * grad_w, grad.data()); const int64_t* rows_data = rows.data(); const T* grad_data = grad.data(); - auto ref = jit::GetRefer>(); + auto ref = jit::GetRefer(); EXPECT_TRUE(ref != nullptr); jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size); ref(&lr, param_data, grad_data, rows_data, out_data, &attr); @@ -818,199 +824,150 @@ void TestKernelSgdTuples() { grad_w); } - TestAllImpls, PlaceType, T, std::vector, - std::vector, std::vector, std::vector>( - attr, lr, param, grad, rows, param_out, attr); - } - } - } -} - -template -void TestKernelNCHW16CMulNCTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - const int n = 3, c = 16 * 4, h = 10, w = 10; - auto ref = jit::GetRefer>(); - EXPECT_TRUE(ref != nullptr); - int sz = n * c * h * w; - std::vector x(sz), y(n * c), zref(sz); - std::vector ztgt(sz), 
zjit(sz); - RandomVec(sz, x.data()); - RandomVec(n * c, y.data()); - - const T* x_data = x.data(); - const T* y_data = y.data(); - T* zref_data = zref.data(); - T* ztgt_data = ztgt.data(); - T* zjit_data = zjit.data(); - constexpr int simd_width = ZMM_FLOAT_BLOCK; - int C = c / simd_width; - auto tgt = - jit::KernelFuncs, PlaceType>::Cache().At( - 0); - auto jitcode = jit::GetJitCode, PlaceType>(0); - EXPECT_TRUE(tgt != nullptr); - - if (std::is_same::value && - paddle::platform::MayIUse(paddle::platform::avx512f)) { - EXPECT_TRUE(jitcode != nullptr); - } - for (int ni = 0; ni < n; ni++) { - for (int ci = 0; ci < C; ci++) { - auto ptr_x = - x_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - auto ptr_y = y_data + ni * C * simd_width + ci * simd_width; - auto ptr_zref = - zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - auto ptr_ztgt = - ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - - ref(ptr_x, ptr_y, ptr_zref, h, w); - tgt(ptr_x, ptr_y, ptr_ztgt, h, w); - - if (jitcode) { - auto ptr_zjit = - zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - jitcode(ptr_x, ptr_y, ptr_zjit, h, w); - } - } - } - ExpectEQ(ztgt_data, zref_data, sz); - if (jitcode) { - ExpectEQ(zjit_data, zref_data, sz); - } -} - -template -void TestKernelLayerNormTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - const T epsilon = 9.99999975e-06; - for (int n : {1, 2, 10}) { - for (int x_dim_0 : {1, 9, 17, 50}) { - int left = n * x_dim_0; - for (int x_dim_1 : TestSizes()) { - int right = x_dim_1; - auto ref = jit::GetRefer>(); - EXPECT_TRUE(ref != nullptr); - int sz = left * right; - std::vector x(sz), mean(left), var(left), scale(right), bias(right), - outref(sz); - RandomVec(sz, x.data()); - RandomVec(left, mean.data()); - RandomVec(left, var.data()); - RandomVec(right, scale.data()); - RandomVec(right, bias.data()); - - const T* scale_data = scale.data(); - const T* bias_data = bias.data(); - T* x_data = x.data(); - T* mean_data = mean.data(); - T* var_data = var.data(); - T* outref_data = outref.data(); - - ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data, - left, epsilon, right); + auto verifier = []( + const typename KernelTuple::func_type tgt, const T lr, + const std::vector& param, const std::vector& grad, + const std::vector& rows, const std::vector& oref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(param.size(), + static_cast(attr.param_height * attr.param_width)); + EXPECT_EQ(grad.size(), + static_cast(attr.grad_height * attr.grad_width)); + EXPECT_EQ(rows.size(), static_cast(attr.selected_rows_size)); + EXPECT_EQ(param.size(), oref.size()); + const T* param_data = param.data(); + const T* grad_data = grad.data(); + const int64_t* rows_data = rows.data(); + const T* oref_data = oref.data(); + + std::vector out(oref.size()); + T* o_data = out.data(); + tgt(&lr, param_data, grad_data, rows_data, o_data, &attr); + // only the selected rows should be equal + for (size_t i = 0; i < rows.size(); ++i) { + ExpectEQ(o_data + rows[i] * attr.grad_width, + oref_data + rows[i] * attr.grad_width, attr.grad_width); + } - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector, std::vector, - std::vector, std::vector, int, float>( - right, x, outref, mean, var, scale, bias, left, epsilon, right); + // inplace + std::copy(param.begin(), param.end(), out.begin()); + tgt(&lr, o_data, grad_data, rows_data, o_data, &attr); + for (size_t i = 0; i < rows.size(); ++i) 
{ + ExpectEQ(o_data + rows[i] * attr.grad_width, + oref_data + rows[i] * attr.grad_width, attr.grad_width); + } + }; + TestAllImpls(attr, verifier, lr, param, grad, + rows, param_out, attr); } } } } -template -void TestKernelCRFDecodingTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - constexpr int state_trans_base_idx = 2; - auto test_sizes = TestSizes(); - test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000)); - for (int seq_len : {1, 11, 17, 50}) { - for (int tag_num : test_sizes) { - auto ref = jit::GetRefer>(); - EXPECT_TRUE(ref != nullptr); - int x_sz = seq_len * tag_num; - int w_sz = (tag_num + state_trans_base_idx) * tag_num; - std::vector x(x_sz), w(w_sz), alpharef(x_sz); - std::vector trackref(x_sz); - RandomVec(x_sz, x.data()); - RandomVec(w_sz, w.data()); - - ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(), - trackref.data(), tag_num); - - TestAllImpls, PlaceType, int, - std::vector, std::vector, std::vector, - std::vector, int>(tag_num, seq_len, x, w, alpharef, - trackref, tag_num); - } - } -} - -template -void TestKernelVBroadcastTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelVBroadcast() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int w : TestSizes()) { std::vector x(w); RandomVec(w, x.data()); const T* x_data = x.data(); for (int64_t h : {1, 2, 6}) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetRefer(); EXPECT_TRUE(ref != nullptr); std::vector y(w * h); T* y_data = y.data(); ref(x_data, y_data, h, w); - TestAllImpls, PlaceType, std::vector, - std::vector, int64_t>(static_cast(w), x, y, h, - static_cast(w)); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& yref, + const int64_t& h, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(x.size(), static_cast(attr)); + EXPECT_EQ(yref.size(), x.size() * h); + std::vector y(yref.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + T* y_data = y.data(); + tgt(x_data, y_data, h, attr); + ExpectEQ(y_data, yref_data, yref.size()); + }; + TestAllImpls(static_cast(w), verifier, x, + y, h, static_cast(w)); } } } -#define TEST_CPU_KERNEL(test_tuple, kernel_type) \ - TEST(JITKernel, kernel_type) { \ - TestKernel##test_tuple(); \ - TestKernel##test_tuple(); \ +#define TestKernelVMul TestKernelXYZN +#define TestKernelVAdd TestKernelXYZN +#define TestKernelVAddRelu TestKernelXYZN +#define TestKernelVSub TestKernelXYZN + +#define TestKernelVScal TestKernelAXYN +#define TestKernelVAddBias TestKernelAXYN + +#define TestKernelVRelu TestKernelXYN +#define TestKernelVIdentity TestKernelXYN +#define TestKernelVSquare TestKernelXYN +#define TestKernelVExp TestKernelXYN +#define TestKernelVSigmoid TestKernelXYN +#define TestKernelVTanh TestKernelXYN +#define TestKernelVCopy TestKernelXYN + +#define TestKernelHMax TestKernelXRN +#define TestKernelHSum TestKernelXRN + +#define TestKernelLSTMCtHt TestKernelLSTM +#define TestKernelLSTMC1H1 TestKernelLSTM + +#define TestKernelGRUH1 TestKernelGRU +#define TestKernelGRUHtPart1 TestKernelGRU +#define TestKernelGRUHtPart2 TestKernelGRU + +#define TEST_CPU_KERNEL(kernel_type) \ + TEST(JITKernel, kernel_type) { \ + TestKernel##kernel_type, CPUPlace>(); \ + TestKernel##kernel_type, CPUPlace>(); \ } -TEST_CPU_KERNEL(XYZNTuples, kVMul); -TEST_CPU_KERNEL(XYZNTuples, kVAdd); 
-TEST_CPU_KERNEL(XYZNTuples, kVAddRelu); -TEST_CPU_KERNEL(XYZNTuples, kVSub); +TEST_CPU_KERNEL(VMul); +TEST_CPU_KERNEL(VAdd); +TEST_CPU_KERNEL(VAddRelu); +TEST_CPU_KERNEL(VSub); -TEST_CPU_KERNEL(AXYNTuples, kVScal); -TEST_CPU_KERNEL(AXYNTuples, kVAddBias); +TEST_CPU_KERNEL(VScal); +TEST_CPU_KERNEL(VAddBias); -TEST_CPU_KERNEL(XRNTuples, kHMax); -TEST_CPU_KERNEL(XRNTuples, kHSum); +TEST_CPU_KERNEL(VRelu); +TEST_CPU_KERNEL(VIdentity); +TEST_CPU_KERNEL(VSquare); +TEST_CPU_KERNEL(VExp); +TEST_CPU_KERNEL(VSigmoid); +TEST_CPU_KERNEL(VTanh); +TEST_CPU_KERNEL(VCopy); -TEST_CPU_KERNEL(XYNTuples, kVRelu); -TEST_CPU_KERNEL(XYNTuples, kVIdentity); -TEST_CPU_KERNEL(XYNTuples, kVSquare); -TEST_CPU_KERNEL(XYNTuples, kVExp); -TEST_CPU_KERNEL(XYNTuples, kVSigmoid); -TEST_CPU_KERNEL(XYNTuples, kVTanh); -TEST_CPU_KERNEL(XYNTuples, kVCopy); +TEST_CPU_KERNEL(HMax); +TEST_CPU_KERNEL(HSum); -TEST_CPU_KERNEL(LSTMTuples, kLSTMCtHt); -TEST_CPU_KERNEL(LSTMTuples, kLSTMC1H1); +TEST_CPU_KERNEL(LSTMCtHt); +TEST_CPU_KERNEL(LSTMC1H1); -TEST_CPU_KERNEL(GRUTuples, kGRUH1); -TEST_CPU_KERNEL(GRUTuples, kGRUHtPart1); -TEST_CPU_KERNEL(GRUTuples, kGRUHtPart2); +TEST_CPU_KERNEL(GRUH1); +TEST_CPU_KERNEL(GRUHtPart1); +TEST_CPU_KERNEL(GRUHtPart2); -TEST_CPU_KERNEL(NCHW16CMulNCTuples, kNCHW16CMulNC); +TEST_CPU_KERNEL(NCHW16CMulNC); +TEST_CPU_KERNEL(LayerNorm); +TEST_CPU_KERNEL(CRFDecoding); -TEST_CPU_KERNEL(SeqPoolTuples, kSeqPool); -TEST_CPU_KERNEL(MatMulTuples, kMatMul); -TEST_CPU_KERNEL(SoftmaxTuples, kSoftmax); -TEST_CPU_KERNEL(EmbSeqPoolTuples, kEmbSeqPool); -TEST_CPU_KERNEL(SgdTuples, kSgd); -TEST_CPU_KERNEL(LayerNormTuples, kLayerNorm); -TEST_CPU_KERNEL(CRFDecodingTuples, kCRFDecoding); -TEST_CPU_KERNEL(VBroadcastTuples, kVBroadcast); +TEST_CPU_KERNEL(SeqPool); +TEST_CPU_KERNEL(EmbSeqPool); +TEST_CPU_KERNEL(MatMul); +TEST_CPU_KERNEL(Softmax); +TEST_CPU_KERNEL(Sgd); +TEST_CPU_KERNEL(VBroadcast); TEST(JITKernel_key, lstm) { jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); @@ -1045,16 +1002,9 @@ TEST(JITKernel_key, gru) { } TEST(JITKernel, kernel_func) { - auto f1 = - jit::KernelFuncs, CPUPlace>::Cache() - .At(3); - auto f2 = jit::KernelFuncs, - CPUPlace>::Cache()[3]; + auto f1 = jit::KernelFuncs, CPUPlace>::Cache().At(3); + auto f2 = jit::KernelFuncs, CPUPlace>::Cache()[3]; + EXPECT_TRUE(f1 != nullptr); EXPECT_TRUE(f1 == f2); - - f1 = jit::KernelFuncs, CPUPlace>::Cache() - .At(3); - f2 = jit::KernelFuncs, CPUPlace>::Cache() - .At(4); - EXPECT_TRUE(f1 != f2); + // TODO(TJ): check not equal } diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h index f0c3064d41..8627c83b43 100644 --- a/paddle/fluid/operators/layer_norm_op.h +++ b/paddle/fluid/operators/layer_norm_op.h @@ -229,9 +229,9 @@ class LayerNormKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(scale->numel(), right); PADDLE_ENFORCE_EQ(bias->numel(), right); - auto ker = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(right); + auto ker = + jit::KernelFuncs, platform::CPUPlace>::Cache() + .At(right); ker(x.data(), out.data(), mean->data(), var->data(), scale->data(), bias->data(), static_cast(left), static_cast(epsilon), right); diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h index 0ad57c51be..66ce57594a 100644 --- a/paddle/fluid/operators/math/fc_compute.h +++ b/paddle/fluid/operators/math/fc_compute.h @@ -30,17 +30,16 @@ inline void FCCompute(const BlasT& blas, const int M, return; } if (relu) { - auto compute = jit::KernelFuncs, - 
platform::CPUPlace>::Cache() - .At(N); + auto compute = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + N); for (int i = 0; i < M; i++) { T* dst = Y + i * N; compute(B, dst, dst, N); } } else { - auto compute = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(N); + auto compute = + jit::KernelFuncs, platform::CPUPlace>::Cache().At(N); #ifdef PADDLE_WITH_MKLML #pragma omp parallel for #endif diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index db103e5fab..7af44f2b2c 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -255,9 +255,9 @@ class SequencePoolFunctor { jit::seq_pool_attr_t attr( static_cast(input.numel() / input.dims()[0]), jit::SeqPoolType::kSum); - auto seqpool = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(attr); + auto seqpool = + jit::KernelFuncs, platform::CPUPlace>::Cache() + .At(attr); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { attr.h = static_cast(lod[i + 1] - lod[i]); seqpool(src, dst, &attr); diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index a1cb3f9728..d77b6712c5 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -82,8 +82,7 @@ class SoftmaxFunctor> { const int kClassDim = 1; // 2D data. Batch x C auto compute_softmax = - jit::KernelFuncs, - platform::CPUPlace>::Cache() + jit::KernelFuncs, platform::CPUPlace>::Cache() .At(in_dims[kClassDim]); compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]); } diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index 0425a3d194..5dd5f67e00 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -47,9 +47,9 @@ class SGDOpKernel : public framework::OpKernel { int64_t rows_idx = 0; T *out_data = param_out->mutable_data(ctx.GetPlace()); - auto sgd = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(attr); + auto sgd = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr); } else if (grad_var->IsType()) { // TODO(qijun): In Sparse SGD operator, in-place update is enforced. @@ -82,9 +82,9 @@ class SGDOpKernel : public framework::OpKernel { attr.selected_rows_size = grad_rows.size(); PADDLE_ENFORCE_EQ(attr.grad_width, attr.param_width); - auto sgd = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(attr); + auto sgd = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); sgd(lr, param_data, grad_data, rows_data, out_data, &attr); } else { PADDLE_THROW("Unsupported Variable Type of Grad"); From 6bb84b74b2b67dce9b4e0b397088da0cb7c3c960 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Fri, 8 Mar 2019 16:24:43 +0800 Subject: [PATCH 18/64] Change the download and compress command of cmake. 
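The archive is now fetched with wget and unpacked with cmake -E tar in an explicit
DOWNLOAD_COMMAND instead of relying on ExternalProject's URL download, and the separate
copy_directory install step is dropped. A minimal sketch of the pattern (the project
name, URL and FILENAME below are placeholders, not the exact values used in the test):

    ExternalProject_Add(
        extern_inference_demo_data
        PREFIX ${INSTALL_DIR}
        DOWNLOAD_COMMAND wget -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} &&
                         ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME}
        DOWNLOAD_DIR ${INSTALL_DIR}
        DOWNLOAD_NO_PROGRESS 1
        CONFIGURE_COMMAND ""
        BUILD_COMMAND ""
        UPDATE_COMMAND ""
        INSTALL_COMMAND "")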
test=develop --- paddle/fluid/inference/tests/test.cmake | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index 6c5fe043ff..5ceb630976 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -30,13 +30,14 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME) ${EXTERNAL_PROJECT_NAME} ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${INSTALL_DIR} - URL ${URL}/${FILENAME} + DOWNLOAD_COMMAND wget -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} && + ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME} DOWNLOAD_DIR ${INSTALL_DIR} DOWNLOAD_NO_PROGRESS 1 CONFIGURE_COMMAND "" BUILD_COMMAND "" UPDATE_COMMAND "" - INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ${UNPACK_DIR} ${INSTALL_DIR} + INSTALL_COMMAND "" ) endfunction() From aeee4cbe7149a08bf19932ed8a657d931153ca4c Mon Sep 17 00:00:00 2001 From: luotao1 Date: Fri, 8 Mar 2019 18:48:50 +0800 Subject: [PATCH 19/64] add compare between zerocopy and analysis --- .../tests/api/analyzer_pyramid_dnn_tester.cc | 8 ++- .../fluid/inference/tests/api/tester_helper.h | 60 +++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc index df834e75df..5ba553aad6 100644 --- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc @@ -134,7 +134,7 @@ TEST(Analyzer_Pyramid_DNN, profile) { TestPrediction(reinterpret_cast(&cfg), input_slots_all, &outputs, FLAGS_num_threads); - if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data && !FLAGS_zero_copy) { PADDLE_ENFORCE_EQ(outputs.size(), 1UL); size_t size = GetSize(outputs[0]); PADDLE_ENFORCE_GT(size, 0); @@ -167,6 +167,12 @@ TEST(Analyzer_Pyramid_DNN, compare) { SetInput(&input_slots_all); CompareNativeAndAnalysis( reinterpret_cast(&cfg), input_slots_all); + + // Compare AnalysisConfig and AnalysisConfig + ZeroCopy + std::vector outputs_name; + outputs_name.emplace_back("cos_sim_2.tmp_0"); + CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), + input_slots_all, outputs_name); } // Compare Deterministic result diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 3becb4bf68..9a843e8d02 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -69,6 +69,7 @@ void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { LOG(INFO) << analysis_config->ToNativeConfig(); } +// Compare result between two PaddleTensor void CompareResult(const std::vector &outputs, const std::vector &ref_outputs) { EXPECT_GT(outputs.size(), 0UL); @@ -102,6 +103,41 @@ void CompareResult(const std::vector &outputs, } } +// Compare result between a PaddleTensor and a ZeroCopyTensor +void CompareResult(const std::vector &outputs, + const std::vector &ref_outputs) { + EXPECT_GT(outputs.size(), 0UL); + EXPECT_EQ(outputs.size(), ref_outputs.size()); + for (size_t i = 0; i < outputs.size(); i++) { + auto &out = outputs[i]; + auto &ref_out = ref_outputs[i]; + size_t size = VecReduceToInt(out.shape); + EXPECT_GT(size, 0UL); + int ref_size = 0; // this is the number of elements not memory size + PaddlePlace place; + switch (out.dtype) { + case PaddleDType::INT64: { + int64_t 
*pdata = static_cast(out.data.data()); + int64_t *pdata_ref = ref_out.data(&place, &ref_size); + EXPECT_EQ(size, ref_size); + for (size_t j = 0; j < size; ++j) { + EXPECT_EQ(pdata_ref[j], pdata[j]); + } + break; + } + case PaddleDType::FLOAT32: { + float *pdata = static_cast(out.data.data()); + float *pdata_ref = ref_out.data(&place, &ref_size); + EXPECT_EQ(size, ref_size); + for (size_t j = 0; j < size; ++j) { + CHECK_LE(std::abs(pdata_ref[j] - pdata[j]), FLAGS_accuracy); + } + break; + } + } + } +} + std::unique_ptr CreateTestPredictor( const PaddlePredictor::Config *config, bool use_analysis = true) { const auto *analysis_config = @@ -377,6 +413,30 @@ void CompareNativeAndAnalysis( CompareResult(analysis_outputs, native_outputs); } +void CompareAnalysisAndZeroCopy( + PaddlePredictor::Config *config, + const std::vector> &inputs, + const std::vector &outputs_name) { + int batch_size = FLAGS_batch_size; + // analysis + std::vector analysis_outputs; + auto predictor = CreateTestPredictor(config, true); + predictor->Run(inputs[0], &analysis_outputs, batch_size); + // analysis + zero_copy + std::vector zerocopy_outputs; + reinterpret_cast(config)->SwitchUseFeedFetchOps(false); + predictor = CreateTestPredictor(config, true); + ConvertPaddleTensorToZeroCopyTensor(predictor.get(), inputs[0]); + predictor->ZeroCopyRun(); + for (size_t i = 0; i < outputs_name.size(); i++) { + ZeroCopyTensor zerocopy_output = + *predictor->GetOutputTensor(outputs_name[i]).get(); + zerocopy_outputs.emplace_back(zerocopy_output); + } + // compare + CompareResult(analysis_outputs, zerocopy_outputs); +} + template std::string LoDTensorSummary(const framework::LoDTensor &tensor) { std::stringstream ss; From 81773d0b1ca73791f4912a47862345fc4aef8b59 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 8 Mar 2019 20:14:58 +0800 Subject: [PATCH 20/64] add imperative and declarative mode testbase and example test=develop --- .../fluid/tests/unittests/test_layers.py | 93 ++++++++++++++++++- 1 file changed, 91 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index ff49c1be97..b29ad25870 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -15,13 +15,102 @@ from __future__ import print_function import unittest -import paddle.fluid.layers as layers +import contextlib +import numpy as np +import decorators + +import paddle +import paddle.fluid as fluid from paddle.fluid.layers.device import get_places import paddle.fluid.nets as nets from paddle.fluid.framework import Program, program_guard, default_main_program from paddle.fluid.param_attr import ParamAttr -import decorators +from paddle.fluid import core from paddle.fluid.initializer import Constant +import paddle.fluid.layers as layers +from test_imperative_base import new_program_scope +from paddle.fluid.imperative import nn +from paddle.fluid.imperative import base + + +class LayerTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.seed = 111 + + @classmethod + def tearDownClass(cls): + pass + + def _get_place(self): + if core.is_compiled_with_cuda(): + return core.CUDAPlace(0) + return core.CPUPlace() + + @contextlib.contextmanager + def static_graph(self): + with new_program_scope(): + fluid.default_startup_program().random_seed = self.seed + fluid.default_main_program().random_seed = self.seed + yield + + def get_static_graph_result(self, feed, fetch_list): + exe = 
fluid.Executor(self._get_place()) + exe.run(fluid.default_startup_program()) + return exe.run(fluid.default_main_program(), + feed=feed, + fetch_list=fetch_list) + + @contextlib.contextmanager + def dynamic_graph(self): + with fluid.imperative.guard(self._get_place()): + fluid.default_startup_program().random_seed = self.seed + fluid.default_main_program().random_seed = self.seed + yield + + +class TestLayer(LayerTest): + def test_relu(self): + with self.static_graph(): + t = layers.data(name='t', shape=[3, 3], dtype='float32') + ret = layers.relu(t) + static_ret = self.get_static_graph_result( + feed={'t': np.ones( + [3, 3], dtype='float32')}, fetch_list=[ret])[0] + + with self.dynamic_graph(): + t = np.ones([3, 3], dtype='float32') + dy_ret = layers.relu(base.to_variable(t)) + + self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + + def test_conv2d(self): + with self.static_graph(): + images = layers.data(name='pixel', shape=[3, 5, 5], dtype='float32') + ret = layers.conv2d(input=images, num_filters=3, filter_size=[2, 2]) + static_ret = self.get_static_graph_result( + feed={'pixel': np.ones( + [2, 3, 5, 5], dtype='float32')}, + fetch_list=[ret])[0] + + with self.static_graph(): + images = layers.data(name='pixel', shape=[3, 5, 5], dtype='float32') + conv2d = nn.Conv2D( + 'conv2d', num_channels=3, num_filters=3, filter_size=[2, 2]) + ret = conv2d(images) + static_ret2 = self.get_static_graph_result( + feed={'pixel': np.ones( + [2, 3, 5, 5], dtype='float32')}, + fetch_list=[ret])[0] + + with self.dynamic_graph(): + images = np.ones([2, 3, 5, 5], dtype='float32') + conv2d = nn.Conv2D( + 'conv2d', num_channels=3, num_filters=3, filter_size=[2, 2]) + dy_ret = conv2d(base.to_variable(images)) + + self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + self.assertTrue(np.allclose(static_ret, static_ret2)) class TestBook(unittest.TestCase): From 837ad7f86fe5de124b2d3a6c4933747fa30d92ea Mon Sep 17 00:00:00 2001 From: tink2123 Date: Sun, 10 Mar 2019 09:42:39 +0000 Subject: [PATCH 21/64] Add the inverse trigonometric function test=develop --- paddle/fluid/API.spec | 3 + paddle/fluid/operators/activation_op.cc | 27 +++++ paddle/fluid/operators/activation_op.h | 103 +++++++++++++++++- python/paddle/fluid/layers/ops.py | 3 + .../tests/unittests/test_activation_op.py | 54 +++++++++ 5 files changed, 187 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index a4e683da0b..00bd23b1fe 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -292,6 +292,7 @@ paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords= paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3')) paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b')) paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9d586a0b5bd05f67ee78048f9d503b6')) +paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7ca26a8235099486bdf243754439c6b6')) paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7')) paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13')) 
paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '072a8541e0f632366bba10f67cb0db27')) @@ -299,7 +300,9 @@ paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad')) paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973')) paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '485f2686bcc2fe37a4bd893769c8a3e2')) +paddle.fluid.layers.acos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c721122352acfc1853bffadf2d59103b')) paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '01f1766aa76eff1df30147505b59f7c4')) +paddle.fluid.layers.asin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0619b891e80f419b28016cde3d106c68')) paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b47f5da13913d3e56bdb1e612a73f3f2')) paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cc6ac2f14f03c52aaa83a59bf83b8d26')) paddle.fluid.layers.square (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '48dfb45d773dbc30126c3a7f777de5ee')) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 2feb8e4c47..cc948f2697 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -269,6 +269,27 @@ $$out = \\frac{x}{1 + \|x\|}$$ )DOC"; +UNUSED constexpr char AcosDoc[] = R"DOC( +Arccosine Activation Operator. + +$${out}_{i} = \cos^{-1}({input}_{i})$$ + +)DOC"; + +UNUSED constexpr char AsinDoc[] = R"DOC( +Arcsine Activation Operator. + +$out = \sin^{-1}({input}_{i})$ + +)DOC"; + +UNUSED constexpr char AtanDoc[] = R"DOC( +Arctanh Activation Operator. 
+ +$out = \tanh^{-1}({input}_{i})$ + +)DOC"; + class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -494,13 +515,16 @@ REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc); REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc); REGISTER_ACTIVATION_OP_MAKER(Gelu, GeluDoc); REGISTER_ACTIVATION_OP_MAKER(Tanh, TanhDoc); +REGISTER_ACTIVATION_OP_MAKER(Atan, AtanDoc); REGISTER_ACTIVATION_OP_MAKER(TanhShrink, TanhShrinkDoc); REGISTER_ACTIVATION_OP_MAKER(Sqrt, SqrtDoc); REGISTER_ACTIVATION_OP_MAKER(Abs, AbsDoc); REGISTER_ACTIVATION_OP_MAKER(Ceil, CeilDoc); REGISTER_ACTIVATION_OP_MAKER(Floor, FloorDoc); REGISTER_ACTIVATION_OP_MAKER(Cos, CosDoc); +REGISTER_ACTIVATION_OP_MAKER(Acos, AcosDoc); REGISTER_ACTIVATION_OP_MAKER(Sin, SinDoc); +REGISTER_ACTIVATION_OP_MAKER(Asin, AsinDoc); REGISTER_ACTIVATION_OP_MAKER(Round, RoundDoc); REGISTER_ACTIVATION_OP_MAKER(Reciprocal, ReciprocalDoc); REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc); @@ -543,7 +567,10 @@ namespace ops = paddle::operators; __macro(SoftShrink, softshrink); \ __macro(Abs, abs); \ __macro(Cos, cos); \ + __macro(Acos, acos); \ __macro(Sin, sin); \ + __macro(Asin, asin); \ + __macro(Atan, atan); \ __macro(Round, round); \ __macro(Log, log); \ __macro(Square, square); \ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 1f5ae7fb5c..ff7e623f6f 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -39,9 +39,8 @@ namespace operators { Please refer to the layer_helper.py and get the details. */ static std::unordered_set InplaceOpSet = { - "sigmoid", "exp", "relu", "tanh", "sqrt", "ceil", - "floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid", -}; + "sigmoid", "exp", "relu", "tanh", "sqrt", "ceil", + "floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid"}; static bool IsInplace(const std::string& op) { bool inplace = InplaceOpSet.count(op); @@ -553,6 +552,101 @@ struct SinFunctor : public BaseActivationFunctor { } }; +template +struct Acos { + HOSTDEVICE T operator()(const T& val) const { return acos(val); } +}; + +template <> +struct Acos { + HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { + return platform::float16(acos(static_cast(val))); + } +}; + +// Acos(x) = acos(x) +template +struct AcosFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Acos()); + } +}; + +// acos'(x) = -1/sqrt(1-x^2) +template +struct AcosGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + -dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); + } +}; + +template +struct Asin { + HOSTDEVICE T operator()(const T& val) const { return asin(val); } +}; + +template <> +struct Asin { + HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { + return platform::float16(asin(static_cast(val))); + } +}; + +// Asin(x) = asin(x) +template +struct AsinFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Asin()); + } +}; + +// asin'(x) = 1/sqrt(1-x^2) +template +struct AsinGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); + } +}; + +template +struct Atan { + HOSTDEVICE T operator()(const T& val) const { 
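// element-wise arctangent of a single scalar; the float16 specialization just
// below casts to float before calling atan, and AtanFunctor applies this functor
// over the whole tensor via unaryExpr.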
return atan(val); } +}; + +template <> +struct Atan { + HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { + return platform::float16(atan(static_cast(val))); + } +}; + +// Atan(x) = atan(x) +template +struct AtanFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Atan()); + } +}; + +// atan'(x) = 1 / (1 + x^2) +template +struct AtanGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(1) / (static_cast(1) + x.square()); + } +}; + // round(x) = [x] template struct RoundFunctor : public BaseActivationFunctor { @@ -1001,13 +1095,16 @@ struct SwishGradFunctor : public BaseActivationFunctor { __macro(relu, ReluFunctor, ReluGradFunctor); \ __macro(gelu, GeluFunctor, GeluGradFunctor); \ __macro(tanh, TanhFunctor, TanhGradFunctor); \ + __macro(atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(sqrt, SqrtFunctor, SqrtGradFunctor); \ __macro(abs, AbsFunctor, AbsGradFunctor); \ __macro(ceil, CeilFunctor, ZeroGradFunctor); \ __macro(floor, FloorFunctor, ZeroGradFunctor); \ __macro(cos, CosFunctor, CosGradFunctor); \ + __macro(acos, AcosFunctor, AcosGradFunctor); \ __macro(sin, SinFunctor, SinGradFunctor); \ + __macro(asin, AsinFunctor, AsinGradFunctor); \ __macro(round, RoundFunctor, ZeroGradFunctor); \ __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(log, LogFunctor, LogGradFunctor); \ diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 4381727a09..f018bb8af8 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -23,6 +23,7 @@ __activations_noattr__ = [ 'logsigmoid', 'exp', 'tanh', + 'atan', 'tanh_shrink', 'softshrink', 'sqrt', @@ -30,6 +31,8 @@ __activations_noattr__ = [ 'ceil', 'floor', 'cos', + 'acos', + 'asin', 'sin', 'round', 'reciprocal', diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index d5a8385409..d587715d60 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -100,6 +100,23 @@ class TestTanh(TestActivation): self.check_grad(['X'], 'Out', max_relative_error=0.007) +class TestAtan(TestActivation): + def setUp(self): + self.op_type = "atan" + self.init_dtype() + + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.arctan(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad(['X'], 'Out', max_relative_error=0.007) + + class TestTanhShrink(TestActivation): def setUp(self): self.op_type = "tanh_shrink" @@ -248,6 +265,23 @@ class TestCos(TestActivation): self.check_grad(['X'], 'Out', max_relative_error=0.007) +class TestAcos(TestActivation): + def setUp(self): + self.op_type = "acos" + self.init_dtype() + + x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype) + out = np.arccos(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad(['X'], 'Out', max_relative_error=0.007) + + class TestSin(TestActivation): def setUp(self): self.op_type = "sin" @@ -265,6 +299,23 @@ class TestSin(TestActivation): 
self.check_grad(['X'], 'Out', max_relative_error=0.007) +class TestAsin(TestActivation): + def setUp(self): + self.op_type = "asin" + self.init_dtype() + + x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype) + out = np.arcsin(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad(['X'], 'Out', max_relative_error=0.007) + + class TestRound(TestActivation): def setUp(self): self.op_type = "round" @@ -665,7 +716,10 @@ create_test_act_fp16_class(TestAbs) create_test_act_fp16_class(TestCeil, grad_check=False) create_test_act_fp16_class(TestFloor, grad_check=False) create_test_act_fp16_class(TestCos, grad_atol=0.85) +create_test_act_fp16_class(TestAcos, grad_atol=0.85) create_test_act_fp16_class(TestSin) +create_test_act_fp16_class(TestAsin) +create_test_act_fp16_class(TestAtan) create_test_act_fp16_class(TestRound, grad_check=False) create_test_act_fp16_class(TestRelu) create_test_act_fp16_class(TestGelu) From 45bdd84dac51b6f3fb4315b144f958e6e15c8389 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Sun, 10 Mar 2019 16:26:59 +0000 Subject: [PATCH 22/64] enhance the jitkernel helper and add unit tests test=develop --- paddle/fluid/operators/jit/benchmark.cc | 28 +-- paddle/fluid/operators/jit/gen/act.cc | 14 +- paddle/fluid/operators/jit/gen/blas.cc | 4 +- paddle/fluid/operators/jit/gen/embseqpool.cc | 2 +- paddle/fluid/operators/jit/gen/gru.cc | 2 +- paddle/fluid/operators/jit/gen/hopv.cc | 2 +- paddle/fluid/operators/jit/gen/jitcode.h | 2 +- paddle/fluid/operators/jit/gen/lstm.cc | 2 +- paddle/fluid/operators/jit/gen/matmul.cc | 2 +- paddle/fluid/operators/jit/gen/seqpool.cc | 2 +- paddle/fluid/operators/jit/gen/sgd.cc | 2 +- paddle/fluid/operators/jit/gen/vbroadcast.cc | 2 +- paddle/fluid/operators/jit/gen_base.cc | 2 +- paddle/fluid/operators/jit/gen_base.h | 7 +- paddle/fluid/operators/jit/helper.h | 116 +++++++--- paddle/fluid/operators/jit/kernel_base.h | 7 +- paddle/fluid/operators/jit/kernel_key.cc | 3 + .../jit/more/intrinsic/crf_decoding.cc | 2 +- .../jit/more/intrinsic/crf_decoding.h | 3 +- .../jit/more/intrinsic/layer_norm.cc | 2 +- .../operators/jit/more/intrinsic/layer_norm.h | 3 +- paddle/fluid/operators/jit/more/mix/mix.cc | 16 +- paddle/fluid/operators/jit/more/mix/mix.h | 12 +- paddle/fluid/operators/jit/more/mkl/mkl.cc | 47 ++-- paddle/fluid/operators/jit/more/mkl/mkl.h | 14 +- paddle/fluid/operators/jit/registry.h | 4 +- paddle/fluid/operators/jit/test.cc | 208 ++++++++++++++---- 27 files changed, 328 insertions(+), 182 deletions(-) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 773cf38eb9..fbb04a166e 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -111,33 +111,11 @@ template void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... 
args) { BenchFunc benchmark; std::vector> infos; - // test refer - auto refer = jit::GetRefer(); - if (!refer) { - LOG(FATAL) << "Refer can not be empty!"; + auto funcs = jit::GetAllCandidateFuncsWithTypes(attr); + for (auto f : funcs) { + infos.push_back(std::make_pair(f.first, benchmark(f.second, args...))); } - infos.push_back(std::make_pair("Refer", benchmark(refer, args...))); - // test jitcode - auto jitcode = jit::GetJitCode(attr); - if (jitcode) { - infos.push_back(std::make_pair("JitCode", benchmark(jitcode, args...))); - } - // test all impls in more - jit::KernelKey kkey(KernelTuple::kernel_type, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast*>(impl.get()); - if (i && i->UseMe(attr)) { - auto more = i->GetFunc(); - infos.push_back( - std::make_pair(i->ImplType(), benchmark(more, args...))); - } - } - } // Test result from Get function auto tgt = jit::KernelFuncs::Cache().At(attr); if (!tgt) { diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc index e7a7375879..5cac219f95 100644 --- a/paddle/fluid/operators/jit/gen/act.cc +++ b/paddle/fluid/operators/jit/gen/act.cc @@ -81,7 +81,7 @@ void VActJitCode::genCode() { #define DECLARE_ACT_CREATOR(name) \ class name##Creator : public JitCodeCreator { \ public: \ - bool UseMe(const int& attr) const override; \ + bool CanBeUsed(const int& attr) const override; \ size_t CodeSize(const int& d) const override; \ std::unique_ptr CreateJitCode(const int& attr) const override { \ return make_unique(attr, CodeSize(attr)); \ @@ -96,27 +96,27 @@ DECLARE_ACT_CREATOR(VSigmoid); DECLARE_ACT_CREATOR(VTanh); // TODO(TJ): tuning use me -bool VReluCreator::UseMe(const int& d) const { +bool VReluCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx); } -bool VSquareCreator::UseMe(const int& d) const { +bool VSquareCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx); } -bool VIdentityCreator::UseMe(const int& d) const { +bool VIdentityCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx); } -bool VExpCreator::UseMe(const int& d) const { +bool VExpCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx) && d < 32; } -bool VSigmoidCreator::UseMe(const int& d) const { +bool VSigmoidCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx); } -bool VTanhCreator::UseMe(const int& d) const { +bool VTanhCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx); } diff --git a/paddle/fluid/operators/jit/gen/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc index 5da24c359e..e764a7983d 100644 --- a/paddle/fluid/operators/jit/gen/blas.cc +++ b/paddle/fluid/operators/jit/gen/blas.cc @@ -142,7 +142,7 @@ void NCHW16CMulNCJitCode::genCode() { class NCHW16CMulNCCreator : public JitCodeCreator { public: - bool UseMe(const int& attr) const override { + bool CanBeUsed(const int& attr) const override { return platform::MayIUse(platform::avx512f); } size_t CodeSize(const int& d) const override { return 256 * 1024; } @@ -154,7 +154,7 @@ class NCHW16CMulNCCreator : public JitCodeCreator { #define DECLARE_BLAS_CREATOR(name) \ class name##Creator : public JitCodeCreator { \ public: \ - bool UseMe(const int& attr) const override { \ + bool CanBeUsed(const int& attr) const override { \ return platform::MayIUse(platform::avx) && 
attr <= 1024; \ } \ size_t CodeSize(const int& d) const override { \ diff --git a/paddle/fluid/operators/jit/gen/embseqpool.cc b/paddle/fluid/operators/jit/gen/embseqpool.cc index 23837a3fb9..6e8ecc07e7 100644 --- a/paddle/fluid/operators/jit/gen/embseqpool.cc +++ b/paddle/fluid/operators/jit/gen/embseqpool.cc @@ -121,7 +121,7 @@ void EmbSeqPoolJitCode::genCode() { class EmbSeqPoolCreator : public JitCodeCreator { public: - bool UseMe(const emb_seq_pool_attr_t& attr) const override { + bool CanBeUsed(const emb_seq_pool_attr_t& attr) const override { return platform::MayIUse(platform::avx) && attr.table_width % YMM_FLOAT_BLOCK == 0; } diff --git a/paddle/fluid/operators/jit/gen/gru.cc b/paddle/fluid/operators/jit/gen/gru.cc index 13f7a14111..4bc9247f6f 100644 --- a/paddle/fluid/operators/jit/gen/gru.cc +++ b/paddle/fluid/operators/jit/gen/gru.cc @@ -86,7 +86,7 @@ void GRUJitCode::genCode() { class name##Creator : public JitCodeCreator { \ public: \ /* TODO(TJ): enable more */ \ - bool UseMe(const gru_attr_t& attr) const override { \ + bool CanBeUsed(const gru_attr_t& attr) const override { \ return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \ } \ size_t CodeSize(const gru_attr_t& attr) const override { \ diff --git a/paddle/fluid/operators/jit/gen/hopv.cc b/paddle/fluid/operators/jit/gen/hopv.cc index e788401719..3383f17df8 100644 --- a/paddle/fluid/operators/jit/gen/hopv.cc +++ b/paddle/fluid/operators/jit/gen/hopv.cc @@ -76,7 +76,7 @@ void HOPVJitCode::genCode() { #define DECLARE_HOP_CREATOR(name) \ class name##Creator : public JitCodeCreator { \ public: \ - bool UseMe(const int& attr) const override { \ + bool CanBeUsed(const int& attr) const override { \ return platform::MayIUse(platform::avx); \ } \ size_t CodeSize(const int& d) const override { \ diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h index 39847d1b65..228db7cc72 100644 --- a/paddle/fluid/operators/jit/gen/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h @@ -73,7 +73,7 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator { virtual void genCode() = 0; size_t getSize() const override { return CodeGenerator::getSize(); } - const unsigned char* getCodeInternal() override { + const unsigned char* getCodeInternal() const override { const Xbyak::uint8* code = CodeGenerator::getCode(); return code; } diff --git a/paddle/fluid/operators/jit/gen/lstm.cc b/paddle/fluid/operators/jit/gen/lstm.cc index 08bafb5a81..5e7789aede 100644 --- a/paddle/fluid/operators/jit/gen/lstm.cc +++ b/paddle/fluid/operators/jit/gen/lstm.cc @@ -114,7 +114,7 @@ void LSTMJitCode::genCode() { class name##Creator : public JitCodeCreator { \ public: \ /* TODO(TJ): enable more */ \ - bool UseMe(const lstm_attr_t& attr) const override { \ + bool CanBeUsed(const lstm_attr_t& attr) const override { \ return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \ } \ size_t CodeSize(const lstm_attr_t& attr) const override { \ diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc index ae3858eab2..ca50f26ce5 100644 --- a/paddle/fluid/operators/jit/gen/matmul.cc +++ b/paddle/fluid/operators/jit/gen/matmul.cc @@ -98,7 +98,7 @@ void MatMulJitCode::genCode() { class MatMulCreator : public JitCodeCreator { public: - bool UseMe(const matmul_attr_t& attr) const override { + bool CanBeUsed(const matmul_attr_t& attr) const override { return attr.m == 1 && platform::MayIUse(platform::avx512f) && attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512; } diff --git 
a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc index 530d24ee1f..ceca104cc9 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.cc +++ b/paddle/fluid/operators/jit/gen/seqpool.cc @@ -57,7 +57,7 @@ void SeqPoolJitCode::genCode() { class SeqPoolCreator : public JitCodeCreator { public: - bool UseMe(const seq_pool_attr_t& attr) const override { + bool CanBeUsed(const seq_pool_attr_t& attr) const override { return platform::MayIUse(platform::avx); } size_t CodeSize(const seq_pool_attr_t& attr) const override { diff --git a/paddle/fluid/operators/jit/gen/sgd.cc b/paddle/fluid/operators/jit/gen/sgd.cc index a745a27f95..a40da9b993 100644 --- a/paddle/fluid/operators/jit/gen/sgd.cc +++ b/paddle/fluid/operators/jit/gen/sgd.cc @@ -104,7 +104,7 @@ void SgdJitCode::genCode() { class SgdCreator : public JitCodeCreator { public: - bool UseMe(const sgd_attr_t& attr) const override { + bool CanBeUsed(const sgd_attr_t& attr) const override { return platform::MayIUse(platform::avx) && attr.grad_width % YMM_FLOAT_BLOCK == 0; } diff --git a/paddle/fluid/operators/jit/gen/vbroadcast.cc b/paddle/fluid/operators/jit/gen/vbroadcast.cc index 3f9fbdbd82..66a8d75fd4 100644 --- a/paddle/fluid/operators/jit/gen/vbroadcast.cc +++ b/paddle/fluid/operators/jit/gen/vbroadcast.cc @@ -69,7 +69,7 @@ void VBroadcastJitCode::genCode() { class VBroadcastCreator : public JitCodeCreator { public: - bool UseMe(const int64_t& w) const override { + bool CanBeUsed(const int64_t& w) const override { return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0; } size_t CodeSize(const int64_t& w) const override { diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc index f3603875ad..4c49eff49e 100644 --- a/paddle/fluid/operators/jit/gen_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -31,7 +31,7 @@ namespace paddle { namespace operators { namespace jit { -// refer do not need useme, it would be the last one. +// refer do not need CanBeUsed, it would be the last one. void GenBase::dumpCode(const unsigned char* code) const { if (code) { static int counter = 0; diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index a7c7a35a7e..033c603c07 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -31,9 +31,10 @@ class GenBase : public Kernel { virtual ~GenBase() = default; virtual std::string name() const = 0; virtual size_t getSize() const = 0; - virtual const unsigned char* getCodeInternal() = 0; + virtual const unsigned char* getCodeInternal() const = 0; + const char* ImplType() const override { return "JitCode"; } template - Func getCode() { + Func getCode() const { const unsigned char* code = this->getCodeInternal(); if (FLAGS_dump_jitcode) { this->dumpCode(code); @@ -65,7 +66,7 @@ class JitCodeCreator : public GenCreator { virtual ~JitCodeCreator() = default; // condition when this jit code can be used. 
- virtual bool UseMe(const Attr& attr) const = 0; + virtual bool CanBeUsed(const Attr& attr) const = 0; // estimate this code size virtual size_t CodeSize(const Attr& attr) const = 0; diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 85f4072dd3..d98eada81c 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -14,9 +14,6 @@ #pragma once -extern "C" { -#include -} #include #include #include @@ -36,31 +33,30 @@ template inline typename std::enable_if< std::is_same::value && std::is_same::value, - typename KernelTuple::func_type>::type + const Kernel*>::type GetJitCode(const typename KernelTuple::attr_type& attr) { - using Func = typename KernelTuple::func_type; using Attr = typename KernelTuple::attr_type; size_t key = JitCodeKey(attr); - auto& codes = JitCodePool().Instance(); + auto& codes = JitCodePool::Instance(); if (codes.Has(key)) { - return codes.AllKernels().at(key)->template getCode(); + return codes.AllKernels().at(key).get(); } // creator is not related with attr, so can use KernelKey as key KernelKey kkey(KernelTuple::kernel_type, PlaceType()); // pool: (KernelKey(type, place), vector) - auto& creator_map = JitCodeCreatorPool().Instance().AllCreators(); + auto& creator_map = JitCodeCreatorPool::Instance().AllCreators(); auto iter = creator_map.find(kkey); if (iter != creator_map.end()) { auto& creators = iter->second; for (auto& cur : creators) { auto i = dynamic_cast*>(cur.get()); - if (i && i->UseMe(attr)) { + if (i && i->CanBeUsed(attr)) { auto p = i->CreateJitCode(attr); if (p) { - auto f = p->template getCode(); + auto res = p.get(); codes.Insert(key, std::move(p)); - return f; + return res; } } } @@ -72,7 +68,7 @@ template inline typename std::enable_if< !std::is_same::value || !std::is_same::value, - typename KernelTuple::func_type>::type + const Kernel*>::type GetJitCode(const typename KernelTuple::attr_type& attr) { return nullptr; } @@ -80,8 +76,8 @@ GetJitCode(const typename KernelTuple::attr_type& attr) { // Refer code do not related with attr, which is just for cast // Refer is always on CPUPlace template -inline typename KernelTuple::func_type GetRefer() { - auto& ref_pool = ReferKernelPool().Instance().AllKernels(); +inline const Kernel* GetReferKernel() { + auto& ref_pool = ReferKernelPool::Instance().AllKernels(); KernelKey kkey(KernelTuple::kernel_type, platform::CPUPlace()); auto ref_iter = ref_pool.find(kkey); PADDLE_ENFORCE(ref_iter != ref_pool.end(), @@ -90,36 +86,93 @@ inline typename KernelTuple::func_type GetRefer() { for (auto& impl : ref_impls) { auto i = dynamic_cast*>(impl.get()); if (i) { - return i->GetFunc(); + return i; } } return nullptr; } -template -typename KernelTuple::func_type Get( +template +inline typename KernelTuple::func_type GetReferFunc() { + auto ker = GetReferKernel(); + auto p = dynamic_cast*>(ker); + PADDLE_ENFORCE(p, "The Refer kernel should exsit"); + return p->GetFunc(); +} + +// Return all Kernels that can be used +template +std::vector GetAllCandidateKernels( const typename KernelTuple::attr_type& attr) { - auto jitfunc = GetJitCode(attr); - if (jitfunc) { - return jitfunc; + // the search order shoudl be jitcode > more > refer + std::vector res; + auto jitker = GetJitCode(attr); + if (jitker) { + res.emplace_back(jitker); } - // pool: (KernelKey(type, place), vector) + // more kernelpool: (KernelKey(type, place), vector) KernelKey kkey(KernelTuple::kernel_type, PlaceType()); - auto& pool = KernelPool().Instance().AllKernels(); + auto& pool = 
KernelPool::Instance().AllKernels(); auto iter = pool.find(kkey); if (iter != pool.end()) { auto& impls = iter->second; for (auto& impl : impls) { auto i = dynamic_cast*>(impl.get()); - if (i && i->UseMe(attr)) { - return i->GetFunc(); + if (i && i->CanBeUsed(attr)) { + res.emplace_back(i); } } } // The last implementation should be reference function on CPUPlace. - return GetRefer(); + auto ref = GetReferKernel(); + PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty."); + res.emplace_back(ref); + return res; +} + +template +std::vector> +GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) { + using Func = typename KernelTuple::func_type; + auto kers = GetAllCandidateKernels(attr); + std::vector> res; + for (auto k : kers) { + std::string name = k->ImplType(); + if (name == "JitCode") { + auto i = dynamic_cast(k); + PADDLE_ENFORCE(i, "jitcode kernel cast can not fail."); + res.emplace_back(std::make_pair(name, i->template getCode())); + } else { + auto i = dynamic_cast*>(k); + PADDLE_ENFORCE(i, "kernel cast can not fail."); + res.emplace_back(std::make_pair(name, i->GetFunc())); + } + } + return res; +} + +template +std::vector GetAllCandidateFuncs( + const typename KernelTuple::attr_type& attr) { + auto funcs = GetAllCandidateFuncsWithTypes(attr); + std::vector res; + for (auto& i : funcs) { + res.emplace_back(i.second); + } + return res; +} + +template +typename KernelTuple::func_type GetDefaultBestFunc( + const typename KernelTuple::attr_type& attr) { + auto funcs = GetAllCandidateFuncs(attr); + PADDLE_ENFORCE_GE(funcs.size(), 1UL); + // Here could do some runtime benchmark of this attr and return the best one. + // But yet just get the first one as the default best one, + // which is searched in order and tuned by offline. + return funcs[0]; } template @@ -134,17 +187,13 @@ class KernelFuncs { // the exposed interface to use typename KernelTuple::func_type At( const typename KernelTuple::attr_type& attr) { - // XXH64: 13.8 GB/s - // TODO(TJ): change me, maybe not all attr change need one key, should be - // attrkey - int64_t key = XXH64(&attr, sizeof(typename KernelTuple::attr_type), 0); + // Maybe here is not good enough, not all kernels should have jitcode + int64_t key = JitCodeKey(attr); if (Has(key)) { return funcs_.at(key); } - // If do not have this attr in cache, - // then could run some runtime benchmark of this attr and save the best one. - // Here just get the offline benchmarked best one. 
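// Usage sketch of the new helper API (illustrative; the tuple spelling
// jit::VAddTuple<float> is assumed from the surrounding code):
//   auto funcs = jit::GetAllCandidateFuncs<jit::VAddTuple<float>, platform::CPUPlace>(d);
//   // funcs are ordered jitcode > more > refer; funcs[0] is what
//   // GetDefaultBestFunc() returns as the default best implementation.
//   auto best = jit::KernelFuncs<jit::VAddTuple<float>, platform::CPUPlace>::Cache().At(d);
//   best(x, y, z, d);  // XYZN-style (x, y, z, n) call, cached under JitCodeKey(d)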
- auto func = Get(attr); + // If do not have this attr in cache then get the default best + auto func = GetDefaultBestFunc(attr); Insert(key, func); return func; } @@ -156,7 +205,6 @@ class KernelFuncs { protected: bool Has(int64_t key) const { return funcs_.find(key) != funcs_.end(); } - void Insert(int64_t key, typename KernelTuple::func_type func) { funcs_.emplace(key, func); } diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index e8dbcced4f..bd34d7dfc7 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -302,6 +302,7 @@ class Kernel { public: Kernel() = default; virtual ~Kernel() = default; + virtual const char* ImplType() const = 0; DISABLE_COPY_AND_ASSIGN(Kernel); }; @@ -312,8 +313,8 @@ class KernelMore : public Kernel { using Func = typename KernelTuple::func_type; using Attr = typename KernelTuple::attr_type; virtual Func GetFunc() const { return func; } - virtual bool UseMe(const Attr& attr) const = 0; - virtual const char* ImplType() const = 0; + // specify this kernel can be used, means it should not fail if use it. + virtual bool CanBeUsed(const Attr& attr) const = 0; protected: Func func{nullptr}; @@ -323,7 +324,7 @@ template class ReferKernel : public KernelMore { public: // Refer code can always be used - bool UseMe(const typename KernelTuple::attr_type& attr) const override { + bool CanBeUsed(const typename KernelTuple::attr_type& attr) const override { return true; } const char* ImplType() const override { return "Refer"; } diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc index 1c2fddcae7..6987c893de 100644 --- a/paddle/fluid/operators/jit/kernel_key.cc +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -13,6 +13,7 @@ * limitations under the License. 
*/ #include "paddle/fluid/operators/jit/kernel_key.h" +#include #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -49,6 +50,8 @@ static inline int act_type_convert(KernelType type) { template <> size_t JitCodeKey(const lstm_attr_t& attr) { + // XXH64: 13.8 GB/s + size_t key = attr.d; int gate_key = act_type_convert(attr.act_gate) << 1; int cand_key = act_type_convert(attr.act_cand) << (1 + act_type_shift); diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc index 16c91f8246..1254d00189 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc +++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc @@ -161,7 +161,7 @@ void CRFDecoding(const int seq_len, const float* x, const float* w, } } -bool CRFDecodingKernel::UseMe(const int& d) const { +bool CRFDecodingKernel::CanBeUsed(const int& d) const { #ifdef __AVX512F__ constexpr int block = ZMM_FLOAT_BLOCK; #else diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h index f4187bd3ba..49b1a1fea4 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h +++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h @@ -29,7 +29,8 @@ void CRFDecoding(const int seq_len, const float* x, const float* w, class CRFDecodingKernel : public KernelMore> { public: CRFDecodingKernel() { this->func = CRFDecoding; } - bool UseMe(const typename CRFDecodingTuple::attr_type&) const override; + bool CanBeUsed( + const typename CRFDecodingTuple::attr_type&) const override; const char* ImplType() const override { return "Intrinsic"; } }; diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc index e9b6e401c6..a4e3246f10 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc +++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc @@ -153,7 +153,7 @@ void LayerNorm(float* x, float* out, float* mean, float* var, } } -bool LayerNormKernel::UseMe(const int& d) const { +bool LayerNormKernel::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx) && d >= YMM_FLOAT_BLOCK; } diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h index dfa4c2f072..7b9f676050 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h +++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h @@ -30,7 +30,8 @@ void LayerNorm(float* x, float* out, float* mean, float* var, class LayerNormKernel : public KernelMore> { public: LayerNormKernel() { this->func = LayerNorm; } - bool UseMe(const typename LayerNormTuple::attr_type&) const override; + bool CanBeUsed( + const typename LayerNormTuple::attr_type&) const override; const char* ImplType() const override { return "Intrinsic"; } }; diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index 9ee1032e95..6e709a16d2 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -204,21 +204,21 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { } // TODO(TJ): tuning me -bool VSigmoidKernel::UseMe(const int& d) const { return true; } +bool VSigmoidKernel::CanBeUsed(const int& d) const { return true; } -bool VTanhKernel::UseMe(const int& d) const { return true; } +bool VTanhKernel::CanBeUsed(const int& d) const { return true; } -bool SoftmaxKernel::UseMe(const int& d) 
const { return true; } +bool SoftmaxKernel::CanBeUsed(const int& d) const { return true; } -bool LSTMCtHtKernel::UseMe(const lstm_attr_t& attr) const { return true; } +bool LSTMCtHtKernel::CanBeUsed(const lstm_attr_t& attr) const { return true; } -bool LSTMC1H1Kernel::UseMe(const lstm_attr_t& attr) const { return true; } +bool LSTMC1H1Kernel::CanBeUsed(const lstm_attr_t& attr) const { return true; } -bool GRUH1Kernel::UseMe(const gru_attr_t& attr) const { return true; } +bool GRUH1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } -bool GRUHtPart1Kernel::UseMe(const gru_attr_t& attr) const { return true; } +bool GRUHtPart1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } -bool GRUHtPart2Kernel::UseMe(const gru_attr_t& attr) const { return true; } +bool GRUHtPart2Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } } // namespace mix } // namespace more diff --git a/paddle/fluid/operators/jit/more/mix/mix.h b/paddle/fluid/operators/jit/more/mix/mix.h index 17eb96462f..994d485909 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.h +++ b/paddle/fluid/operators/jit/more/mix/mix.h @@ -34,12 +34,12 @@ void GRUH1(gru_t* step, const gru_attr_t* attr); void GRUHtPart1(gru_t* step, const gru_attr_t* attr); void GRUHtPart2(gru_t* step, const gru_attr_t* attr); -#define DECLARE_MORE_KERNEL(name) \ - class name##Kernel : public KernelMore> { \ - public: \ - name##Kernel() { this->func = name; } \ - bool UseMe(const typename name##Tuple::attr_type&) const override; \ - const char* ImplType() const override { return "Mixed"; } \ +#define DECLARE_MORE_KERNEL(name) \ + class name##Kernel : public KernelMore> { \ + public: \ + name##Kernel() { this->func = name; } \ + bool CanBeUsed(const typename name##Tuple::attr_type&) const override; \ + const char* ImplType() const override { return "Mixed"; } \ } // XYN diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index 084ea571ce..4f600b3814 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -130,105 +130,106 @@ void ASum(const double* x, double* res, int n) { // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 template <> -bool VMulKernel::UseMe(const int& d) const { +bool VMulKernel::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx512f) && d > 512; } template <> -bool VAddKernel::UseMe(const int& d) const { +bool VAddKernel::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx) && d > 512; } template <> -bool VScalKernel::UseMe(const int& d) const { +bool VScalKernel::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx512f) && d > 512; } template <> -bool VExpKernel::UseMe(const int& d) const { +bool VExpKernel::CanBeUsed(const int& d) const { return d > 7; } template <> -bool VSquareKernel::UseMe(const int& d) const { +bool VSquareKernel::CanBeUsed(const int& d) const { return d > 7; } template <> -bool VCopyKernel::UseMe(const int& d) const { +bool VCopyKernel::CanBeUsed(const int& d) const { return d > 15; } template <> -bool VBroadcastKernel::UseMe(const int64_t& d) const { +bool VBroadcastKernel::CanBeUsed(const int64_t& d) const { return d > 127; } template <> -bool VBroadcastKernel::UseMe(const int64_t& attr) const { +bool VBroadcastKernel::CanBeUsed(const int64_t& attr) const { return true; } template <> -bool VSigmoidKernel::UseMe(const int& d) const { +bool VSigmoidKernel::CanBeUsed(const int& d) const { return d > 7; } template <> -bool 
VTanhKernel::UseMe(const int& d) const { +bool VTanhKernel::CanBeUsed(const int& d) const { return d > 7; } template <> -bool SeqPoolKernel::UseMe(const seq_pool_attr_t& attr) const { +bool SeqPoolKernel::CanBeUsed(const seq_pool_attr_t& attr) const { return true; } template <> -bool SeqPoolKernel::UseMe(const seq_pool_attr_t& attr) const { +bool SeqPoolKernel::CanBeUsed(const seq_pool_attr_t& attr) const { return true; } template <> -bool EmbSeqPoolKernel::UseMe(const emb_seq_pool_attr_t& attr) const { +bool EmbSeqPoolKernel::CanBeUsed(const emb_seq_pool_attr_t& attr) const { return true; } template <> -bool EmbSeqPoolKernel::UseMe(const emb_seq_pool_attr_t& attr) const { +bool EmbSeqPoolKernel::CanBeUsed( + const emb_seq_pool_attr_t& attr) const { return true; } template <> -bool SgdKernel::UseMe(const sgd_attr_t& attr) const { +bool SgdKernel::CanBeUsed(const sgd_attr_t& attr) const { return true; } template <> -bool SgdKernel::UseMe(const sgd_attr_t& attr) const { +bool SgdKernel::CanBeUsed(const sgd_attr_t& attr) const { return true; } template <> -bool MatMulKernel::UseMe(const matmul_attr_t& attr) const { +bool MatMulKernel::CanBeUsed(const matmul_attr_t& attr) const { return platform::MayIUse(platform::avx); } template <> -bool MatMulKernel::UseMe(const matmul_attr_t& attr) const { +bool MatMulKernel::CanBeUsed(const matmul_attr_t& attr) const { return true; } template <> -bool SoftmaxKernel::UseMe(const int& d) const { +bool SoftmaxKernel::CanBeUsed(const int& d) const { // tuned on avx2 return platform::MayIUse(platform::avx) && d < 60; } -#define AWALYS_USE_ME_WITH_DOUBLE(func) \ - template <> \ - bool func##Kernel::UseMe(const int& d) const { \ - return true; \ +#define AWALYS_USE_ME_WITH_DOUBLE(func) \ + template <> \ + bool func##Kernel::CanBeUsed(const int& d) const { \ + return true; \ } AWALYS_USE_ME_WITH_DOUBLE(VMul); diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index 8c1d8b57e0..f51dca654c 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -175,13 +175,13 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, } } -#define DECLARE_MKL_KERNEL(name) \ - template \ - class name##Kernel : public KernelMore> { \ - public: \ - name##Kernel() { this->func = name; } \ - bool UseMe(const typename name##Tuple::attr_type&) const override; \ - const char* ImplType() const override { return "MKL"; } \ +#define DECLARE_MKL_KERNEL(name) \ + template \ + class name##Kernel : public KernelMore> { \ + public: \ + name##Kernel() { this->func = name; } \ + bool CanBeUsed(const typename name##Tuple::attr_type&) const override; \ + const char* ImplType() const override { return "MKL"; } \ } // ABCMNK diff --git a/paddle/fluid/operators/jit/registry.h b/paddle/fluid/operators/jit/registry.h index cb32c48720..c8da92c0c5 100644 --- a/paddle/fluid/operators/jit/registry.h +++ b/paddle/fluid/operators/jit/registry.h @@ -49,8 +49,8 @@ struct JitKernelRegistrarFunctor { void operator()(KernelType kt) const { KernelKey kkey(kt, PlaceType()); - Pool().Instance().Insert(kkey, - std::move(make_unique())); + Pool::Instance().Insert(kkey, + std::move(make_unique())); constexpr auto size = std::tuple_size>::value; JitKernelRegistrarFunctor diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index a574bf2079..898133a03b 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -13,6 +13,7 @@ See the License for 
the specific language governing permissions and limitations under the License. */ #include +#include #include #include #include @@ -68,31 +69,11 @@ template void TestAllImpls(const typename KernelTuple::attr_type& attr, const Tester& verifier, const Args&... args) { - // test jitcode - auto jitcode = jit::GetJitCode(attr); - if (jitcode) { - VLOG(10) << "Test Jitcode Kernel "; - verifier(jitcode, args...); + auto funcs = jit::GetAllCandidateFuncsWithTypes(attr); + for (auto f : funcs) { + VLOG(10) << "Test Kernel " << f.first; + verifier(f.second, args...); } - // test all impls in more - jit::KernelKey kkey(KernelTuple::kernel_type, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast*>(impl.get()); - if (i && i->UseMe(attr)) { - auto more = i->GetFunc(); - VLOG(10) << "Test More Kernel : " << i->ImplType(); - verifier(more, args...); - } - } - } - // test result from Get function - VLOG(10) << "Test final get function "; - auto tgt = jit::KernelFuncs::Cache().At(attr); - verifier(tgt, args...); } template @@ -100,7 +81,7 @@ void TestKernelXYZN() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(d), y(d), zref(d); @@ -159,7 +140,7 @@ void TestKernelAXYN() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); const T a = static_cast(3); @@ -202,7 +183,7 @@ void TestKernelXYN() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(d), yref(d); @@ -245,7 +226,7 @@ void TestKernelXRN() { auto last_acc = FLAGS_acc; FLAGS_acc = 1e-4; for (int d : TestSizes()) { - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(d); RandomVec(d, x.data()); @@ -279,7 +260,7 @@ void TestKernelLSTM() { const jit::lstm_attr_t attr( d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand), jit::to_kerneltype(act_cell), use_peephole); - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector xsrc(4 * d), wp(3 * d), ct_1(d); std::vector ct_ref(d), ht_ref(d), checked(2 * d); @@ -370,7 +351,7 @@ void TestKernelGRU() { for (auto& act_cand : all_acts) { const jit::gru_attr_t attr(d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand)); - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector xsrc(3 * d), ht_1(d), ht_ref(d); RandomVec(3 * d, xsrc.data()); @@ -423,7 +404,7 @@ void TestKernelNCHW16CMulNC() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); const int n = 3, c = 16 * 4, h = 10, w = 10; - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); int sz = n * c * h * w; std::vector x(sz), y(n * c), zref(sz); @@ -439,7 +420,9 @@ void TestKernelNCHW16CMulNC() { constexpr int simd_width = ZMM_FLOAT_BLOCK; int C = 
c / simd_width; auto tgt = jit::KernelFuncs::Cache().At(0); - auto jitcode = jit::GetJitCode(0); + auto funcs = jit::GetAllCandidateFuncs(0); + EXPECT_GT(funcs.size(), 0UL); + auto jitcode = funcs[0]; EXPECT_TRUE(tgt != nullptr); if (std::is_same::value && @@ -482,7 +465,7 @@ void TestKernelLayerNorm() { int left = n * x_dim_0; for (int x_dim_1 : TestSizes()) { int right = x_dim_1; - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); int sz = left * right; std::vector x(sz), mean(left), var(left), scale(right), bias(right), @@ -555,7 +538,7 @@ void TestKernelCRFDecoding() { test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000)); for (int seq_len : {1, 11, 17, 50}) { for (int tag_num : test_sizes) { - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); int x_sz = seq_len * tag_num; int w_sz = (tag_num + state_trans_base_idx) * tag_num; @@ -606,7 +589,7 @@ void TestKernelSeqPool() { jit::seq_pool_attr_t attr(w, type); for (int h : test_sizes) { attr.h = h; - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(h * w), yref(w); RandomVec(h * w, x.data()); @@ -649,7 +632,7 @@ void TestKernelEmbSeqPool() { for (auto type : pool_types) { for (int idx_w : {1, 2, 10, 16}) { for (int idx_h : {1, 2, 9, 13, 16}) { - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector idx(idx_h * idx_w); RandomVec(idx_h * idx_w, idx.data(), 0, tbl_h - 1); @@ -701,7 +684,7 @@ void TestKernelMatMul() { for (int m : {1, 2, 3, 4}) { for (int n : {1, 2, 3, 4}) { for (int k : TestSizes()) { - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector a(m * k), b(k * n), c(m * n); RandomVec(m * k, a.data()); @@ -740,7 +723,7 @@ void TestKernelSoftmax() { VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int bs : {1, 2, 10}) { for (int n : TestSizes()) { - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(bs * n), y(bs * n); RandomVec(bs * n, x.data()); @@ -808,7 +791,7 @@ void TestKernelSgd() { RandomVec(rows_size * grad_w, grad.data()); const int64_t* rows_data = rows.data(); const T* grad_data = grad.data(); - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size); ref(&lr, param_data, grad_data, rows_data, out_data, &attr); @@ -874,7 +857,7 @@ void TestKernelVBroadcast() { RandomVec(w, x.data()); const T* x_data = x.data(); for (int64_t h : {1, 2, 6}) { - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector y(w * h); T* y_data = y.data(); @@ -900,6 +883,135 @@ void TestKernelVBroadcast() { } } +// test pool +TEST(JITKernel_pool, jitcreator) { + const auto& jitcreators = jit::JitCodeCreatorPool::Instance().AllCreators(); + EXPECT_EQ(jitcreators.size(), 25UL); +} + +TEST(JITKernel_pool, jitpool) { + // jitpool is related with attr + const auto& kers = jit::JitCodePool().Instance().AllKernels(); + EXPECT_EQ(kers.size(), 0UL); + jit::GetAllCandidateKernels, CPUPlace>(3); + // after call GetAllCandidateKernels, it will create jitcode Automatically + EXPECT_EQ(kers.size(), 1UL); +} + +TEST(JITKernel_pool, more) { + const auto& kers = jit::KernelPool::Instance().AllKernels(); + EXPECT_EQ(kers.size(), 21UL); +} + +TEST(JITKernel_pool, refer) { + const 
auto& kers = jit::ReferKernelPool::Instance().AllKernels(); + EXPECT_EQ(kers.size(), 29UL); +} + +// test helper +TEST(JITKernel_helper, GetAllCandidateKernels) { + auto fp_kers = + jit::GetAllCandidateKernels, CPUPlace>(10); +#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) + EXPECT_GE(fp_kers.size(), 1UL); // refer +#else + EXPECT_GE(fp_kers.size(), 3UL); // jitcode, mkl, refer +#endif + + auto db_kers = + jit::GetAllCandidateKernels, CPUPlace>(10); +#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) + EXPECT_GE(db_kers.size(), 1UL); // refer +#else + EXPECT_GE(db_kers.size(), 2UL); // mkl, refer +#endif +} + +TEST(JITKernel_helper, GetAllCandidateFuncsWithTypes) { + auto fp_kers = + jit::GetAllCandidateFuncsWithTypes, CPUPlace>(10); + EXPECT_GE(fp_kers.size(), 3UL); // jitcode, mkl, refer + + auto db_kers = + jit::GetAllCandidateFuncsWithTypes, CPUPlace>(10); + EXPECT_GE(db_kers.size(), 2UL); // mkl, refer +} + +TEST(JITKernel_helper, GetAllCandidateFuncs) { + auto funcs = jit::GetAllCandidateFuncs, CPUPlace>(10); + auto kers = jit::GetAllCandidateKernels, CPUPlace>(10); + EXPECT_EQ(funcs.size(), kers.size()); + + std::vector x(10), tgt(10); + RandomVec(10, x.data()); + auto best = jit::GetDefaultBestFunc, CPUPlace>(10); + best(x.data(), tgt.data(), 10); + for (auto f : funcs) { + std::vector y(10); + f(x.data(), y.data(), 10); + ExpectEQ(y.data(), tgt.data(), 10); + } +} + +TEST(JITKernel_helper, attr) { + std::ostringstream out; + + // KernelTypes + out << jit::to_string(jit::kNone) << jit::to_string(jit::kCRFDecoding) + << jit::to_string(jit::kEmbSeqPool) << jit::to_string(jit::kGRUH1) + << jit::to_string(jit::kGRUHtPart1) << jit::to_string(jit::kGRUHtPart2) + << jit::to_string(jit::kHSum) << jit::to_string(jit::kHMax) + << jit::to_string(jit::kLSTMCtHt) << jit::to_string(jit::kLSTMC1H1) + << jit::to_string(jit::kLayerNorm) << jit::to_string(jit::kMatMul) + << jit::to_string(jit::kNCHW16CMulNC) << jit::to_string(jit::kSeqPool) + << jit::to_string(jit::kSoftmax) << jit::to_string(jit::kVAdd) + << jit::to_string(jit::kVAddBias) << jit::to_string(jit::kVAddRelu) + << jit::to_string(jit::kVBroadcast) << jit::to_string(jit::kVCopy) + << jit::to_string(jit::kVExp) << jit::to_string(jit::kVIdentity) + << jit::to_string(jit::kVMul) << jit::to_string(jit::kVRelu) + << jit::to_string(jit::kVScal) << jit::to_string(jit::kSgd) + << jit::to_string(jit::kVSigmoid) << jit::to_string(jit::kVSquare) + << jit::to_string(jit::kVSub) << jit::to_string(jit::kVTanh); + EXPECT_EQ(out.str().size(), 234); + + // SeqPoolTypes + out.str(""); + out << jit::to_string(jit::kSum) << jit::to_string(jit::kAvg) + << jit::to_string(jit::kSqrt); + EXPECT_EQ(out.str().size(), 13); + + EXPECT_EQ(jit::to_kerneltype("relu"), jit::kVRelu); + EXPECT_EQ(jit::to_kerneltype("Identity"), jit::kVIdentity); + EXPECT_EQ(jit::to_kerneltype("VEXP"), jit::kVExp); + EXPECT_EQ(jit::to_kerneltype("SigmoiD"), jit::kVSigmoid); + EXPECT_EQ(jit::to_kerneltype("VTanh"), jit::kVTanh); + + out.str(""); + out << jit::lstm_attr_t(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); + EXPECT_EQ(out.str().size(), 89); + + out.str(""); + out << jit::gru_attr_t(8, jit::kVIdentity, jit::kVSigmoid); + EXPECT_EQ(out.str().size(), 52); + + out.str(""); + out << jit::seq_pool_attr_t(8, jit::SeqPoolType::kSum); + EXPECT_EQ(out.str().size(), 44); + + out.str(""); + out << jit::emb_seq_pool_attr_t(1, 2, 3, 4, 5, jit::SeqPoolType::kAvg); + EXPECT_EQ(out.str().size(), 93); + + out.str(""); + out << jit::sgd_attr_t(1, 2, 3, 4, 5); + 
EXPECT_EQ(out.str().size(), 81); + + out.str(""); + out << jit::matmul_attr_t(1, 2, 3); + EXPECT_EQ(out.str().size(), 14); +} + +// test kernerls #define TestKernelVMul TestKernelXYZN #define TestKernelVAdd TestKernelXYZN #define TestKernelVAddRelu TestKernelXYZN @@ -969,6 +1081,14 @@ TEST_CPU_KERNEL(Softmax); TEST_CPU_KERNEL(Sgd); TEST_CPU_KERNEL(VBroadcast); +TEST(JITKernel, kernel_func) { + auto f1 = jit::KernelFuncs, CPUPlace>::Cache().At(3); + auto f2 = jit::KernelFuncs, CPUPlace>::Cache()[3]; + EXPECT_TRUE(f1 != nullptr); + EXPECT_TRUE(f1 == f2); + // TODO(TJ): check not equal +} + TEST(JITKernel_key, lstm) { jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); jit::lstm_attr_t attr2(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); @@ -1000,11 +1120,3 @@ TEST(JITKernel_key, gru) { EXPECT_TRUE(key2 == key3); EXPECT_TRUE(key3 != key4); } - -TEST(JITKernel, kernel_func) { - auto f1 = jit::KernelFuncs, CPUPlace>::Cache().At(3); - auto f2 = jit::KernelFuncs, CPUPlace>::Cache()[3]; - EXPECT_TRUE(f1 != nullptr); - EXPECT_TRUE(f1 == f2); - // TODO(TJ): check not equal -} From 5579fae1d2717443825e7489ed8b6da1b436bd95 Mon Sep 17 00:00:00 2001 From: Tink_Y <31891223+tink2123@users.noreply.github.com> Date: Mon, 11 Mar 2019 01:01:21 +0800 Subject: [PATCH 23/64] Update activation_op.cc test=develop --- paddle/fluid/operators/activation_op.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index cc948f2697..27e1463570 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include #include "paddle/fluid/operators/activation_op.h" #include #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h" From e4e0d034594f34b6327ecdcd0f0fcbba956f18a3 Mon Sep 17 00:00:00 2001 From: tink2123 Date: Mon, 11 Mar 2019 02:12:40 +0000 Subject: [PATCH 24/64] fix format test=develop --- paddle/fluid/operators/activation_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 27e1463570..747b9ca278 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include #include "paddle/fluid/operators/activation_op.h" +#include #include +#include #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h" #include "paddle/fluid/platform/port.h" #ifdef PADDLE_WITH_CUDA From 732fa00eaf905e531d907493132ba948bf159639 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 8 Mar 2019 13:05:11 +0000 Subject: [PATCH 25/64] disable gc in recurrent_op currently test=develop --- paddle/fluid/framework/executor.cc | 38 ++++++++++++++++---------- paddle/fluid/framework/executor.h | 17 +++++++++--- paddle/fluid/operators/recurrent_op.cc | 8 ++++-- paddle/fluid/pybind/pybind.cc | 6 ++-- python/paddle/fluid/executor.py | 2 +- 5 files changed, 47 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 7eef9ec504..f3869ceb6d 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -80,11 +80,11 @@ static std::unordered_map GetNonPersistableReferenceCounts( ExecutorPrepareContext::ExecutorPrepareContext( const framework::ProgramDesc& prog, size_t block_id, - const std::vector& skip_ref_cnt_vars) - : prog_(prog), block_id_(block_id) { - if (GetEagerDeletionThreshold() >= 0) { - global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id), - skip_ref_cnt_vars); + const std::vector& keep_vars, bool force_disable_gc) + : prog_(prog), block_id_(block_id), force_disable_gc_(force_disable_gc) { + if (GetEagerDeletionThreshold() >= 0 && !force_disable_gc_) { + global_ref_cnts_ = + GetNonPersistableReferenceCounts(prog.Block(block_id), keep_vars); } } @@ -189,13 +189,15 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, } void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, - bool create_local_scope, bool create_vars) { + bool create_local_scope, bool create_vars, + const std::vector& skip_ref_cnt_vars, + bool force_disable_gc) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); #ifdef PADDLE_WITH_NGRAPH if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc); #endif - auto ctx = Prepare(pdesc, block_id); + auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars); } @@ -362,9 +364,9 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, std::unique_ptr Executor::Prepare( const ProgramDesc& program, int block_id, - const std::vector& skip_ref_cnt_vars) { - std::unique_ptr ctx( - new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars)); + const std::vector& skip_ref_cnt_vars, bool force_disable_gc) { + std::unique_ptr ctx(new ExecutorPrepareContext( + program, block_id, skip_ref_cnt_vars, force_disable_gc)); PADDLE_ENFORCE_LT(static_cast(block_id), program.Size()); auto& block = program.Block(block_id); for (auto& op_desc : block.AllOps()) { @@ -375,7 +377,8 @@ std::unique_ptr Executor::Prepare( std::vector> Executor::Prepare( const ProgramDesc& program, const std::vector& block_ids, - const std::vector>& skip_ref_cnt_vars) { + const std::vector>& skip_ref_cnt_vars, + bool force_disable_gc) { PADDLE_ENFORCE( skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(), "skip_ref_cnt_vars should be either empty or equals to block number %d", @@ -385,9 +388,11 @@ std::vector> Executor::Prepare( for (auto& bid : block_ids) { ExecutorPrepareContext* ctx; if (skip_ref_cnt_vars.empty()) { - ctx = new ExecutorPrepareContext(program, bid); + ctx = new 
ExecutorPrepareContext(program, bid, std::vector(), + force_disable_gc); } else { - ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]); + ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx], + force_disable_gc); } PADDLE_ENFORCE_LT(static_cast(bid), program.Size()); auto& block = program.Block(bid); @@ -414,7 +419,9 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr gc; - if (max_memory_size >= 0) { + // FIXME(zjl): recurrent_op is rather complex, we would + // disable gc forcely in recurrent_op + if (!ctx->force_disable_gc_ && max_memory_size >= 0) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { @@ -432,7 +439,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, #ifdef PADDLE_WITH_CUDA } #endif - if (gc && keep_kids) { + // If gc is enabled and block size > 1 + if (gc && ctx->prog_.Size() > 1) { operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(ctx->block_id_, ctx->ops_); } diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 5a040ac641..65cb9e51ab 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -15,7 +15,9 @@ limitations under the License. */ #pragma once #include +#include #include +#include #include #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/op_info.h" @@ -30,7 +32,8 @@ namespace framework { struct ExecutorPrepareContext { ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id, const std::vector& skip_ref_cnt_vars = - std::vector()); + std::vector(), + bool force_disable_gc = false); ~ExecutorPrepareContext(); @@ -38,6 +41,7 @@ struct ExecutorPrepareContext { const framework::ProgramDesc& prog_; size_t block_id_; + bool force_disable_gc_; std::vector> ops_; std::unordered_map global_ref_cnts_; @@ -66,7 +70,10 @@ class Executor { * Scope */ void Run(const ProgramDesc& prog, Scope* scope, int block_id, - bool create_local_scope = true, bool create_vars = true); + bool create_local_scope = true, bool create_vars = true, + const std::vector& skip_ref_cnt_vars = + std::vector(), + bool force_disable_gc = false); // This API is very slow. void Run(const ProgramDesc& program, Scope* scope, @@ -79,12 +86,14 @@ class Executor { static std::unique_ptr Prepare( const ProgramDesc& program, int block_id, const std::vector& skip_ref_cnt_vars = - std::vector()); + std::vector(), + bool force_disable_gc = false); static std::vector> Prepare( const ProgramDesc& program, const std::vector& block_ids, const std::vector>& skip_ref_cnt_vars = - std::vector>()); + std::vector>(), + bool force_disable_gc = false); void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id); diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index a1e02a3fd0..eb39b3119c 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -270,7 +270,9 @@ class RecurrentOp : public RecurrentBase { // Every inputs are linked now, execute! 
executor.Run(*program, &cur_scope, block->ID(), - false /*create_local_scope*/); + false /*create_local_scope*/, true /*create_vars*/, + std::vector() /*skip_ref_cnt_vars*/, + true /*force_disable_gc*/); // get device context from pool platform::DeviceContextPool &pool = @@ -385,7 +387,9 @@ class RecurrentGradOp : public RecurrentBase { VLOG(5) << "Recurrent memory linking finished "; // Run step block with cur_scope executor.Run(*program, &cur_scope, block->ID(), - false /*create_local_scope*/); + false /*create_local_scope*/, true /*create_vars*/, + std::vector() /*skip_ref_cnt_vars*/, + true /*force_disable_gc*/); VLOG(5) << "executor.Run finished "; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index cf59ff6d3b..439d9aa83f 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -876,9 +876,11 @@ All parameter, weight, gradient are variables in Paddle. .def(py::init()) .def("close", &Executor::Close) .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope, - int block_id, bool create_local_scope, bool create_vars) { + int block_id, bool create_local_scope, bool create_vars, + const std::vector &fetch_vars) { pybind11::gil_scoped_release release; - self.Run(prog, scope, block_id, create_local_scope, create_vars); + self.Run(prog, scope, block_id, create_local_scope, create_vars, + fetch_vars); }); m.def("init_gflags", framework::InitGflags); diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index dfa50e721c..cc3c0dd689 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -590,7 +590,7 @@ class Executor(object): fetch_var_name=fetch_var_name) self._feed_data(program, feed, feed_var_name, scope) - exe.run(program.desc, scope, 0, True, True) + exe.run(program.desc, scope, 0, True, True, fetch_var_name) outs = self._fetch_data(fetch_list, fetch_var_name, scope) if return_numpy: outs = as_numpy(outs) From cfc59b13e9a3bb9af39edb1acfc34c7e69791166 Mon Sep 17 00:00:00 2001 From: tink2123 Date: Mon, 11 Mar 2019 03:12:16 +0000 Subject: [PATCH 26/64] modified api.spec test=develop --- paddle/fluid/API.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 00bd23b1fe..e6d638031f 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -301,8 +301,8 @@ paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=Non paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973')) paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '485f2686bcc2fe37a4bd893769c8a3e2')) paddle.fluid.layers.acos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c721122352acfc1853bffadf2d59103b')) -paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '01f1766aa76eff1df30147505b59f7c4')) paddle.fluid.layers.asin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0619b891e80f419b28016cde3d106c68')) +paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '01f1766aa76eff1df30147505b59f7c4')) paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b47f5da13913d3e56bdb1e612a73f3f2')) 
paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cc6ac2f14f03c52aaa83a59bf83b8d26')) paddle.fluid.layers.square (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '48dfb45d773dbc30126c3a7f777de5ee')) From 9e2c7e69fb1ba277a0558b18d61676110db7049e Mon Sep 17 00:00:00 2001 From: luotao1 Date: Mon, 11 Mar 2019 11:51:08 +0800 Subject: [PATCH 27/64] simplify the zero_copy tests test=develop --- .../tests/api/analyzer_pyramid_dnn_tester.cc | 9 +- .../tests/api/analyzer_rnn1_tester.cc | 139 ++--------------- .../tests/api/analyzer_seq_pool1_tester.cc | 143 ++---------------- .../fluid/inference/tests/api/tester_helper.h | 1 + 4 files changed, 35 insertions(+), 257 deletions(-) diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc index 5ba553aad6..5157bd280d 100644 --- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc @@ -167,8 +167,15 @@ TEST(Analyzer_Pyramid_DNN, compare) { SetInput(&input_slots_all); CompareNativeAndAnalysis( reinterpret_cast(&cfg), input_slots_all); +} + +// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy +TEST(Analyzer_Pyramid_DNN, compare_zero_copy) { + AnalysisConfig cfg; + SetConfig(&cfg); - // Compare AnalysisConfig and AnalysisConfig + ZeroCopy + std::vector> input_slots_all; + SetInput(&input_slots_all); std::vector outputs_name; outputs_name.emplace_back("cos_sim_2.tmp_0"); CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 36282b3efe..dcf4b38ce8 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -207,6 +207,9 @@ void SetConfig(AnalysisConfig *cfg) { cfg->DisableGpu(); cfg->SwitchSpecifyInputNames(); cfg->SwitchIrOptim(); + if (FLAGS_zero_copy) { + cfg->SwitchUseFeedFetchOps(false); + } } void SetInput(std::vector> *inputs) { @@ -285,133 +288,17 @@ TEST(Analyzer_rnn1, multi_thread) { input_slots_all, &outputs, 2 /* multi_thread */); } -// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing -// on the complex RNN1 model. -TEST(Analyzer_rnn1, ZeroCopy) { - AnalysisConfig config; - SetConfig(&config); - config.SwitchUseFeedFetchOps(false); - - PaddlePlace place; - - auto predictor = CreatePaddlePredictor(config); - - config.SwitchUseFeedFetchOps(true); - auto native_predictor = - CreatePaddlePredictor(config.ToNativeConfig()); - - config.SwitchUseFeedFetchOps( - true); // the analysis predictor needs feed/fetch. 
- auto analysis_predictor = CreatePaddlePredictor(config); - -#define NEW_TENSOR(name__) \ - auto name__##_tensor = predictor->GetInputTensor(#name__); - NEW_TENSOR(data_lod_attention); - NEW_TENSOR(cell_init); - NEW_TENSOR(data); - NEW_TENSOR(week); - NEW_TENSOR(minute); - NEW_TENSOR(hidden_init); - - // Prepare data for AnalysisPredictor - DataRecord data(FLAGS_infer_data, FLAGS_batch_size); - PrepareZeroCopyInputs(data_lod_attention_tensor.get(), cell_init_tensor.get(), - data_tensor.get(), hidden_init_tensor.get(), - week_tensor.get(), minute_tensor.get(), &data, - FLAGS_batch_size); - - // Prepare data for NativePredictor - std::vector> native_inputs; - SetInput(&native_inputs); - std::vector native_outputs; - std::vector analysis_outputs; - - auto output_tensor = predictor->GetOutputTensor("final_output.tmp_1"); - // Run analysis predictor - - int num_ops; - auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); - ASSERT_TRUE(fuse_statis.count("fc_fuse")); - ASSERT_EQ(fuse_statis.at("fc_fuse"), 1); - ASSERT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM - ASSERT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1); - ASSERT_EQ(num_ops, - 13); // After graph optimization, only 13 operators exists. - - Timer timer; - double total_time{0}; - for (int i = 0; i < FLAGS_repeat; i++) { - timer.tic(); - predictor->ZeroCopyRun(); - total_time += timer.toc(); - } - LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor); - - ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs)); - LOG(INFO) << "native output " << DescribeTensor(native_outputs.front()); - - int output_size{0}; // this is the number of elements not memory size - auto *zero_copy_data = output_tensor->data(&place, &output_size); - auto *native_data = static_cast(native_outputs.front().data.data()); - for (int i = 0; i < output_size; i++) { - EXPECT_NEAR(zero_copy_data[i], native_data[i], 1e-3); - } -} - -TEST(Analyzer_rnn1, ZeroCopyMultiThread) { - AnalysisConfig config; - SetConfig(&config); - config.SwitchUseFeedFetchOps(false); - -#define NEW_TENSOR(name__) \ - auto name__##_tensor = predictor->GetInputTensor(#name__); - - std::vector> predictors; - predictors.emplace_back(CreatePaddlePredictor(config)); - for (int tid = 1; tid < FLAGS_num_threads; tid++) { - predictors.emplace_back(predictors.front()->Clone()); - } - double total_time_of_threads{0}; - std::vector threads; - - for (int tid = 0; tid < FLAGS_num_threads; tid++) { - threads.emplace_back([&, tid] { - auto &predictor = predictors[tid]; - NEW_TENSOR(data_lod_attention); - NEW_TENSOR(cell_init); - NEW_TENSOR(data); - NEW_TENSOR(week); - NEW_TENSOR(minute); - NEW_TENSOR(hidden_init); - - // Prepare data for AnalysisPredictor - DataRecord data(FLAGS_infer_data, FLAGS_batch_size); - Timer timer; - double total_time{0}; - - for (int i = 0; i < FLAGS_repeat; i++) { - PrepareZeroCopyInputs(data_lod_attention_tensor.get(), - cell_init_tensor.get(), data_tensor.get(), - hidden_init_tensor.get(), week_tensor.get(), - minute_tensor.get(), &data, FLAGS_batch_size); - - timer.tic(); - predictor->ZeroCopyRun(); - total_time += timer.toc(); - } - - total_time_of_threads += total_time; - - LOG(INFO) << "thread time: " << total_time / FLAGS_repeat; - }); - } - - for (auto &t : threads) { - t.join(); - } +// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy +TEST(Analyzer_rnn1, compare_zero_copy) { + AnalysisConfig cfg; + SetConfig(&cfg); - LOG(INFO) << "average time: " - << total_time_of_threads / 
FLAGS_num_threads / FLAGS_repeat; + std::vector> input_slots_all; + SetInput(&input_slots_all); + std::vector outputs_name; + outputs_name.emplace_back("final_output.tmp_1"); + CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), + input_slots_all, outputs_name); } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index cca2ab1ee1..19fa5528da 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -144,6 +144,9 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) { cfg->SwitchSpecifyInputNames(); cfg->SwitchIrDebug(); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); + if (FLAGS_zero_copy) { + cfg->SwitchUseFeedFetchOps(false); + } if (use_mkldnn) { cfg->EnableMKLDNN(); } @@ -184,10 +187,10 @@ TEST(Analyzer_seq_pool1, compare_determine) { input_slots_all); } -void analysis_fuse_statis(bool use_zerocopy) { +// Check the fuse status +TEST(Analyzer_seq_pool1, fuse_statis) { AnalysisConfig cfg; SetConfig(&cfg); - cfg.SwitchUseFeedFetchOps(!use_zerocopy); int num_ops; auto predictor = CreatePaddlePredictor(cfg); auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); @@ -203,137 +206,17 @@ void analysis_fuse_statis(bool use_zerocopy) { EXPECT_EQ(num_ops, 171); } -// Check the fuse status -TEST(Analyzer_seq_pool1, fuse_statis) { analysis_fuse_statis(false); } - -void PrepareZeroCopyInputs( - const std::unique_ptr &predictor, - std::vector> *inputs) { - DataRecord data(FLAGS_infer_data, FLAGS_batch_size); - // only feed one batch - const auto &one_batch = data.NextBatch(); - inputs->clear(); - for (size_t i = 0; i < one_batch.size(); ++i) { - auto &slot = one_batch[i]; - auto tensor = predictor->GetInputTensor(slot.name + "_embed"); - tensor->Reshape(slot.shape); - tensor->SetLoD({slot.lod}); - ZeroCopyTensorAssignData(tensor.get(), slot.data); - inputs->emplace_back(std::move(tensor)); - } -} - -// return the output values -std::vector zerocopy_profile(int repeat_times) { - AnalysisConfig config; - SetConfig(&config); - config.SwitchUseFeedFetchOps(false); - auto predictor = CreatePaddlePredictor(config); - std::vector> inputs; - PrepareZeroCopyInputs(predictor, &inputs); - auto output_tensor = predictor->GetOutputTensor(out_var_name); - Timer timer; - LOG(INFO) << "Warm up run..."; - timer.tic(); - predictor->ZeroCopyRun(); - PrintTime(FLAGS_batch_size, 1, 1, 0, timer.toc(), 1); - if (FLAGS_profile) { - paddle::platform::ResetProfiler(); - } - LOG(INFO) << "Run " << repeat_times << " times..."; - timer.tic(); - for (int i = 0; i < repeat_times; i++) { - predictor->ZeroCopyRun(); - } - PrintTime(FLAGS_batch_size, repeat_times, 1, 0, timer.toc() / repeat_times, - 1); - - LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor); - PaddlePlace place; - int output_size{0}; - auto *pdata = output_tensor->data(&place, &output_size); - std::vector res(output_size); - for (int i = 0; i < output_size; ++i) { - res[i] = pdata[i]; - } - return res; -} - -TEST(Analyzer_seq_pool1, zerocopy_profile) { zerocopy_profile(FLAGS_repeat); } - -TEST(Analyzer_seq_pool1, zerocopy_profile_threads) { - AnalysisConfig config; - SetConfig(&config); - config.SwitchUseFeedFetchOps(false); - - std::vector> predictors; - predictors.emplace_back(CreatePaddlePredictor(config)); - for (int tid = 1; tid < FLAGS_num_threads; tid++) { - predictors.emplace_back(predictors.front()->Clone()); 
- } - double total_time_of_threads{0}; - std::vector threads; - - for (int tid = 0; tid < FLAGS_num_threads; tid++) { - threads.emplace_back([&, tid] { - auto &predictor = predictors[tid]; - std::vector> inputs; - PrepareZeroCopyInputs(predictor, &inputs); - auto output_tensor = predictor->GetOutputTensor(out_var_name); - Timer timer; - double total_time{0}; - - LOG(INFO) << "Warm up run..."; - timer.tic(); - predictor->ZeroCopyRun(); - PrintTime(FLAGS_batch_size, 1, FLAGS_num_threads, tid, timer.toc(), 1); - if (FLAGS_profile) { - paddle::platform::ResetProfiler(); - } - int repeat_times = FLAGS_repeat; - LOG(INFO) << "Run " << repeat_times << " times..."; - timer.tic(); - - for (int i = 0; i < repeat_times; i++) { - predictor->ZeroCopyRun(); - } - total_time += timer.toc(); - total_time_of_threads += total_time; - - LOG(INFO) << "thread time: " << total_time / repeat_times; - }); - } - - for (auto &t : threads) { - t.join(); - } - - LOG(INFO) << "average time: " - << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat; -} - -TEST(Analyzer_seq_pool1, zerocopy_fuse_statis) { analysis_fuse_statis(true); } +// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy +TEST(Analyzer_seq_pool1, compare_zero_copy) { + AnalysisConfig cfg; + SetConfig(&cfg); -TEST(Analyzer_seq_pool1, zerocopy_compare_native) { - AnalysisConfig config; - SetConfig(&config); - config.SwitchUseFeedFetchOps(true); - auto predictor = CreatePaddlePredictor(config.ToNativeConfig()); - std::vector native_outputs; std::vector> input_slots_all; SetInput(&input_slots_all); - ASSERT_TRUE(predictor->Run(input_slots_all[0], &native_outputs)); - EXPECT_EQ(native_outputs.size(), 1UL); - - auto zerocopy_output = zerocopy_profile(1); - EXPECT_EQ(zerocopy_output.size() * sizeof(float), - native_outputs.front().data.length()); - auto *native_data = static_cast(native_outputs.front().data.data()); - for (size_t i = 0; i < zerocopy_output.size(); ++i) { - EXPECT_LT( - std::fabs((zerocopy_output[i] - native_data[i]) / zerocopy_output[i]), - 1e-3); - } + std::vector outputs_name; + outputs_name.emplace_back(out_var_name); + CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), + input_slots_all, outputs_name); } } // namespace analysis diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 9a843e8d02..c32e6e3857 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -432,6 +432,7 @@ void CompareAnalysisAndZeroCopy( ZeroCopyTensor zerocopy_output = *predictor->GetOutputTensor(outputs_name[i]).get(); zerocopy_outputs.emplace_back(zerocopy_output); + LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(zerocopy_output); } // compare CompareResult(analysis_outputs, zerocopy_outputs); From 24fbe6d6109b5db9bfa4d4d2445d0b4b89746091 Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Mon, 11 Mar 2019 04:05:25 +0000 Subject: [PATCH 28/64] test=develop, replace sce --- python/paddle/fluid/layers/nn.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9d1d5fe093..d0bff52e43 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -10704,8 +10704,9 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002): similarity_matrix = matmul( anchor, positive, transpose_x=False, transpose_y=True) - softmax_value = softmax(similarity_matrix) - cross_entropy = -1 * reduce_sum(labels * 
log(softmax_value), 0) + softmax_ce = softmax_with_cross_entropy( + logits=similarity_matrix, label=labels, soft_label=True) + cross_entropy = reduce_sum(labels * softmax_ce, 0) celoss = reduce_mean(cross_entropy) return l2loss + celoss From a38db3cb996885162971bb650f4f0bcc63e745d4 Mon Sep 17 00:00:00 2001 From: wopeizl Date: Mon, 11 Mar 2019 13:59:38 +0800 Subject: [PATCH 29/64] Fixrecordio (#16124) * fix recordio on win test=develop * test=develop * test=develop * fix code style test=develop * test=develop --- paddle/fluid/pybind/recordio.cc | 2 +- paddle/fluid/recordio/scanner.cc | 4 +++- python/paddle/fluid/tests/unittests/test_accuracy_op.py | 4 ++-- python/paddle/fluid/tests/unittests/test_random_crop_op.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/pybind/recordio.cc b/paddle/fluid/pybind/recordio.cc index f83b026d4d..32caf4bed9 100644 --- a/paddle/fluid/pybind/recordio.cc +++ b/paddle/fluid/pybind/recordio.cc @@ -31,7 +31,7 @@ class RecordIOWriter { RecordIOWriter(const std::string& filename, recordio::Compressor compressor, size_t max_num_record) : closed_(false), - stream_(filename), + stream_(filename, std::ios::binary), writer_(&stream_, compressor, max_num_record) {} void AppendTensor(const framework::LoDTensor& tensor) { diff --git a/paddle/fluid/recordio/scanner.cc b/paddle/fluid/recordio/scanner.cc index a0a2f98422..b06c274ada 100644 --- a/paddle/fluid/recordio/scanner.cc +++ b/paddle/fluid/recordio/scanner.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/recordio/scanner.h" #include +#include #include "paddle/fluid/platform/enforce.h" @@ -27,7 +28,8 @@ Scanner::Scanner(std::unique_ptr &&stream) } Scanner::Scanner(const std::string &filename) - : stream_(new std::ifstream(filename)), parser_(*stream_) { + : stream_(new std::ifstream(filename, std::ios::in | std::ios::binary)), + parser_(*stream_) { PADDLE_ENFORCE(static_cast(*stream_), "Cannot open file %s", filename); Reset(); } diff --git a/python/paddle/fluid/tests/unittests/test_accuracy_op.py b/python/paddle/fluid/tests/unittests/test_accuracy_op.py index 5257b0be6f..b57aaeb52a 100644 --- a/python/paddle/fluid/tests/unittests/test_accuracy_op.py +++ b/python/paddle/fluid/tests/unittests/test_accuracy_op.py @@ -26,8 +26,8 @@ class TestAccuracyOp(OpTest): self.init_dtype() n = 8192 infer = np.random.random((n, 1)).astype(self.dtype) - indices = np.random.randint(0, 2, (n, 1)) - label = np.random.randint(0, 2, (n, 1)) + indices = np.random.randint(0, 2, (n, 1)).astype('int64') + label = np.random.randint(0, 2, (n, 1)).astype('int64') self.inputs = {'Out': infer, 'Indices': indices, "Label": label} num_correct = 0 for rowid in range(n): diff --git a/python/paddle/fluid/tests/unittests/test_random_crop_op.py b/python/paddle/fluid/tests/unittests/test_random_crop_op.py index f29dddff7a..db65b9e3e9 100644 --- a/python/paddle/fluid/tests/unittests/test_random_crop_op.py +++ b/python/paddle/fluid/tests/unittests/test_random_crop_op.py @@ -31,7 +31,7 @@ class TestRandomCropOp(OpTest): np.array([[6, 7, 8], [10, 11, 12]]).astype(np.int32) ] self.op_type = "random_crop" - self.inputs = {'X': to_crop, 'Seed': np.array([10])} + self.inputs = {'X': to_crop, 'Seed': np.array([10]).astype('int64')} self.outputs = {'Out': np.array([]), 'SeedOut': np.array([])} self.attrs = {'shape': [2, 3]} From cfc83c14452a4776c2d8c4fe5c04c6e1ef05e945 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 11 Mar 2019 06:49:30 +0000 Subject: [PATCH 30/64] refine jitcodekey and enhance unit tests test=develop --- 
paddle/fluid/operators/jit/gen/act.cc | 1 + paddle/fluid/operators/jit/gen/blas.cc | 1 + paddle/fluid/operators/jit/gen/embseqpool.cc | 1 + paddle/fluid/operators/jit/gen/gru.cc | 1 + paddle/fluid/operators/jit/gen/hopv.cc | 1 + paddle/fluid/operators/jit/gen/lstm.cc | 1 + paddle/fluid/operators/jit/gen/matmul.cc | 2 +- paddle/fluid/operators/jit/gen/seqpool.cc | 1 + paddle/fluid/operators/jit/gen/sgd.cc | 1 + paddle/fluid/operators/jit/helper.h | 2 +- paddle/fluid/operators/jit/kernel_key.cc | 61 ++--- paddle/fluid/operators/jit/kernel_key.h | 2 +- paddle/fluid/operators/jit/kernel_pool.h | 6 +- paddle/fluid/operators/jit/registry.h | 1 + paddle/fluid/operators/jit/test.cc | 220 +++++++++++++++---- 15 files changed, 211 insertions(+), 91 deletions(-) diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc index 5cac219f95..ad68e792c7 100644 --- a/paddle/fluid/operators/jit/gen/act.cc +++ b/paddle/fluid/operators/jit/gen/act.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/gen/act.h" +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/gen/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc index e764a7983d..c126b9077a 100644 --- a/paddle/fluid/operators/jit/gen/blas.cc +++ b/paddle/fluid/operators/jit/gen/blas.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/gen/blas.h" +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/gen/embseqpool.cc b/paddle/fluid/operators/jit/gen/embseqpool.cc index 6e8ecc07e7..331a4b0d07 100644 --- a/paddle/fluid/operators/jit/gen/embseqpool.cc +++ b/paddle/fluid/operators/jit/gen/embseqpool.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/jit/gen/embseqpool.h" #include // offsetof +#include #include #include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones #include "paddle/fluid/operators/jit/registry.h" diff --git a/paddle/fluid/operators/jit/gen/gru.cc b/paddle/fluid/operators/jit/gen/gru.cc index 4bc9247f6f..b5b0cffa80 100644 --- a/paddle/fluid/operators/jit/gen/gru.cc +++ b/paddle/fluid/operators/jit/gen/gru.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/jit/gen/gru.h" #include // offsetof +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/gen/hopv.cc b/paddle/fluid/operators/jit/gen/hopv.cc index 3383f17df8..462ac68a93 100644 --- a/paddle/fluid/operators/jit/gen/hopv.cc +++ b/paddle/fluid/operators/jit/gen/hopv.cc @@ -13,6 +13,7 @@ * limitations under the License. 
*/ #include "paddle/fluid/operators/jit/gen/hopv.h" +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/gen/lstm.cc b/paddle/fluid/operators/jit/gen/lstm.cc index 5e7789aede..2c3bc985e9 100644 --- a/paddle/fluid/operators/jit/gen/lstm.cc +++ b/paddle/fluid/operators/jit/gen/lstm.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/jit/gen/lstm.h" #include // offsetof +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc index ca50f26ce5..d9955c8cc6 100644 --- a/paddle/fluid/operators/jit/gen/matmul.cc +++ b/paddle/fluid/operators/jit/gen/matmul.cc @@ -14,8 +14,8 @@ #include "paddle/fluid/operators/jit/gen/matmul.h" #include // offsetof +#include #include - #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc index ceca104cc9..d9e5904add 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.cc +++ b/paddle/fluid/operators/jit/gen/seqpool.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/gen/seqpool.h" +#include #include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/gen/sgd.cc b/paddle/fluid/operators/jit/gen/sgd.cc index a40da9b993..e65d3500b4 100644 --- a/paddle/fluid/operators/jit/gen/sgd.cc +++ b/paddle/fluid/operators/jit/gen/sgd.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/jit/gen/sgd.h" #include // offsetof +#include #include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index d98eada81c..1ac5318d46 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -36,7 +36,7 @@ inline typename std::enable_if< const Kernel*>::type GetJitCode(const typename KernelTuple::attr_type& attr) { using Attr = typename KernelTuple::attr_type; - size_t key = JitCodeKey(attr); + int64_t key = JitCodeKey(attr); auto& codes = JitCodePool::Instance(); if (codes.Has(key)) { return codes.AllKernels().at(key).get(); diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc index 6987c893de..1ad220b397 100644 --- a/paddle/fluid/operators/jit/kernel_key.cc +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -13,7 +13,7 @@ * limitations under the License. 
*/ #include "paddle/fluid/operators/jit/kernel_key.h" -#include +#include // XXH64: 13.8 GB/s #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -21,73 +21,46 @@ namespace operators { namespace jit { template <> -size_t JitCodeKey(const int& d) { +int64_t JitCodeKey(const int& d) { return d; } template <> -size_t JitCodeKey(const int64_t& d) { +int64_t JitCodeKey(const int64_t& d) { return d; } -// TODO(TJ): refine and benchmark JitCodeKey generatation -constexpr int act_type_shift = 3; // suppot 2^3 act types -static inline int act_type_convert(KernelType type) { - if (type == kVIdentity) { - return 0; - } else if (type == kVExp) { - return 1; - } else if (type == kVRelu) { - return 2; - } else if (type == kVSigmoid) { - return 3; - } else if (type == kVTanh) { - return 4; - } - PADDLE_THROW("Unsupported act type %d", type); - return 0; -} - template <> -size_t JitCodeKey(const lstm_attr_t& attr) { - // XXH64: 13.8 GB/s - - size_t key = attr.d; - int gate_key = act_type_convert(attr.act_gate) << 1; - int cand_key = act_type_convert(attr.act_cand) << (1 + act_type_shift); - int cell_key = act_type_convert(attr.act_cell) << (1 + act_type_shift * 2); - return (key << (1 + act_type_shift * 3)) + gate_key + cand_key + cell_key + - attr.use_peephole; +int64_t JitCodeKey(const gru_attr_t& attr) { + return XXH64(&attr, sizeof(gru_attr_t), 0); } template <> -size_t JitCodeKey(const gru_attr_t& attr) { - size_t key = attr.d; - return (key << (act_type_shift * 2)) + act_type_convert(attr.act_gate) + - (act_type_convert(attr.act_cand) << act_type_shift); +int64_t JitCodeKey(const lstm_attr_t& attr) { + int keys[5] = { + attr.d, static_cast(attr.act_gate), static_cast(attr.act_cand), + static_cast(attr.act_cell), static_cast(attr.use_peephole)}; + return XXH64(keys, sizeof(int) * 5, 0); } template <> -size_t JitCodeKey(const seq_pool_attr_t& attr) { - size_t key = attr.w; - constexpr int pool_type_shift = 3; - return (key << pool_type_shift) + static_cast(attr.type); +int64_t JitCodeKey(const seq_pool_attr_t& attr) { + int keys[2] = {attr.w, static_cast(attr.type)}; + return XXH64(keys, sizeof(int) * 2, 0); } template <> -size_t JitCodeKey(const matmul_attr_t& attr) { - size_t key = attr.m; - constexpr int shift = 21; - return (key << shift * 2) + ((static_cast(attr.n)) << shift) + attr.k; +int64_t JitCodeKey(const matmul_attr_t& attr) { + return XXH64(&attr, sizeof(int) * 3, 0); // m, n, k } template <> -size_t JitCodeKey(const emb_seq_pool_attr_t& attr) { +int64_t JitCodeKey(const emb_seq_pool_attr_t& attr) { return attr.table_width; } template <> -size_t JitCodeKey(const sgd_attr_t& attr) { +int64_t JitCodeKey(const sgd_attr_t& attr) { return attr.grad_width; } diff --git a/paddle/fluid/operators/jit/kernel_key.h b/paddle/fluid/operators/jit/kernel_key.h index 611a0210d6..b2cf92f23e 100644 --- a/paddle/fluid/operators/jit/kernel_key.h +++ b/paddle/fluid/operators/jit/kernel_key.h @@ -46,7 +46,7 @@ struct KernelKey { // Every JitCode should have a method to get the key from attribution template -size_t JitCodeKey(const Attr& attr); +int64_t JitCodeKey(const Attr& attr); } // namespace jit } // namespace operators diff --git a/paddle/fluid/operators/jit/kernel_pool.h b/paddle/fluid/operators/jit/kernel_pool.h index 3e15242af2..ec5c2be55b 100644 --- a/paddle/fluid/operators/jit/kernel_pool.h +++ b/paddle/fluid/operators/jit/kernel_pool.h @@ -30,7 +30,7 @@ namespace jit { template class JitCodePool { typedef std::unique_ptr GenBasePtr; - typedef std::unordered_map JitCodeMap; + typedef 
std::unordered_map JitCodeMap; public: JitCodePool() = default; @@ -41,9 +41,9 @@ class JitCodePool { const JitCodeMap& AllKernels() { return codes_; } - bool Has(size_t key) const { return codes_.find(key) != codes_.end(); } + bool Has(int64_t key) const { return codes_.find(key) != codes_.end(); } - void Insert(size_t key, GenBasePtr value) { + void Insert(int64_t key, GenBasePtr value) { codes_.emplace(key, std::move(value)); } diff --git a/paddle/fluid/operators/jit/registry.h b/paddle/fluid/operators/jit/registry.h index c8da92c0c5..567a903236 100644 --- a/paddle/fluid/operators/jit/registry.h +++ b/paddle/fluid/operators/jit/registry.h @@ -17,6 +17,7 @@ #include #include #include +#include // for std::move #include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/operators/jit/kernel_pool.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 898133a03b..068b0ba7ae 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -886,7 +886,11 @@ void TestKernelVBroadcast() { // test pool TEST(JITKernel_pool, jitcreator) { const auto& jitcreators = jit::JitCodeCreatorPool::Instance().AllCreators(); +#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) + EXPECT_EQ(jitcreators.size(), 0UL); +#else EXPECT_EQ(jitcreators.size(), 25UL); +#endif } TEST(JITKernel_pool, jitpool) { @@ -894,13 +898,25 @@ TEST(JITKernel_pool, jitpool) { const auto& kers = jit::JitCodePool().Instance().AllKernels(); EXPECT_EQ(kers.size(), 0UL); jit::GetAllCandidateKernels, CPUPlace>(3); - // after call GetAllCandidateKernels, it will create jitcode Automatically +// after call GetAllCandidateKernels, it will create jitcode Automatically +#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) + EXPECT_EQ(kers.size(), 0UL); +#else EXPECT_EQ(kers.size(), 1UL); +#endif } TEST(JITKernel_pool, more) { const auto& kers = jit::KernelPool::Instance().AllKernels(); +#if defined(__APPLE__) || defined(__OSX__) + EXPECT_EQ(kers.size(), 10UL); +#else +#ifdef PADDLE_WITH_MKLML EXPECT_EQ(kers.size(), 21UL); +#else + EXPECT_EQ(kers.size(), 8UL); +#endif +#endif } TEST(JITKernel_pool, refer) { @@ -915,7 +931,11 @@ TEST(JITKernel_helper, GetAllCandidateKernels) { #if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) EXPECT_GE(fp_kers.size(), 1UL); // refer #else +#ifdef PADDLE_WITH_MKLML EXPECT_GE(fp_kers.size(), 3UL); // jitcode, mkl, refer +#else + EXPECT_GE(fp_kers.size(), 2UL); // jitcode, refer +#endif #endif auto db_kers = @@ -923,18 +943,48 @@ TEST(JITKernel_helper, GetAllCandidateKernels) { #if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) EXPECT_GE(db_kers.size(), 1UL); // refer #else +#ifdef PADDLE_WITH_MKLML EXPECT_GE(db_kers.size(), 2UL); // mkl, refer +#else + EXPECT_GE(db_kers.size(), 1UL); // refer +#endif #endif } TEST(JITKernel_helper, GetAllCandidateFuncsWithTypes) { auto fp_kers = jit::GetAllCandidateFuncsWithTypes, CPUPlace>(10); +#if defined(__APPLE__) || defined(__OSX__) + EXPECT_GE(fp_kers.size(), 1UL); // refer +#else +#if !defined(PADDLE_WITH_MKLML) || defined(_WIN32) + EXPECT_GE(fp_kers.size(), 2UL); // jitcode/mkl, refer +#else EXPECT_GE(fp_kers.size(), 3UL); // jitcode, mkl, refer +#endif +#endif auto db_kers = jit::GetAllCandidateFuncsWithTypes, CPUPlace>(10); +#if defined(__APPLE__) || defined(__OSX__) || !defined(PADDLE_WITH_MKLML) + EXPECT_GE(db_kers.size(), 1UL); // refer +#else EXPECT_GE(db_kers.size(), 2UL); // mkl, refer +#endif +} + 
+TEST(JITKernel_helper, KernelFuncs) { + auto f1 = jit::KernelFuncs, CPUPlace>::Cache().At(3); + auto f2 = jit::KernelFuncs, CPUPlace>::Cache()[3]; + EXPECT_TRUE(f1 != nullptr); + EXPECT_TRUE(f1 == f2); + + auto f3 = jit::KernelFuncs, CPUPlace>::Cache()[5]; +#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) + EXPECT_TRUE(f2 == f3); +#else + EXPECT_TRUE(f2 != f3); +#endif } TEST(JITKernel_helper, GetAllCandidateFuncs) { @@ -1011,6 +1061,134 @@ TEST(JITKernel_helper, attr) { EXPECT_EQ(out.str().size(), 14); } +// test keys +TEST(JITKernel_key, int) { + EXPECT_TRUE(jit::JitCodeKey(2) == jit::JitCodeKey(2)); + EXPECT_TRUE(jit::JitCodeKey(2) == jit::JitCodeKey(2)); + EXPECT_TRUE(jit::JitCodeKey(2) != jit::JitCodeKey(3)); +} + +TEST(JITKernel_key, gru) { + jit::gru_attr_t attr1(8, jit::kVSigmoid, jit::kVTanh); + jit::gru_attr_t attr2(8, jit::kVSigmoid, jit::kVTanh); + jit::gru_attr_t attr3(9, jit::kVSigmoid, jit::kVTanh); + jit::gru_attr_t attr4(9, jit::kVSigmoid, jit::kVIdentity); + jit::gru_attr_t attr5(9, jit::kVTanh, jit::kVIdentity); + + auto key1 = jit::JitCodeKey(attr1); + auto key2 = jit::JitCodeKey(attr2); + auto key3 = jit::JitCodeKey(attr3); + auto key4 = jit::JitCodeKey(attr4); + auto key5 = jit::JitCodeKey(attr5); + + EXPECT_TRUE(key1 == key2); + EXPECT_TRUE(key2 != key3); + EXPECT_TRUE(key2 != key4); + EXPECT_TRUE(key2 != key5); + EXPECT_TRUE(key3 != key4); + EXPECT_TRUE(key3 != key5); + EXPECT_TRUE(key4 != key5); +} + +TEST(JITKernel_key, lstm) { + jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); + jit::lstm_attr_t attr2(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); + jit::lstm_attr_t attr3(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); + jit::lstm_attr_t attr4(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh); + jit::lstm_attr_t attr5(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true); + jit::lstm_attr_t attr6(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true); + + auto key1 = jit::JitCodeKey(attr1); + auto key2 = jit::JitCodeKey(attr2); + auto key3 = jit::JitCodeKey(attr3); + auto key4 = jit::JitCodeKey(attr4); + auto key5 = jit::JitCodeKey(attr5); + auto key6 = jit::JitCodeKey(attr6); + + EXPECT_TRUE(key1 == key2); + EXPECT_TRUE(key2 != key3); + EXPECT_TRUE(key2 != key4); + EXPECT_TRUE(key2 != key5); + EXPECT_TRUE(key3 != key4); + EXPECT_TRUE(key3 != key5); + EXPECT_TRUE(key4 != key5); + EXPECT_TRUE(key5 == key6); +} + +TEST(JITKernel_key, seq_pool) { + jit::seq_pool_attr_t attr1(2, jit::SeqPoolType::kSum, 1); + jit::seq_pool_attr_t attr2(2, jit::SeqPoolType::kSum, 3); + jit::seq_pool_attr_t attr3(3, jit::SeqPoolType::kSum, 3); + jit::seq_pool_attr_t attr4(3, jit::SeqPoolType::kAvg, 3); + + auto key1 = jit::JitCodeKey(attr1); + auto key2 = jit::JitCodeKey(attr2); + auto key3 = jit::JitCodeKey(attr3); + auto key4 = jit::JitCodeKey(attr4); + + EXPECT_TRUE(key1 == key2); + EXPECT_TRUE(key2 != key3); + EXPECT_TRUE(key2 != key4); + EXPECT_TRUE(key3 != key4); +} + +TEST(JITKernel_key, matmul) { + jit::matmul_attr_t attr1(1, 2, 3); + jit::matmul_attr_t attr2(1, 2, 3); + jit::matmul_attr_t attr3(1, 3, 3); + jit::matmul_attr_t attr4(2, 3, 4); + + auto key1 = jit::JitCodeKey(attr1); + auto key2 = jit::JitCodeKey(attr2); + auto key3 = jit::JitCodeKey(attr3); + auto key4 = jit::JitCodeKey(attr4); + + EXPECT_TRUE(key1 == key2); + EXPECT_TRUE(key2 != key3); + EXPECT_TRUE(key2 != key4); + EXPECT_TRUE(key3 != key4); +} + +TEST(JITKernel_key, emb_seq_pool) { + jit::emb_seq_pool_attr_t attr1(1, 2, 3, 4, 5, jit::SeqPoolType::kSum); + jit::emb_seq_pool_attr_t 
attr2(1, 2, 3, 4, 5, jit::SeqPoolType::kSum); + jit::emb_seq_pool_attr_t attr3(10, 2, 9, 8, 7, jit::SeqPoolType::kAvg); + jit::emb_seq_pool_attr_t attr4(10, 3, 9, 8, 7, jit::SeqPoolType::kSum); + jit::emb_seq_pool_attr_t attr5(1, 6, 3, 4, 5, jit::SeqPoolType::kSum); + + auto key1 = jit::JitCodeKey(attr1); + auto key2 = jit::JitCodeKey(attr2); + auto key3 = jit::JitCodeKey(attr3); + auto key4 = jit::JitCodeKey(attr4); + auto key5 = jit::JitCodeKey(attr5); + + EXPECT_TRUE(key1 == key2); + EXPECT_TRUE(key2 == key3); + EXPECT_TRUE(key2 != key4); + EXPECT_TRUE(key2 != key5); + EXPECT_TRUE(key4 != key5); +} + +TEST(JITKernel_key, sgd) { + jit::sgd_attr_t attr1(1, 2, 3, 4, 5); + jit::sgd_attr_t attr2(1, 2, 3, 4, 5); + jit::sgd_attr_t attr3(9, 8, 7, 4, 6); + jit::sgd_attr_t attr4(1, 2, 3, 6, 5); + jit::sgd_attr_t attr5(10, 9, 8, 7, 6); + + auto key1 = jit::JitCodeKey(attr1); + auto key2 = jit::JitCodeKey(attr2); + auto key3 = jit::JitCodeKey(attr3); + auto key4 = jit::JitCodeKey(attr4); + auto key5 = jit::JitCodeKey(attr5); + + EXPECT_TRUE(key1 == key2); + EXPECT_TRUE(key2 == key3); + EXPECT_TRUE(key3 != key4); + EXPECT_TRUE(key3 != key5); + EXPECT_TRUE(key4 != key5); +} + // test kernerls #define TestKernelVMul TestKernelXYZN #define TestKernelVAdd TestKernelXYZN @@ -1080,43 +1258,3 @@ TEST_CPU_KERNEL(MatMul); TEST_CPU_KERNEL(Softmax); TEST_CPU_KERNEL(Sgd); TEST_CPU_KERNEL(VBroadcast); - -TEST(JITKernel, kernel_func) { - auto f1 = jit::KernelFuncs, CPUPlace>::Cache().At(3); - auto f2 = jit::KernelFuncs, CPUPlace>::Cache()[3]; - EXPECT_TRUE(f1 != nullptr); - EXPECT_TRUE(f1 == f2); - // TODO(TJ): check not equal -} - -TEST(JITKernel_key, lstm) { - jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); - jit::lstm_attr_t attr2(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); - jit::lstm_attr_t attr3(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); - jit::lstm_attr_t attr4(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh); - - auto key1 = jit::JitCodeKey(attr1); - auto key2 = jit::JitCodeKey(attr2); - auto key3 = jit::JitCodeKey(attr3); - auto key4 = jit::JitCodeKey(attr4); - - EXPECT_TRUE(key1 != key2); - EXPECT_TRUE(key2 == key3); - EXPECT_TRUE(key3 != key4); -} - -TEST(JITKernel_key, gru) { - jit::gru_attr_t attr1(8, jit::kVSigmoid, jit::kVTanh); - jit::gru_attr_t attr2(9, jit::kVSigmoid, jit::kVTanh); - jit::gru_attr_t attr3(9, jit::kVSigmoid, jit::kVTanh); - jit::gru_attr_t attr4(9, jit::kVSigmoid, jit::kVIdentity); - - auto key1 = jit::JitCodeKey(attr1); - auto key2 = jit::JitCodeKey(attr2); - auto key3 = jit::JitCodeKey(attr3); - auto key4 = jit::JitCodeKey(attr4); - - EXPECT_TRUE(key1 != key2); - EXPECT_TRUE(key2 == key3); - EXPECT_TRUE(key3 != key4); -} From 1283833395645c8d52d7b603c2e8bc3092d4ef12 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Mon, 11 Mar 2019 15:18:32 +0800 Subject: [PATCH 31/64] zero_copy tensor support INT32 test=develop --- .../fluid/inference/api/details/zero_copy_tensor.cc | 5 +++++ paddle/fluid/inference/tests/api/tester_helper.h | 11 +++++++++++ 2 files changed, 16 insertions(+) diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index cf02901d96..9a40cf4b60 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -126,15 +126,20 @@ void ZeroCopyTensor::copy_to_cpu(T *data) { } template void ZeroCopyTensor::copy_from_cpu(const float *data); template void ZeroCopyTensor::copy_from_cpu(const int64_t *data); 
+template void ZeroCopyTensor::copy_from_cpu(const int32_t *data); template void ZeroCopyTensor::copy_to_cpu(float *data); template void ZeroCopyTensor::copy_to_cpu(int64_t *data); +template void ZeroCopyTensor::copy_to_cpu(int32_t *data); template float *ZeroCopyTensor::data(PaddlePlace *place, int *size) const; template int64_t *ZeroCopyTensor::data(PaddlePlace *place, int *size) const; +template int32_t *ZeroCopyTensor::data(PaddlePlace *place, + int *size) const; template float *ZeroCopyTensor::mutable_data(PaddlePlace place); template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); +template int32_t *ZeroCopyTensor::mutable_data(PaddlePlace place); void *ZeroCopyTensor::FindTensor() const { PADDLE_ENFORCE(!name_.empty(), diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 915ea772ed..a4881afe58 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -141,6 +141,15 @@ void CompareResult(const std::vector &outputs, } break; } + case PaddleDType::INT32: { + int32_t *pdata = static_cast(out.data.data()); + int32_t *pdata_ref = ref_out.data(&place, &ref_size); + EXPECT_EQ(size, ref_size); + for (size_t j = 0; j < size; ++j) { + EXPECT_EQ(pdata_ref[j], pdata[j]); + } + break; + } } } } @@ -253,6 +262,8 @@ void ConvertPaddleTensorToZeroCopyTensor( ZeroCopyTensorAssignData(tensor.get(), input.data); } else if (input.dtype == PaddleDType::FLOAT32) { ZeroCopyTensorAssignData(tensor.get(), input.data); + } else if (input.dtype == PaddleDType::INT32) { + ZeroCopyTensorAssignData(tensor.get(), input.data); } else { LOG(ERROR) << "unsupported feed type " << input.dtype; } From ad80bde824702518b835fc951be7e98800cc34e3 Mon Sep 17 00:00:00 2001 From: chengduo Date: Mon, 11 Mar 2019 03:20:46 -0500 Subject: [PATCH 32/64] Revert "Revert "Add Event for TensorCopy"" (#16035) * Revert "Revert "Add Event for TensorCopy" (#16022)" This reverts commit e2da3a5b22aec1575687f48beedca2ee98c425e5. 
* use default stream test=develop --- paddle/fluid/framework/CMakeLists.txt | 4 +- paddle/fluid/framework/tensor_util.cc | 5 ++ paddle/fluid/memory/CMakeLists.txt | 2 +- paddle/fluid/memory/memcpy.cc | 20 ++++++ .../fluid/operators/reader/buffered_reader.cc | 19 +++--- paddle/fluid/platform/device_tracer.cc | 63 ++++++++++++++++--- paddle/fluid/platform/device_tracer.h | 13 +++- tools/timeline.py | 2 +- 8 files changed, 107 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 7ddf1ab44f..b9491c953f 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -38,10 +38,10 @@ if(WITH_GPU) nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context) add_dependencies(tensor tensor_util) else() - nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context ) + nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler) endif(WIN32) else() - cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context ) + cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler) endif() cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 89166bfd15..a7f09df491 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -18,6 +18,7 @@ #include #include #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { @@ -137,16 +138,19 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, #ifdef PADDLE_WITH_CUDA else if (platform::is_gpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { + platform::RecordEvent record_event("TensorCopy:GPU->CPU"); auto src_gpu_place = boost::get(src_place); auto dst_cpu_place = boost::get(dst_place); memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); } else if (platform::is_cpu_place(src_place) && platform::is_gpu_place(dst_place)) { + platform::RecordEvent record_event("TensorCopy:CPU->GPU"); auto src_cpu_place = boost::get(src_place); auto dst_gpu_place = boost::get(dst_place); memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr); } else if (platform::is_gpu_place(src_place) && platform::is_gpu_place(dst_place)) { + platform::RecordEvent record_event("TensorCopy:GPU->GPU"); if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) { VLOG(3) << "Skip copy the same data from " << src_place << " to " << dst_place; @@ -157,6 +161,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); } else if (platform::is_cuda_pinned_place(src_place) && platform::is_gpu_place(dst_place)) { + platform::RecordEvent record_event("TensorCopy:CUDAPinned->GPU"); auto src_pinned_place = boost::get(src_place); auto dst_gpu_place = boost::get(dst_place); memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size, diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index e726807764..7eb663ea28 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -1,6 +1,6 @@ add_subdirectory(detail) add_subdirectory(allocation) -cc_library(malloc SRCS malloc.cc 
DEPS place enforce allocator_facade) +cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade profiler) cc_library(memcpy SRCS memcpy.cc DEPS place) cc_library(memory diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 2a6f70a01e..1408163e4b 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/memory/memcpy.h" #include // for memcpy +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace memory { @@ -29,14 +30,23 @@ void Copy(platform::CPUPlace, void* dst, #ifdef PADDLE_WITH_CUDA static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K +// NOTE(zcd): Do not use GpuMemcpySync as much as possible. +// because GpuMemcpySync issues the copying command to the default stream, +// which will make two commands from different streams cannot run concurrently. +// Reference: +// https://devblogs.nvidia.com/gpu-pro-tip-cuda-7-streams-simplify-concurrency/ + template <> void Copy( platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, const void* src, size_t num, cudaStream_t stream) { platform::SetDeviceId(src_place.device); + if (stream) { + platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU"); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); } else { + platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU"); platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); // FIXME(zjl): do we really need it? if (num <= kMaxGpuAsyncCopyBytes) { @@ -51,8 +61,10 @@ void Copy( const void* src, size_t num, cudaStream_t stream) { platform::SetDeviceId(dst_place.device); if (stream) { + platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU"); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); } else { + platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU"); platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); // FIXME(zjl): do we really need it? 
if (num <= kMaxGpuAsyncCopyBytes) { @@ -68,15 +80,19 @@ void Copy( if (dst_place == src_place) { platform::SetDeviceId(src_place.device); if (stream) { + platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU"); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); } else { + platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU"); platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice); } } else { if (stream) { + platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU"); platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, num, stream); } else { + platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU"); platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, num); } @@ -111,8 +127,10 @@ void Copy( cudaStream_t stream) { platform::SetDeviceId(src_place.device); if (stream) { + platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned"); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); } else { + platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned"); platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); } } @@ -124,8 +142,10 @@ void Copy( cudaStream_t stream) { platform::SetDeviceId(dst_place.device); if (stream) { + platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU"); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); } else { + platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU"); platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); } } diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 52e96c4fb3..134807092d 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -17,6 +17,7 @@ #include #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { namespace reader { @@ -50,8 +51,9 @@ BufferedReader::BufferedReader( .Get(place_))) ->stream(); events.resize(buffer_size); - for (auto &event : events) + for (auto &event : events) { PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + } PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); } #endif @@ -84,12 +86,15 @@ void BufferedReader::ReadAsync(size_t i) { #ifdef PADDLE_WITH_CUDA // NOTE(liangdun): using async copy instead of TensorCopySync - // TensorCopySync would block other stream + // TensorCopySync would block other stream, because TensorCopySync + // issues the copying command to the default stream, it will make two + // commands from different streams cannot run concurrently. 
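  // A brief sketch of the stream pattern relied on here (plain CUDA runtime
  // calls, variable names illustrative): issuing the copy on a dedicated
  // non-blocking stream lets it overlap with work on other streams, whereas a
  // copy on the default stream would serialize with them.
  //   cudaStream_t copy_stream;
  //   cudaStreamCreateWithFlags(&copy_stream, cudaStreamNonBlocking);
  //   cudaMemcpyAsync(gpu_ptr, cpu_ptr, size, cudaMemcpyHostToDevice, copy_stream);
  //   cudaStreamSynchronize(copy_stream);  // blocks only the copy stream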
if (platform::is_gpu_place(place_)) { platform::SetDeviceId(boost::get(place_).device); PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0)); TensorVec &gpu = gpu_buffer_[i]; gpu.resize(cpu.size()); + platform::RecordEvent record_event("BufferedReader:MemoryCopy"); for (size_t i = 0; i < cpu.size(); ++i) { gpu[i].Resize(cpu[i].dims()); gpu[i].set_layout(cpu[i].layout()); @@ -98,20 +103,20 @@ void BufferedReader::ReadAsync(size_t i) { auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type()); auto size = cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); - if (platform::is_cuda_pinned_place(cpu_place)) + if (platform::is_cuda_pinned_place(cpu_place)) { memory::Copy(boost::get(place_), gpu_ptr, boost::get(cpu_place), cpu_ptr, size, stream); - else if ((platform::is_gpu_place(cpu_place))) + } else if ((platform::is_gpu_place(cpu_place))) { memory::Copy(boost::get(place_), gpu_ptr, boost::get(cpu_place), cpu_ptr, size, stream); - else - // if cpu place is not pinned, async copy is slower than sync copy, - // so we use sync copy instead. + } else { + // TODO(zcd): The default stream should not be used here. memory::Copy(boost::get(place_), gpu_ptr, boost::get(cpu_place), cpu_ptr, size, 0); + } gpu[i].set_lod(cpu[i].lod()); } PADDLE_ENFORCE(cudaStreamSynchronize(stream)); diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 0179daa557..b084f1a649 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -30,7 +30,6 @@ limitations under the License. */ #include "glog/logging.h" #include "google/protobuf/text_format.h" #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" namespace paddle { @@ -222,19 +221,24 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, } case CUPTI_ACTIVITY_KIND_DRIVER: { auto *api = reinterpret_cast(record); - if (api->start != 0 && api->end != 0) - // -1 device id represents CUDA api call - tracer->AddCPURecords( + if (api->start != 0 && api->end != 0) { + // -1 device id represents ActiveKind api call + tracer->AddActiveKindRecords( DriverKind(api->cbid), api->start, api->end, -1, - GetThreadIdFromSystemThreadId(api->threadId)); + GetThreadIdFromSystemThreadId(api->threadId), + api->correlationId); + } break; } case CUPTI_ACTIVITY_KIND_RUNTIME: { auto *api = reinterpret_cast(record); - if (api->start != 0 && api->end != 0) - tracer->AddCPURecords( + if (api->start != 0 && api->end != 0) { + // -1 device id represents ActiveKind api call + tracer->AddActiveKindRecords( RuntimeKind(api->cbid), api->start, api->end, -1, - GetThreadIdFromSystemThreadId(api->threadId)); + GetThreadIdFromSystemThreadId(api->threadId), + api->correlationId); + } break; } default: { break; } @@ -313,6 +317,25 @@ class DeviceTracerImpl : public DeviceTracer { stream_id, correlation_id, bytes}); } + void AddActiveKindRecords(const std::string &anno, uint64_t start_ns, + uint64_t end_ns, int64_t device_id, + int64_t thread_id, uint32_t correlation_id) { + if (anno.empty()) { + VLOG(1) << "Empty timeline annotation."; + return; + } + thread_local std::forward_list + *local_active_kind_records = nullptr; + if (local_active_kind_records == nullptr) { + std::lock_guard l(trace_mu_); + active_kind_records_.emplace_front(); + local_active_kind_records = &active_kind_records_.front(); + } + // lock is not needed, only one thread call this function. 
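+    // Each thread appends only to its own thread_local list; the mutex above
+    // is taken once per thread, solely to register that list in
+    // active_kind_records_.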
+ local_active_kind_records->push_front(ActiveKindRecord{ + anno, start_ns, end_ns, device_id, thread_id, correlation_id}); + } + void AddKernelRecords(std::string name, uint64_t start, uint64_t end, int64_t device_id, int64_t stream_id, uint32_t correlation_id) { @@ -355,6 +378,7 @@ class DeviceTracerImpl : public DeviceTracer { } const std::vector cbids { CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020, + CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020, @@ -385,6 +409,7 @@ class DeviceTracerImpl : public DeviceTracer { correlations_.clear(); for (auto &tmp : correlations_pairs) tmp.clear(); for (auto &tmp : cpu_records_) tmp.clear(); + for (auto &tmp : active_kind_records_) tmp.clear(); } void GenEventKernelCudaElapsedTime() { @@ -437,7 +462,7 @@ class DeviceTracerImpl : public DeviceTracer { event->set_device_id(r.device_id); } VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find; - for (auto &tmp : cpu_records_) + for (auto &tmp : cpu_records_) { for (const CPURecord &r : tmp) { auto *event = profile_pb.add_events(); event->set_type(proto::Event::CPU); @@ -447,6 +472,24 @@ class DeviceTracerImpl : public DeviceTracer { event->set_sub_device_id(r.thread_id); event->set_device_id(r.device_id); } + } + for (auto &tmp : active_kind_records_) { + for (const ActiveKindRecord &r : tmp) { + auto *event = profile_pb.add_events(); + event->set_type(proto::Event::CPU); + auto c = correlations_.find(r.correlation_id); + if (c != correlations_.end() && c->second != nullptr) { + event->set_name(c->second->name()); + event->set_detail_info(r.name); + } else { + event->set_name(r.name); + } + event->set_start_ns(r.start_ns); + event->set_end_ns(r.end_ns); + event->set_sub_device_id(r.thread_id); + event->set_device_id(r.device_id); + } + } miss = find = 0; for (const MemRecord &r : mem_records_) { auto *event = profile_pb.add_events(); @@ -510,6 +553,7 @@ class DeviceTracerImpl : public DeviceTracer { std::forward_list kernel_records_; std::forward_list mem_records_; std::forward_list> cpu_records_; + std::forward_list> active_kind_records_; std::forward_list>> correlations_pairs; std::unordered_map correlations_; @@ -613,6 +657,7 @@ void initCuptiCbidStr() { REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020); REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020); REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetPCIBusId_v4010); #if CUDA_VERSION >= 9000 REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000); REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000); diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index d4418d836d..a8f1d89383 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -63,7 +63,14 @@ class DeviceTracer { uint32_t correlation_id; uint64_t bytes; }; - + struct ActiveKindRecord { + std::string name; + uint64_t start_ns; + uint64_t end_ns; + int64_t device_id; + int64_t thread_id; + uint32_t correlation_id; + }; virtual ~DeviceTracer() {} // Needs to be called once before use. 
virtual void Enable() = 0; @@ -85,6 +92,10 @@ class DeviceTracer { virtual void AddCPURecords(const std::string& anno, uint64_t start_ns, uint64_t end_ns, int64_t device_id, int64_t thread_id) = 0; + virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns, + uint64_t end_ns, int64_t device_id, + int64_t thread_id, + uint32_t correlation_id) = 0; // Add a cuda kernel stats. `correlation_id` will be mapped to annotation // added before for human readability. diff --git a/tools/timeline.py b/tools/timeline.py index ebadb29bdb..7879666417 100644 --- a/tools/timeline.py +++ b/tools/timeline.py @@ -131,7 +131,7 @@ class Timeline(object): if (k, event.device_id, "CPU") not in self._devices: pid = self._allocate_pid() self._devices[(k, event.device_id, "CPU")] = pid - # -1 device id represents CUDA api call + # -1 device id represents CUDA API(RunTime) call.(e.g. cudaLaunch, cudaMemcpy) if event.device_id == -1: self._chrome_trace.emit_pid("%s:cuda_api" % k, pid) else: From 4e052e0ac9af166869ad38b7649bdd0696cdf72a Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Mon, 11 Mar 2019 17:25:12 +0800 Subject: [PATCH 33/64] Disable inference download for WIN32 temporary. test=develop --- paddle/fluid/inference/tests/test.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index 5ceb630976..f551b322fe 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -42,8 +42,8 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME) endfunction() set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") -if (NOT EXISTS ${WORD2VEC_INSTALL_DIR}) - inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") +if(NOT EXISTS ${WORD2VEC_INSTALL_DIR} AND NOT WIN32) + inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") endif() set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") From d17bb4e6006880ed6331fed7e33f2e5d7624101e Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 11 Mar 2019 18:05:15 +0800 Subject: [PATCH 34/64] Add unit test for gru unit test=develop --- python/paddle/fluid/imperative/nn.py | 39 ++++--- .../paddle/fluid/tests/unittests/op_test.py | 107 +++++++++++++++++- .../fluid/tests/unittests/test_gru_op.py | 2 +- .../fluid/tests/unittests/test_layers.py | 41 +++++++ 4 files changed, 164 insertions(+), 25 deletions(-) diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py index 2d2b70e3f7..6681b42341 100644 --- a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/imperative/nn.py @@ -22,6 +22,7 @@ from . 
import layers from ..framework import Variable, OpProtoHolder from ..param_attr import ParamAttr from ..initializer import Normal, Constant + __all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit'] @@ -548,7 +549,7 @@ class GRUUnit(layers.Layer): """ def __init__(self, - hidden, + name_scope, size, param_attr=None, bias_attr=None, @@ -556,8 +557,8 @@ class GRUUnit(layers.Layer): gate_activation='sigmoid', origin_mode=False, dtype='float32'): + super(GRUUnit, self).__init__(name_scope) - super(GRUUnit, self).__init__() activation_dict = dict( identity=0, sigmoid=1, @@ -566,29 +567,27 @@ class GRUUnit(layers.Layer): activation = activation_dict[activation] gate_activation = activation_dict[gate_activation] - helper = LayerHelper('gru_unit', **locals()) - dtype = helper.input_dtype() + self._dtype = dtype size = size // 3 - # create weight - weight = helper.create_parameter( - attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype) + self._weight = self.create_parameter( + attr=param_attr, shape=[size, 3 * size], dtype=dtype) - gate = helper.create_variable_for_type_inference(dtype) - reset_hidden_pre = helper.create_variable_for_type_inference(dtype) - updated_hidden = helper.create_variable_for_type_inference(dtype) - inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': weight} # create bias - if helper.bias_attr: - bias_size = [1, 3 * size] - bias = helper.create_parameter( - attr=helper.bias_attr, - shape=bias_size, - dtype=dtype, - is_bias=True) - inputs['Bias'] = bias + bias_size = [1, 3 * size] + self._bias = self.create_parameter( + attr=bias_attr, shape=bias_size, dtype=dtype, is_bias=True) - def forward(self, input): + def forward(self, input, hidden): + inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': self._weight} + if self._bias: + inputs['Bias'] = self._bias + + gate = self._helper.create_variable_for_type_inference(self._dtype) + reset_hidden_pre = self._helper.create_variable_for_type_inference( + self._dtype) + updated_hidden = self._helper.create_variable_for_type_inference( + self._dtype) self._helper.append_op( type='gru_unit', inputs=inputs, diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 8234457243..9fa62a692e 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -22,6 +22,7 @@ import six import time import itertools import collections +from collections import defaultdict import paddle.fluid as fluid import paddle.fluid.core as core @@ -257,8 +258,65 @@ class OpTest(unittest.TestCase): outs, _ = self._calc_output(place) return outs - def _calc_output(self, place, parallel=False, no_check_set=None): + def _create_var_from_numpy(self, value): + if isinstance(value, tuple): + data = value[0] + lod = value[1] + v = fluid.imperative.base.to_variable(value=data) + v._ivar.value().get_tensor().set_recursive_sequence_lengths(lod) + return v + else: + return fluid.imperative.base.to_variable(value) + + def _calc_imperative_output(self, place, parallel=False, no_check_set=None): + with fluid.imperative.base.guard(place=place): + block = fluid.default_main_program().global_block() + + # prepare input variable + inputs = defaultdict(list) + for name, np_value in six.iteritems(self.inputs): + if not isinstance(np_value, list): + np_value = [np_value] + + for i in range(len(np_value)): + inputs[name].append( + self._create_var_from_numpy(np_value[i])) + + # prepare output variable + outputs = defaultdict(list) + for name, 
np_value in six.iteritems(self.outputs): + if not isinstance(np_value, list): + np_value = [np_value] + + for i in range(len(np_value)): + value = np_value[i] + if isinstance(value, tuple): + v = block.create_var( + name="%s_out%d" % (name, i), + dtype=value[0].dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=False) + v._ivar.value().get_tensor( + ).set_recursive_sequence_lengths(value[1]) + else: + v = block.create_var( + name="%s_out%d" % (name, i), + dtype=value.dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=False) + outputs[name].append(v) + + block.append_op( + type=self.op_type, + inputs=inputs, + outputs=outputs, + attrs=self.attrs) + + return outputs + def _calc_output(self, place, parallel=False, no_check_set=None): program = Program() block = program.global_block() self._append_ops(block) @@ -305,8 +363,13 @@ class OpTest(unittest.TestCase): place, atol, no_check_set=None, - equal_nan=False): + equal_nan=False, + check_imperative=False): + if check_imperative: + imperative_outs = self._calc_imperative_output( + place, no_check_set=no_check_set) outs, fetch_list = self._calc_output(place, no_check_set=no_check_set) + for out_name, out_dup in Operator.get_op_outputs(self.op_type): if out_name not in self.outputs: continue @@ -330,6 +393,10 @@ class OpTest(unittest.TestCase): type(sub_out)) for item in sub_out: sub_out_name, expect = item[0], item[1] + if check_imperative: + imperative_actual = imperative_outs[sub_out_name][0] + imperative_actual_t = np.array( + imperative_actual._ivar.value().get_tensor()) idx = find_actual(sub_out_name, fetch_list) actual = outs[idx] actual_t = np.array(actual) @@ -340,12 +407,24 @@ class OpTest(unittest.TestCase): actual_t, expect_t, atol=atol, equal_nan=equal_nan), "Output (" + sub_out_name + ") has diff at " + str(place)) + self.assertTrue( + np.allclose( + imperative_actual_t, + expect_t, + atol=atol, + equal_nan=equal_nan), + "Output (" + sub_out_name + ") has diff at " + + str(place) + " in imperative mode") if isinstance(expect, tuple): self.assertListEqual( actual.recursive_sequence_lengths(), expect[1], "Output (" + sub_out_name + ") has different lod at " + str(place)) else: + if check_imperative: + imperative_actual = imperative_outs[out_name][0] + imperative_actual_t = np.array( + imperative_actual._ivar.value().get_tensor()) idx = find_actual(out_name, fetch_list) actual = outs[idx] actual_t = np.array(actual) @@ -357,10 +436,25 @@ class OpTest(unittest.TestCase): "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + str(expect_t) + "\n" + "But Got" + str(actual_t) + " in class " + self.__class__.__name__) + self.assertTrue( + np.allclose( + imperative_actual_t, + expect_t, + atol=atol, + equal_nan=equal_nan), + "Output (" + out_name + ") has diff at " + str(place) + + "\nExpect " + str(expect_t) + "\n" + "But Got" + + str(imperative_actual_t) + " in class " + + self.__class__.__name__) if isinstance(expect, tuple): self.assertListEqual(actual.recursive_sequence_lengths(), expect[1], "Output (" + out_name + ") has different lod at " + str(place)) + if check_imperative: + self.assertListEqual( + imperative_actual._ivar.value().get_tensor() + .recursive_sequence_lengths(), expect[1], "Output (" + + out_name + ") has different lod at " + str(place)) def _get_places(self): if self.dtype == np.float16: @@ -383,10 +477,15 @@ class OpTest(unittest.TestCase): places.append(core.CUDAPlace(0)) return places - def check_output(self, atol=1e-5, 
no_check_set=None, equal_nan=False): + def check_output(self, + atol=1e-5, + no_check_set=None, + equal_nan=False, + check_imperative=False): places = self._get_places() for place in places: - self.check_output_with_place(place, atol, no_check_set, equal_nan) + self.check_output_with_place(place, atol, no_check_set, equal_nan, + check_imperative) def check_output_customized(self, checker): places = self._get_places() diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py index 6606162733..848c9a4952 100644 --- a/python/paddle/fluid/tests/unittests/test_gru_op.py +++ b/python/paddle/fluid/tests/unittests/test_gru_op.py @@ -156,7 +156,7 @@ class TestGRUOp(OpTest): } def test_check_output(self): - self.check_output(atol=1e-8) + self.check_output(atol=1e-8, check_imperative=True) def test_check_grad(self): self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden']) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index b29ad25870..5b186ae038 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -112,6 +112,47 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) self.assertTrue(np.allclose(static_ret, static_ret2)) + def test_gru_unit(self): + lod = [[2, 4, 3]] + D = 5 + T = sum(lod[0]) + N = len(lod[0]) + + input = np.random.rand(T, 3 * D).astype('float32') + hidden_input = np.random.rand(T, D).astype('float32') + + with self.static_graph(): + x = layers.data(name='x', shape=[-1, D * 3], dtype='float32') + hidden = layers.data(name='hidden', shape=[-1, D], dtype='float32') + updated_hidden, reset_hidden_pre, gate = layers.gru_unit( + input=x, hidden=hidden, size=D * 3) + static_ret = self.get_static_graph_result( + feed={'x': input, + 'hidden': hidden_input}, + fetch_list=[updated_hidden, reset_hidden_pre, gate]) + + with self.static_graph(): + x = layers.data(name='x', shape=[-1, D * 3], dtype='float32') + hidden = layers.data(name='hidden', shape=[-1, D], dtype='float32') + updated_hidden, reset_hidden_pre, gate = layers.gru_unit( + input=x, hidden=hidden, size=D * 3) + gru = nn.GRUUnit('gru', size=D * 3) + updated_hidden, reset_hidden_pre, gate = gru(x, hidden) + + static_ret2 = self.get_static_graph_result( + feed={'x': input, + 'hidden': hidden_input}, + fetch_list=[updated_hidden, reset_hidden_pre, gate]) + + with self.dynamic_graph(): + gru = nn.GRUUnit('gru', size=D * 3) + dy_ret = gru( + base.to_variable(input), base.to_variable(hidden_input)) + + for i in range(len(static_ret)): + self.assertTrue(np.allclose(static_ret[i], static_ret2[i])) + self.assertTrue(np.allclose(static_ret[i], dy_ret[i]._numpy())) + class TestBook(unittest.TestCase): def test_fit_a_line(self): From 14d871121b0d1f9ba17c219ecfb95a49836bbe73 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 11 Mar 2019 10:48:57 +0000 Subject: [PATCH 35/64] enhance jitkernel unit test test=develop --- paddle/fluid/operators/jit/kernel_pool.h | 1 + paddle/fluid/operators/jit/test.cc | 36 +++++++++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/jit/kernel_pool.h b/paddle/fluid/operators/jit/kernel_pool.h index ec5c2be55b..04710a54ac 100644 --- a/paddle/fluid/operators/jit/kernel_pool.h +++ b/paddle/fluid/operators/jit/kernel_pool.h @@ -17,6 +17,7 @@ #include // for unique_ptr #include #include +#include // for move #include #include 
"paddle/fluid/operators/jit/gen_base.h" #include "paddle/fluid/operators/jit/kernel_base.h" diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 068b0ba7ae..6c099a7a06 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -1003,9 +1003,43 @@ TEST(JITKernel_helper, GetAllCandidateFuncs) { } } +TEST(JITKernel_helper, pack_weights) { + const int N = 8 * 60, K = 2; + float src[K][N], yref[K][N], y[K * N]; + float* x = &(src[0][0]); + float* ref = &(yref[0][0]); + for (int i = 0; i < N * K; ++i) { + *(x + i) = static_cast(i); + } + int block = 0; + std::vector groups; + if (paddle::platform::MayIUse(paddle::platform::avx512f)) { + block = ZMM_FLOAT_BLOCK; + groups.push_back(30); + } else { + block = YMM_FLOAT_BLOCK; + groups.insert(groups.end(), {14, 14, 14, 14, 4}); + } + + int offset = 0; + int acc = 0; + for (int g : groups) { + g = g * block; + for (int k = 0; k < K; ++k) { + for (int i = 0; i < g; ++i) { + *(ref + offset) = src[k][i + acc]; + offset++; + } + } + acc += g; + } + + jit::pack_weights(x, y, N, K); + ExpectEQ(y, ref, N * K); +} + TEST(JITKernel_helper, attr) { std::ostringstream out; - // KernelTypes out << jit::to_string(jit::kNone) << jit::to_string(jit::kCRFDecoding) << jit::to_string(jit::kEmbSeqPool) << jit::to_string(jit::kGRUH1) From 31ccaf091641b991af885427eb3071a276ccc70e Mon Sep 17 00:00:00 2001 From: luotao1 Date: Mon, 11 Mar 2019 19:58:41 +0800 Subject: [PATCH 36/64] add all_kernels_must_compute_runtime_shape example for speedup infershape test=develop --- paddle/fluid/framework/operator.cc | 11 +++++++++-- .../operators/fused/fused_embedding_seq_pool_op.cc | 11 ++++++++--- paddle/fluid/operators/hash_op.cc | 11 ++++++++--- .../operators/sequence_ops/sequence_enumerate_op.cc | 11 ++++++++--- 4 files changed, 33 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index df1689764d..9f48b8cb9e 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -926,8 +926,15 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(expected_kernel_key.place_); } - RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx); - this->InferShape(&infer_shape_ctx); + // If Op has attribute all_kernels_must_compute_runtime_shape, + // all the kernels of this Op would compute runtime shape, + // and skip infershape in runtime for speedup. + // TODO(luotao): Note that it is a temporal attribute, after all ops + // implement computing runtime shape, this attribute would be deleted. + if (!HasAttr("all_kernels_must_compute_runtime_shape")) { + RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx); + this->InferShape(&infer_shape_ctx); + } // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext // not Scope. Imperative mode only pass inputs and get outputs. 
kernel_iter->second( diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc index 80caf70b08..17a81d3e88 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc @@ -23,9 +23,6 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (ctx->IsRuntime()) { - return; - } PADDLE_ENFORCE(ctx->HasInput("W"), "Input W of FusedEmbeddingSeqPoolOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Ids"), @@ -91,6 +88,14 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker { "(boolean, default false) " "Sparse update.") .SetDefault(false); + AddAttr( + "all_kernels_must_compute_runtime_shape", + "(boolean, default true) " + "An attribute to speed up OperatorWithKernel::RunImpl." + "If true, all the kernels of this Op would compute runtime " + "shape, but skip infershape in runtime. Note that it is a temporal " + "attribute, please do DOT set it in python layer.") + .SetDefault(true); AddComment(R"DOC( FusedEmbeddingSeqPool Operator. diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc index 7a29f80ff1..b39eba081e 100644 --- a/paddle/fluid/operators/hash_op.cc +++ b/paddle/fluid/operators/hash_op.cc @@ -26,9 +26,6 @@ class HashOp : public framework::OperatorWithKernel { : OperatorWithKernel(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { - if (ctx->IsRuntime()) { - return; - } PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of HashOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), @@ -57,6 +54,14 @@ $$Out = scale * X$$ )DOC"); AddAttr("num_hash", "").SetDefault(1); AddAttr("mod_by", "").SetDefault(100000); + AddAttr( + "all_kernels_must_compute_runtime_shape", + "(boolean, default true) " + "An attribute to speed up OperatorWithKernel::RunImpl." + "If true, all the kernels of this Op would compute runtime " + "shape, but skip infershape in runtime. Note that it is a temporal " + "attribute, please do DOT set it in python layer.") + .SetDefault(true); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc index d3dcd1f96a..63e95e8654 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc @@ -22,9 +22,6 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (ctx->IsRuntime()) { - return; - } PADDLE_ENFORCE( ctx->HasInput("X"), "Input(X) of SequecceEnumerate operator should not be null."); @@ -62,6 +59,14 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker { }); AddAttr("pad_value", "(int) The enumerate sequence padding value.") .SetDefault(0); + AddAttr( + "all_kernels_must_compute_runtime_shape", + "(boolean, default true) " + "An attribute to speed up OperatorWithKernel::RunImpl." + "If true, all the kernels of this Op would compute runtime " + "shape, but skip infershape in runtime. Note that it is a temporal " + "attribute, please do DOT set it in python layer.") + .SetDefault(true); AddComment(R"DOC( Sequence Enumerate Operator. 
From 1f4aa7a202c8fe0e2418ad9424ba39b475b33994 Mon Sep 17 00:00:00 2001 From: Qiyang Min Date: Mon, 11 Mar 2019 22:24:05 +0800 Subject: [PATCH 37/64] Imperative remove all descs (#16045) * Remove Desc in Forward Pass * Refactor VarBase * Add dbg info * Only check type in imperative mode * Polish code and support optimizer test=develop * Fix stop gradient problem in PyLayer test=develop --- paddle/fluid/imperative/layer.cc | 65 +-- paddle/fluid/imperative/layer.h | 127 +++-- paddle/fluid/imperative/tracer.cc | 237 +++++---- paddle/fluid/imperative/tracer.h | 7 +- .../memory/allocation/legacy_allocator.cc | 1 + paddle/fluid/pybind/imperative.cc | 16 +- paddle/fluid/pybind/imperative.h | 3 + paddle/fluid/pybind/protobuf.cc | 92 +--- paddle/fluid/pybind/pybind.cc | 70 +-- paddle/fluid/pybind/pybind_boost_headers.h | 115 +++++ python/paddle/fluid/framework.py | 480 ++++++++++-------- python/paddle/fluid/imperative/layers.py | 2 +- python/paddle/fluid/imperative/tracer.py | 11 +- python/paddle/fluid/optimizer.py | 9 +- .../tests/unittests/test_imperative_basic.py | 2 +- .../tests/unittests/test_imperative_resnet.py | 4 +- 16 files changed, 724 insertions(+), 517 deletions(-) create mode 100644 paddle/fluid/pybind/pybind_boost_headers.h diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 012dfc1c7f..5530823b90 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -159,10 +159,9 @@ class Autograd { for (auto it : candidate->pre_ops_) { for (OpBase* pre_op : it.second) { if (!pre_op) continue; - VLOG(5) << "op dep " << candidate->op_desc_->Type() << " trace id " + VLOG(5) << "op dep " << candidate->Type() << " trace id " << candidate->trace_id_ << " <---- " << it.first << " <---- " - << pre_op->op_desc_->Type() << " trace id " - << pre_op->trace_id_; + << pre_op->Type() << " trace id " << pre_op->trace_id_; if (visited.find(pre_op) == visited.end()) { visited.insert(pre_op); queue.push_back(pre_op); @@ -180,10 +179,12 @@ std::unique_ptr VarBase::NewVarBase(const platform::Place& dst_place, PADDLE_ENFORCE(var_->IsInitialized(), "Variable must be initialized when getting numpy tensor"); - std::unique_ptr new_var(new VarBase()); + // TODO(minqiyang): change this after move unique_name generator to CXX + const framework::LoDTensor& self_tensor = var_->Get(); + std::unique_ptr new_var(new VarBase( + "Itmp", self_tensor.type(), self_tensor.dims(), dst_place, true, false)); framework::LoDTensor* tensor = new_var->var_->GetMutable(); - tensor->Resize(var_->Get().dims()); tensor->set_lod(var_->Get().lod()); if (blocking) { @@ -199,52 +200,62 @@ std::unique_ptr VarBase::NewVarBase(const platform::Place& dst_place, } if (platform::is_gpu_place(dst_place)) { - VLOG(3) << "copy tensor " << var_desc_->Name() << " from gpu"; + VLOG(3) << "copy tensor " << Name() << " from gpu"; } return new_var; } framework::LoDTensor& VarBase::GradValue() { - VLOG(3) << "get var grad " << var_desc_->Name(); + VLOG(3) << "get var grad " << Name(); + PADDLE_ENFORCE_NOT_NULL(grads_, + "Could not get grad value from no grad variable"); return *(grads_->var_->GetMutable()); } std::map> OpBase::ApplyGrad() { if (grad_op_descs_.empty() && backward_id_ <= 0) { - VLOG(3) << "op with no grad: " << op_desc_->Type(); + VLOG(3) << "op with no grad: " << Type(); return {}; } - VLOG(3) << "apply op grad: " << op_desc_->Type(); - std::vector grad_outputs; + VLOG(3) << "apply op grad: " << Type(); + std::vector tmp_grad_outputs; if (backward_id_ > 0) { VLOG(3) << 
"py_layer_grad"; - grad_outputs.resize(1); - grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] = + tmp_grad_outputs.resize(1); + tmp_grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] = PyLayer::ApplyGrad( backward_id_, grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]); } else { - grad_outputs.resize(grad_op_descs_.size()); - for (size_t k = 0; k < grad_op_descs_.size(); ++k) { + const size_t grad_op_count = grad_op_descs_.size(); + + tmp_grad_outputs.resize(grad_op_count); + for (size_t k = 0; k < grad_op_count; ++k) { framework::OpDesc* grad_op_desc = grad_op_descs_[k]; - VLOG(3) << "op grad " << grad_op_desc->Type(); - for (auto it : grad_output_vars_[k]) { - auto& outputs = grad_outputs[k][it.first]; + auto& grad_output_variable_map = grad_output_vars_[k]; + + VLOG(3) << "apply grad op " << grad_op_desc->Type(); + + // Allocate tmp grad output variable + for (auto it : grad_output_variable_map) { + auto& outputs = tmp_grad_outputs[k][it.first]; + outputs.reserve(it.second.size()); for (size_t i = 0; i < it.second.size(); ++i) { // Allocate a new variable Variable* tmp_var = new framework::Variable(); tmp_var->GetMutable(); - outputs.push_back(tmp_var); + outputs.emplace_back(tmp_var); } } - framework::RuntimeContext ctx(grad_input_vars_[k], grad_outputs[k]); + // Run grad op + framework::RuntimeContext ctx(grad_input_vars_[k], tmp_grad_outputs[k]); // No need to do compile time infer shape here. // grad_op_desc_->InferShape(*block_); - grad_op_desc->InferVarType(block_); + // grad_op_desc->InferVarType(block_); std::unique_ptr opbase = framework::OpRegistry::CreateOp(*grad_op_desc); @@ -260,9 +271,10 @@ std::map> OpBase::ApplyGrad() { } } + // Add tmp grad outputs to original grad vars for (size_t k = 0; k < grad_output_vars_.size(); ++k) { for (auto it : grad_output_vars_[k]) { - auto& outputs = grad_outputs[k][it.first]; + auto& outputs = tmp_grad_outputs[k][it.first]; auto& origin_outputs = it.second; PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); @@ -316,19 +328,14 @@ void PyLayer::RegisterFunc(int func_id, const py::object& py_func) { int PyLayer::NumFuncs() { return py_funcs_.size(); } -std::vector PyLayer::Apply(int func_id, - const std::vector& inputs) { +std::vector PyLayer::Apply(int func_id, + const std::vector& inputs) { std::vector invars; for (const VarBase* in : inputs) { invars.push_back(in->var_); } PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); - std::vector outvars = CallPythonFunc(py_funcs_[func_id], invars); - std::vector ret; - for (Variable* v : outvars) { - ret.push_back(new VarBase(v, new VarBase(true))); - } - return ret; + return CallPythonFunc(py_funcs_[func_id], invars); } std::vector PyLayer::ApplyGrad( diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 7a9f33dc1e..618a5b7a03 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -112,31 +112,53 @@ class OpBase; */ class VarBase { public: - VarBase() : VarBase(new framework::Variable(), new VarBase(true)) {} - - explicit VarBase(bool stop_gradient) - : VarBase(new framework::Variable(), - stop_gradient ? 
nullptr : new VarBase(true), stop_gradient) {} - - VarBase(framework::Variable* var, VarBase* grad) - : VarBase(var, grad, false) {} + // Internal interface, create VarBase from exist variable + VarBase(const std::string& name, framework::Variable* var, VarBase* grad, + bool stop_gradient) + : VarBase(name, var->Get().type(), + var->Get().dims(), + var->Get().place(), var, grad, + stop_gradient, false) {} + + // Python interface + VarBase(const std::string& name, const framework::proto::VarType::Type dtype, + const std::vector& shape, const platform::Place& place, + bool stop_gradient, bool persistable) + : VarBase(name, dtype, framework::make_ddim(shape), place, stop_gradient, + persistable) {} + + // Internal interface, create VarBase from with ddim + VarBase(const std::string& name, const framework::proto::VarType::Type dtype, + const framework::DDim& shape, const platform::Place& place, + bool stop_gradient, bool persistable) + : VarBase(name, dtype, shape, place, nullptr, nullptr, stop_gradient, + persistable) {} private: - VarBase(framework::Variable* var, VarBase* grad, bool stop_gradient) - : name_(), - var_desc_(nullptr), + VarBase(const std::string& name, framework::proto::VarType::Type dtype, + const framework::DDim& shape, const platform::Place& place, + framework::Variable* var, VarBase* grad, bool stop_gradient, + bool persistable) + : name_(name), + dtype_(dtype), + place_(place), var_(var), grads_(grad), - block_(nullptr), - persistable_(false), stop_gradient_(stop_gradient), + persistable_(persistable), pre_op_(nullptr), pre_op_out_name_(), - pre_op_out_idx_(-1) {} + pre_op_out_idx_(-1) { + if (!var_) { + var_ = new framework::Variable(); + auto tensor = var_->GetMutable(); + tensor->Resize(shape); + tensor->mutable_data(place_, dtype_); + } + } public: virtual ~VarBase() { - // TODO(minqiyang): remove var desc from block desc if (var_) { delete var_; var_ = nullptr; @@ -151,14 +173,30 @@ class VarBase { pre_op_out_idx_ = -1; } - inline OpBase* PreOp() const { return pre_op_; } - inline int PreOpOutIdx() const { return pre_op_out_idx_; } + inline void SetName(const std::string& name) { name_ = name; } + inline std::string Name() const { return name_; } + + inline std::vector Shape() const { + if (var_->IsInitialized()) { + return framework::vectorize(var_->Get().dims()); + } else { + return {}; + } + } + + inline framework::proto::VarType::Type DType() const { return dtype_; } inline void SetStopGradient(bool stop_gradient) { stop_gradient_ = stop_gradient; } inline bool IsStopGradient() const { return stop_gradient_; } + inline void SetPersistable(bool persistable) { persistable_ = persistable; } + inline bool IsPersistable() const { return persistable_; } + + inline OpBase* PreOp() const { return pre_op_; } + inline int PreOpOutIdx() const { return pre_op_out_idx_; } + void RunBackward(); inline void ResetPreOp(OpBase* op) { @@ -180,7 +218,7 @@ class VarBase { } void ClearGradient() { - VLOG(1) << "clear gradient of " << var_desc_->Name(); + VLOG(1) << "clear gradient of " << Name(); if (grads_ && grads_->var_ && grads_->var_->IsInitialized()) { auto grads_t = grads_->var_->GetMutable(); operators::math::set_constant( @@ -196,23 +234,20 @@ class VarBase { const bool blocking) const; inline std::string GradName() const { - PADDLE_ENFORCE( - var_desc_, - "Couldn't get gradient variable's name, please call backward() first"); - return string::Sprintf("%s@IGrad", var_desc_->Name()); + return string::Sprintf("%s@IGrad", Name()); } std::string name_; - framework::VarDesc* 
var_desc_; + framework::proto::VarType::Type dtype_; + platform::Place place_; framework::Variable* var_; VarBase* grads_; - framework::BlockDesc* block_; - bool persistable_; - private: bool stop_gradient_; + bool persistable_; + OpBase* pre_op_; std::string pre_op_out_name_; int pre_op_out_idx_; @@ -223,11 +258,11 @@ class VarBase { */ class PYBIND11_HIDDEN OpBase { public: - OpBase() - : op_desc_(nullptr), + OpBase(const std::string& type) + : type_(type), + trace_id_(-1), forward_id_(-1), backward_id_(-1), - trace_id_(-1), place_(platform::CPUPlace()), backward_hooks_() {} @@ -249,13 +284,34 @@ class PYBIND11_HIDDEN OpBase { std::map> ApplyGrad(); + inline std::string Type() const { return type_; } + inline std::string GradOpType(size_t index) const { + PADDLE_ENFORCE_NOT_NULL(grad_op_descs_[index]); + return grad_op_descs_[index]->Type(); + } + void RegisterBackwardHooks(const py::object& callable); void InvokeBackwardHooks(); - // One of `op_desc_` or `forward_id_` is set, not both. - // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_. - framework::OpDesc* op_desc_; + void TrackPreOp(const VarBase* inp_var, const std::string& inp_name) { + if (inp_var->PreOp() && !inp_var->IsStopGradient()) { + VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot " + << inp_name; + pre_ops_[inp_name].push_back(inp_var->PreOp()); + pre_ops_out_idx_[inp_name].push_back(inp_var->PreOpOutIdx()); + } else { + VLOG(3) << "no pre op in slot " << inp_name + << " input var stop_gradient: " << inp_var->IsStopGradient(); + pre_ops_[inp_name].push_back(nullptr); + // pre_ops_out_idx_[inp_name].push_back(-1); + } + } + + std::string type_; + // One of `trace_id_` or `forward_id_` is set, not both. + // For pure python PyLayer, use `forward_id_`, otherwise, use trace_id_. + int trace_id_; int forward_id_; // When has backward, one of `grad_op_descs_` or `backward_id_` is set, @@ -263,7 +319,6 @@ class PYBIND11_HIDDEN OpBase { // Note: each fwd op corresponds to a vector of bwd ops. std::vector grad_op_descs_; int backward_id_; - int trace_id_; platform::Place place_; @@ -277,8 +332,6 @@ class PYBIND11_HIDDEN OpBase { // Outputs to a vector of bwd ops. 
std::vector grad_output_vars_; - framework::BlockDesc* block_; - std::vector backward_hooks_; }; @@ -303,8 +356,8 @@ class PyLayer { static int NumFuncs(); - static std::vector Apply(int func_id, - const std::vector& inputs); + static std::vector Apply( + int func_id, const std::vector& inputs); static std::vector ApplyGrad( int func_id, const std::vector& inputs); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 0cb1676372..7ee92b4d8c 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -56,15 +56,19 @@ void CreateGradOp(const framework::OpDesc& op_desc, } } -void InitVar(framework::Variable* var, framework::Variable* grad_var, - platform::DeviceContext* dev_ctx) { +void InitGrad(VarBase* var, platform::DeviceContext* dev_ctx) { + PADDLE_ENFORCE_NOT_NULL(var, "Could not get valid var base"); PADDLE_ENFORCE_NOT_NULL(dev_ctx, "Could not get valid device from forward op"); - auto& var_t = var->Get(); - grad_var->GetMutable()->mutable_data( - var_t.dims(), dev_ctx->GetPlace()); - operators::math::set_constant( - *dev_ctx, grad_var->GetMutable(), 0.0); + + if (var->grads_ == nullptr) { + auto& var_t = var->var_->Get(); + var->grads_ = new VarBase(var->GradName(), framework::proto::VarType::FP32, + framework::vectorize(var_t.dims()), + dev_ctx->GetPlace(), true, false); + auto grad_t = var->grads_->var_->GetMutable(); + operators::math::set_constant(*dev_ctx, grad_t, 0.0); + } } platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) { @@ -85,6 +89,62 @@ platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) { return result; } +framework::VariableNameMap CreateInputVarNameMap( + const OpBase* op, const VarBasePtrMap& varbase_map) { + framework::VariableNameMap result; + + auto& info_map = framework::OpInfoMap::Instance(); + auto* op_info = info_map.GetNullable(op->Type()); + if (op_info == nullptr || op_info->proto_ == nullptr) { + return result; + } + + for (auto& in : op_info->Proto().inputs()) { + auto it = varbase_map.find(in.name()); + if (it == varbase_map.end()) { + PADDLE_ENFORCE(in.dispensable()); + result[in.name()] = {}; + } else { + auto var_vector = it->second; + std::vector args; + args.reserve(var_vector.size()); + for (VarBase* var_base : var_vector) { + args.emplace_back(var_base->Name()); + } + result[in.name()] = args; + } + } + return result; +} + +framework::VariableNameMap CreateOutputVarNameMap( + const OpBase* op, const VarBasePtrMap& varbase_map) { + framework::VariableNameMap result; + + auto& info_map = framework::OpInfoMap::Instance(); + auto* op_info = info_map.GetNullable(op->Type()); + if (op_info == nullptr || op_info->proto_ == nullptr) { + return result; + } + + for (auto& out : op_info->Proto().outputs()) { + auto it = varbase_map.find(out.name()); + if (it == varbase_map.end()) { + PADDLE_ENFORCE(out.dispensable()); + result[out.name()] = {}; + } else { + auto var_vector = it->second; + std::vector args; + args.reserve(var_vector.size()); + for (VarBase* var_base : var_vector) { + args.emplace_back(var_base->Name()); + } + result[out.name()] = args; + } + } + return result; +} + Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { if (!FLAGS_tracer_profile_fname.empty()) { std::call_once(gTracerProfileOnce, [] { @@ -101,7 +161,7 @@ Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, const VarBasePtrMap& outputs, - 
framework::BlockDesc* block, + framework::AttributeMap attrs_map, const platform::Place expected_place, const bool stop_gradient) { #ifdef WITH_GPERFTOOLS @@ -110,40 +170,27 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, } #endif - std::map vars; - - framework::OpDesc* op_desc = op->op_desc_; - VLOG(3) << "tracer tracing " << op_desc->Type() << " trace id " - << op->trace_id_; - op_desc->InferShape(*block); - op_desc->InferVarType(block); - - std::unique_ptr op_base = - framework::OpRegistry::CreateOp(*op_desc); - framework::VariableValueMap invars_map; framework::VariableValueMap outvars_map; + // Construct input_vars_map and output_vars_map + std::map current_vars_map; op->input_vars_ = inputs; for (auto it : op->input_vars_) { auto& invars = invars_map[it.first]; invars.reserve(it.second.size()); for (VarBase* inp : it.second) { - PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", - op->op_desc_->Type(), inp->var_desc_->Name()); + PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", op->Type(), + inp->Name()); invars.emplace_back(inp->var_); - vars[inp->var_desc_->Name()] = inp; - if (inp->PreOp() && !inp->IsStopGradient()) { - op->pre_ops_[it.first].push_back(inp->PreOp()); - op->pre_ops_out_idx_[it.first].push_back(inp->PreOpOutIdx()); - VLOG(3) << "add pre op " << inp->PreOp()->op_desc_->Type(); - } else { - op->pre_ops_[it.first].push_back(nullptr); + op->TrackPreOp(inp, it.first); + if (!stop_gradient) { + current_vars_map[inp->Name()] = inp; } - VLOG(3) << "input vname " << inp->var_desc_->Name() << " " - << inp->var_->IsInitialized() << " stop_gradient " - << inp->IsStopGradient(); + VLOG(3) << "input var name: " << inp->Name() + << " inited: " << inp->var_->IsInitialized() + << " stop_grad: " << inp->IsStopGradient(); } } @@ -152,25 +199,38 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, auto& outvars = outvars_map[it.first]; const std::vector& outputs = it.second; outvars.reserve(outputs.size()); - for (size_t i = 0; i < outputs.size(); ++i) { + for (size_t i = 0U; i < outputs.size(); ++i) { VarBase* out = outputs[i]; outvars.emplace_back(out->var_); - vars[out->var_desc_->Name()] = out; - - framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name()); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - out->var_->GetMutable(); - } else { - LOG(ERROR) << "tracer doesn't support yet"; - } out->TrackPreOp(op, it.first, i, stop_gradient); + if (!stop_gradient) { + current_vars_map[out->Name()] = out; + } - VLOG(3) << "output vname " << out->var_desc_->Name() << " " - << out->var_->IsInitialized(); + VLOG(3) << "input var name: " << out->Name() + << " inited: " << out->var_->IsInitialized() + << " stop_grad: " << out->IsStopGradient(); } } - VLOG(3) << "tracer running " << op_desc->Type(); + // Check attrs and create op + framework::VariableNameMap invars_name_map = + CreateInputVarNameMap(op, inputs); + framework::VariableNameMap outvars_name_map = + CreateOutputVarNameMap(op, outputs); + + auto& info = framework::OpInfoMap::Instance().Get(op->Type()); + if (info.Checker() != nullptr) { + info.Checker()->Check(&attrs_map); + } + + std::unique_ptr op_base = + framework::OpRegistry::CreateOp(op->Type(), invars_name_map, + outvars_name_map, attrs_map); + + // TODO(minqiyang): Support infer var type in imperative mode + // Run forward op + VLOG(3) << "tracer running " << op->Type(); framework::RuntimeContext ctx(invars_map, outvars_map); // TODO(panyx0718): Cache p. 
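  // A small illustration of the name maps built above (the VarBase names are
  // made up for the example): tracing a "mul" op whose inputs hold VarBases
  // "x0" (slot X) and "y0" (slot Y) and whose output holds "out0" (slot Out)
  // gives
  //   invars_name_map  = { {"X", {"x0"}}, {"Y", {"y0"}} }
  //   outvars_name_map = { {"Out", {"out0"}} }
  // so the operator and its grad op descs are created directly from VarBase
  // names plus attrs_map, with no per-block VarDesc bookkeeping.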
@@ -186,36 +246,44 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, framework::ExecutionContext(prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx, prepared_op.kernel_configs)); + // construct backward op std::set vars_saved_for_backward; - if (!stop_gradient) { + VLOG(5) << "start construct backward op"; + + // construct grad op descs + std::unique_ptr fwd_op_desc(new framework::OpDesc( + op->Type(), invars_name_map, outvars_name_map, attrs_map)); std::unique_ptr> grad_to_var( new std::unordered_map()); - CreateGradOp(*op_desc, {}, {block}, &op->grad_op_descs_, grad_to_var.get()); + // NOTE(minqiyang): We don't support control flow op in imperative now + // Add grad_block_ when we want to support it + CreateGradOp(*fwd_op_desc, {}, {}, &op->grad_op_descs_, grad_to_var.get()); - op->grad_input_vars_.resize(op->grad_op_descs_.size()); - op->grad_output_vars_.resize(op->grad_op_descs_.size()); + VLOG(5) << "create grad op desc: " << op->grad_op_descs_[0]->Type(); - for (size_t i = 0; i < op->grad_op_descs_.size(); ++i) { + const size_t grad_op_count = op->grad_op_descs_.size(); + + op->grad_input_vars_.resize(grad_op_count); + op->grad_output_vars_.resize(grad_op_count); + + for (size_t i = 0; i < grad_op_count; ++i) { framework::OpDesc* grad_op_desc = op->grad_op_descs_[i]; for (auto it : grad_op_desc->Inputs()) { auto& grad_in_vars = op->grad_input_vars_[i][it.first]; + grad_in_vars.reserve(it.second.size()); for (const std::string& grad_invar : it.second) { - block->FindRecursiveOrCreateVar(grad_invar); auto var_it = grad_to_var->find(grad_invar); if (var_it == grad_to_var->end()) { - auto fwd_var_it = vars.find(grad_invar); - PADDLE_ENFORCE(fwd_var_it != vars.end()); + auto fwd_var_it = current_vars_map.find(grad_invar); + PADDLE_ENFORCE(fwd_var_it != current_vars_map.end()); // Forward inputs or outputs. - grad_in_vars.push_back(fwd_var_it->second->var_); + grad_in_vars.emplace_back(fwd_var_it->second->var_); } else { - VarBase* var = vars[var_it->second]; - if (!var->grads_->var_->IsInitialized()) { - InitVar(var->var_, var->grads_->var_, - prepared_op.GetDeviceContext()); - } + VarBase* var = current_vars_map[var_it->second]; + InitGrad(var, prepared_op.GetDeviceContext()); // Douts. 
- grad_in_vars.push_back(var->grads_->var_); + grad_in_vars.emplace_back(var->grads_->var_); } vars_saved_for_backward.insert(it.first); @@ -225,48 +293,48 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, for (auto it : grad_op_desc->Outputs()) { auto& grad_out_vars = op->grad_output_vars_[i][it.first]; for (const std::string& grad_outvar : it.second) { - block->FindRecursiveOrCreateVar(grad_outvar); auto var_it = grad_to_var->find(grad_outvar); PADDLE_ENFORCE(var_it != grad_to_var->end(), "Could not found the grad op output var, should this " "operator %s's stop gradient be True", - op_desc->Type()); - VarBase* var = vars[var_it->second]; - if (!var->grads_->var_->IsInitialized()) { - InitVar(var->var_, var->grads_->var_, - prepared_op.GetDeviceContext()); - } + op->Type()); + VarBase* var = current_vars_map[var_it->second]; + InitGrad(var, prepared_op.GetDeviceContext()); grad_out_vars.push_back(var->grads_->var_); } } } } - op->block_ = block; return vars_saved_for_backward; } std::vector Tracer::PyTrace(OpBase* op, const std::vector& inputs, bool stop_gradient) { - VLOG(3) << "py_trace"; + VLOG(3) << "py_trace " << op->Type(); + op->input_vars_[PyLayer::kFwdInp] = inputs; - op->output_vars_[PyLayer::kFwdOut] = PyLayer::Apply(op->forward_id_, inputs); + + std::vector ret_vars = + PyLayer::Apply(op->forward_id_, inputs); + for (VarBase* inp : inputs) { - if (inp->PreOp() && !inp->IsStopGradient()) { - op->pre_ops_[PyLayer::kFwdInp].push_back(inp->PreOp()); - op->pre_ops_out_idx_[PyLayer::kFwdInp].push_back(inp->PreOpOutIdx()); - } else { - op->pre_ops_[PyLayer::kFwdInp].push_back(nullptr); - } + op->TrackPreOp(inp, PyLayer::kFwdInp); } - auto& outputs = op->output_vars_[PyLayer::kFwdOut]; - for (size_t i = 0; i < outputs.size(); ++i) { - VarBase* out = outputs[i]; + std::vector& outputs = op->output_vars_[PyLayer::kFwdOut]; + outputs.reserve(ret_vars.size()); + for (size_t i = 0U; i != ret_vars.size(); ++i) { + framework::Variable* v = ret_vars[i]; + VarBase* out = new VarBase(string::Sprintf("%s_out_%d", op->Type(), i), v, + nullptr, stop_gradient); + outputs.emplace_back(out); out->TrackPreOp(op, PyLayer::kFwdOut, i, stop_gradient); } + if (!stop_gradient) { + VLOG(5) << "start construct backward op"; op->grad_input_vars_.resize(1); op->grad_output_vars_.resize(1); auto& grad_input_vars = @@ -281,23 +349,16 @@ std::vector Tracer::PyTrace(OpBase* op, grad_input_vars.push_back(out->var_); } + // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now platform::CPUPlace place; for (VarBase* out : outputs) { + InitGrad(out, platform::DeviceContextPool::Instance().Get(place)); grad_input_vars.push_back(out->grads_->var_); - if (!grad_input_vars.back()->IsInitialized()) { - // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now - InitVar(out->var_, grad_input_vars.back(), - platform::DeviceContextPool::Instance().Get(place)); - } } - for (const VarBase* inp : inputs) { + for (VarBase* inp : inputs) { + InitGrad(inp, platform::DeviceContextPool::Instance().Get(place)); grad_output_vars.push_back(inp->grads_->var_); - if (!grad_output_vars.back()->IsInitialized()) { - // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now - InitVar(inp->var_, grad_output_vars.back(), - platform::DeviceContextPool::Instance().Get(place)); - } } } return outputs; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 8a0267c37f..7b65d55e9e 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h 
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
index 8a0267c37f..7b65d55e9e 100644
--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
@@ -17,6 +17,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 
 #include "paddle/fluid/framework/op_desc.h"
@@ -34,7 +36,8 @@ void CreateGradOp(const framework::OpDesc& op_desc,
                   framework::OpDesc** grad_op_desc,
                   std::unordered_map<std::string, std::string>* grad_to_var);
 
-void InitVar(framework::Variable* var, framework::Variable* grad_var);
+void InitVar(const VarBase* var, framework::Variable* grad_var,
+             platform::DeviceContext* dev_ctx);
 
 platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs);
 
@@ -46,7 +49,7 @@ class Tracer {
 
   std::set<std::string> Trace(OpBase* op, const VarBasePtrMap& inputs,
                               const VarBasePtrMap& outputs,
-                              framework::BlockDesc* block,
+                              framework::AttributeMap attrs_map,
                               const platform::Place expected_place,
                               const bool stop_gradient = false);
 
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc
index 1936f9d4cd..a97d54a191 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/memory/allocation/legacy_allocator.h"
 
+#include
 #include
 #include
 #include
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index aeabed19ab..6bbda69297 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -13,10 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/pybind/imperative.h"
+
+#include
+#include
+#include
+#include
+
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/imperative/tracer.h"
 #include "paddle/fluid/imperative/type_defs.h"
+#include "paddle/fluid/pybind/pybind_boost_headers.h"
+
 namespace paddle {
 namespace pybind {
 
@@ -31,20 +39,20 @@ void BindTracer(pybind11::module* m) {
           [](imperative::Tracer& self, imperative::OpBase* op,
              const imperative::VarBasePtrMap& inputs,
              const imperative::VarBasePtrMap& outputs,
-             framework::BlockDesc* block,
+             framework::AttributeMap attrs_map,
              const platform::CPUPlace expected_place,
              const bool stop_gradient = false) {
-            return self.Trace(op, inputs, outputs, block, expected_place,
+            return self.Trace(op, inputs, outputs, attrs_map, expected_place,
                               stop_gradient);
          })
      .def("trace",
           [](imperative::Tracer& self, imperative::OpBase* op,
              const imperative::VarBasePtrMap& inputs,
              const imperative::VarBasePtrMap& outputs,
-             framework::BlockDesc* block,
+             framework::AttributeMap attrs_map,
              const platform::CUDAPlace expected_place,
              const bool stop_gradient = false) {
-            return self.Trace(op, inputs, outputs, block, expected_place,
+            return self.Trace(op, inputs, outputs, attrs_map, expected_place,
                               stop_gradient);
          })
      .def("py_trace", &imperative::Tracer::PyTrace,
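With the binding change above, the tracer now receives the op's attribute map directly instead of a `framework::BlockDesc*`. A rough C++ illustration of driving the new `Trace` signature is below; the wrapper function, its arguments, and the "use_mkldnn" attribute are placeholders for illustration, not code from this PR.

// Illustrative only: attrs_map replaces the former BlockDesc* argument.
#include <set>
#include <string>
#include "paddle/fluid/imperative/tracer.h"

namespace paddle {
namespace imperative {

std::set<std::string> TraceWithAttrs(Tracer* tracer, OpBase* op,
                                     const VarBasePtrMap& ins,
                                     const VarBasePtrMap& outs) {
  framework::AttributeMap attrs;
  attrs["use_mkldnn"] = false;  // hypothetical attribute, for illustration
  return tracer->Trace(op, ins, outs, attrs, platform::CPUPlace(),
                       /*stop_gradient=*/false);
}

}  // namespace imperative
}  // namespace paddle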
diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h
index 8c48b2a715..8496cbfcb1 100644
--- a/paddle/fluid/pybind/imperative.h
+++ b/paddle/fluid/pybind/imperative.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include
+#include
 #include
 #include "paddle/fluid/imperative/layer.h"
 #include "pybind11/pybind11.h"
@@ -36,6 +37,8 @@ class Layer : public imperative::Layer {
 class PYBIND11_HIDDEN PyOpBase : public imperative::OpBase {
  public:
   using imperative::OpBase::OpBase;  // Inherit constructors
+
+  PyOpBase(const std::string& name) : OpBase(name) {}
 };
 
 class PyVarBase : public imperative::VarBase {
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index e729be4a95..7b5e417504 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -23,97 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/var_desc.h"
 
-// Cast boost::variant for PyBind.
-// Copy from
-// https://github.com/pybind/pybind11/issues/576#issuecomment-269563199
-namespace pybind11 {
-namespace detail {
-
-#if !defined(PYBIND11_HIDDEN)
-#ifdef _WIN32
-#define PYBIND11_HIDDEN __declspec(dllexport)
-#else
-#define PYBIND11_HIDDEN __attribute__((visibility("hidden")))
-#endif
-#endif
-
-// Can be replaced by a generic lambda in C++14
-struct PYBIND11_HIDDEN paddle_variant_caster_visitor
-    : public boost::static_visitor<handle> {
-  return_value_policy policy;
-  handle parent;
-
-  paddle_variant_caster_visitor(return_value_policy policy, handle parent)
-      : policy(policy), parent(parent) {}
-
-  template <class T>
-  handle operator()(T const &src) const {
-    return make_caster<T>::cast(src, policy, parent);
-  }
-};
-
-template <class Variant>
-struct paddle_variant_caster;
-
-template