From 594dc4d8f0a74fe7640d22830cd221b91cbebbb5 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 10 Jan 2019 01:47:22 +0000 Subject: [PATCH 001/230] partial gc 1st version test=develop --- paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../details/eager_deletion_op_handle.cc | 14 +- .../framework/details/eager_deletion_pass.cc | 125 +++++++++++++++++- .../framework/details/reference_count_pass.cc | 9 -- .../details/reference_count_pass_helper.cc | 15 ++- .../details/reference_count_pass_helper.h | 8 +- python/paddle/fluid/__init__.py | 1 + 7 files changed, 158 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 179aa14528..cb34712975 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -54,7 +54,7 @@ cc_library(memory_optimize_pass SRCS analysis_var_pass.cc memory_reuse_types.cc cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass) -cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) +cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle proto_desc var_handle) cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper) diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 03fbfd7f24..58cdd65601 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/profiler.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -45,6 +46,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( } } #endif + PADDLE_ENFORCE(!var_names_.empty(), "Var names cannot be empty"); } EagerDeletionOpHandle::~EagerDeletionOpHandle() { @@ -60,7 +62,13 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } void EagerDeletionOpHandle::RunImpl() { - auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); +#ifdef PADDLE_WITH_CUDA + platform::RecordEvent record_event(Name(), dev_ctx_); +#else + platform::RecordEvent record_event(Name(), nullptr); +#endif + + Scope *exec_scope = nullptr; std::deque> garbages; for (auto &name : var_names_) { auto it = ref_cnts_->find(name); @@ -69,6 +77,10 @@ void EagerDeletionOpHandle::RunImpl() { continue; } + if (!exec_scope) { + exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); + } + auto *var = exec_scope->FindVar(name); if (var == nullptr) { continue; diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index 4e42d0b497..6c8cb66b10 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -12,8 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include #include #include +#include #include #include "paddle/fluid/framework/details/computation_op_handle.h" @@ -22,10 +25,120 @@ #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" +DEFINE_double(fraction_of_eager_deletion, 1.0, "Fraction of eager deletion"); +DEFINE_bool(eager_delete_tensor_only, false, ""); + namespace paddle { namespace framework { namespace details { +namespace { // NOLINT +using OpToVarNameSetMap = + std::unordered_map>; +} // NOLINT + +static bool IsLoDTensor(VarDesc *var) { + return var->Proto()->type().type() == proto::VarType::LOD_TENSOR; +} + +static int64_t GetNumel(const GraphVars &vars, const std::string &var_name, + size_t scope_idx) { + auto *var_desc = TryGetLatestVarDesc(vars[scope_idx].at(var_name)); + PADDLE_ENFORCE(IsLoDTensor(var_desc)); + auto dims = var_desc->GetShape(); + return std::accumulate(dims.begin(), dims.end(), static_cast(1), + std::multiplies()); +} + +static void SplitIntoLoDTensorAndNonLoDTensorVars( + const OpToVarNameSetMap &m, const GraphVars &vars, + OpToVarNameSetMap *lod_tensors, OpToVarNameSetMap *other_vars) { + lod_tensors->clear(); + other_vars->clear(); + + for (auto &op_vars_pair : m) { + for (auto &var_name : op_vars_pair.second) { + auto *var_desc = TryGetLatestVarDesc( + vars[op_vars_pair.first->GetScopeIdx()].at(var_name)); + if (IsLoDTensor(var_desc)) { + (*lod_tensors)[op_vars_pair.first].insert(var_name); + } else { + (*other_vars)[op_vars_pair.first].insert(var_name); + } + } + } +} + +static OpToVarNameSetMap ShrinkGCVars(const OpToVarNameSetMap &m, + const GraphVars &vars, + double fraction_of_memory_size, + bool delete_lod_tensor_only = false) { + // Do not perform gc + if (fraction_of_memory_size <= 0.0) return {}; + + // Perform complete gc + if (fraction_of_memory_size >= 1.0) { + if (delete_lod_tensor_only) { + OpToVarNameSetMap lod_tensors, other_vars; + SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars); + return lod_tensors; + } else { + return m; + } + } + + // Perform partial gc + OpToVarNameSetMap lod_tensors, other_vars; + SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars); + + using TupleType = std::tuple; + + std::unordered_map> place_to_vars; + std::unordered_map total_memory_size; + for (auto &op_vars_pair : lod_tensors) { + auto scope_idx = op_vars_pair.first->GetScopeIdx(); + int64_t size = 0; + for (auto &var_name : op_vars_pair.second) { + auto var_size = GetNumel(vars, var_name, scope_idx); + size += std::abs(var_size); + place_to_vars[scope_idx].emplace_back(var_name, op_vars_pair.first, + var_size); + } + total_memory_size.emplace(scope_idx, size); + } + + for (auto &pair : place_to_vars) { + std::sort(pair.second.begin(), pair.second.end(), + [](const TupleType &t1, const TupleType &t2) { + return std::abs(std::get<2>(t1)) > std::abs(std::get<2>(t2)); + }); + } + + OpToVarNameSetMap ret; + for (auto &pair : place_to_vars) { + auto desired_delete_size = static_cast( + fraction_of_memory_size * total_memory_size.at(pair.first)); + int64_t cur_size = 0; + for (size_t i = 0; i < pair.second.size() && cur_size < desired_delete_size; + ++i) { + auto &var_name = std::get<0>(pair.second[i]); + auto *op = std::get<1>(pair.second[i]); + cur_size += std::get<2>(pair.second[i]); + ret[op].insert(var_name); + } + } + + if (!delete_lod_tensor_only) { + for (auto &op_vars_pair : other_vars) { + for (auto &var_name : op_vars_pair.second) { + ret[op_vars_pair.first].insert(var_name); + } + } + } + + return ret; +} + std::unique_ptr EagerDeletionPass::ApplyImpl( std::unique_ptr graph) const { auto &ref_cnts = @@ -43,9 +156,7 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( // a reverse map of last_live_ops // i.e., last op --> variable names which can be deleted. - std::unordered_map> - op_vars_map; - + OpToVarNameSetMap op_vars_map; for (auto &var_ops_map : last_live_ops) { for (auto &var_ops_pair : var_ops_map) { const std::string &var_name = var_ops_pair.first; @@ -55,6 +166,10 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( } } + op_vars_map = + ShrinkGCVars(op_vars_map, vars, FLAGS_fraction_of_eager_deletion, + FLAGS_eager_delete_tensor_only); + for (auto &pair : op_vars_map) { auto *op = pair.first; auto &var_names = pair.second; @@ -85,6 +200,10 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( eager_deletion_op->AddOutput(dummy_leaf); } + VLOG(10) << "FLAGS_fraction_of_eager_deletion = " + << FLAGS_fraction_of_eager_deletion; + VLOG(10) << "FLAGS_eager_delete_tensor_only = " + << FLAGS_eager_delete_tensor_only; VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)"; return graph; } diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 13a042d8e6..892f638f1f 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -189,15 +189,6 @@ ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx, return shrink_func(computation_op); } -static VarDesc *TryGetLatestVarDesc(const std::vector &vars) { - VarDesc *var_desc = nullptr; - std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool { - var_desc = var_handle->Node()->Var(); - return var_desc != nullptr; - }); - return var_desc; -} - std::unique_ptr ReferenceCountPass::ApplyImpl( std::unique_ptr graph) const { auto &ref_cnts = Get>(kGlobalReferenceCount); diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.cc b/paddle/fluid/framework/details/reference_count_pass_helper.cc index 89bd08c2d0..94de0e6ab0 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.cc +++ b/paddle/fluid/framework/details/reference_count_pass_helper.cc @@ -13,9 +13,22 @@ // limitations under the License. #include "paddle/fluid/framework/details/reference_count_pass_helper.h" +#include "paddle/fluid/framework/details/var_handle.h" +#include "paddle/fluid/framework/var_desc.h" namespace paddle { namespace framework { -namespace details {} // namespace details +namespace details { + +VarDesc *TryGetLatestVarDesc(const std::vector &vars) { + VarDesc *var_desc = nullptr; + std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool { + var_desc = var_handle->Node()->Var(); + return var_desc != nullptr; + }); + return var_desc; +} + +} // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h index 1c083dbf00..d9e8776d7e 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.h +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -25,6 +25,10 @@ namespace paddle { namespace framework { + +class VarDesc; +class VarHandle; + namespace details { class ComputationOpHandle; @@ -43,9 +47,11 @@ const char kGarbageCollector[] = "garbage_collector"; const char kAllPlaces[] = "all_places"; using LastLiveOpsOfVars = - std::unordered_map>; + std::unordered_map>; const char kLastLiveOpsOfVars[] = "last_live_ops_of_var"; +VarDesc *TryGetLatestVarDesc(const std::vector &vars); + } // namespace details } // namespace framework } // namespace paddle diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index f9f3807b15..794f583037 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -127,6 +127,7 @@ def __bootstrap__(): 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'fast_eager_deletion_mode', + 'fraction_of_eager_deletion', 'eager_delete_tensor_only', 'allocator_strategy', 'reader_queue_speed_test_mode', 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir', 'enable_parallel_graph' From 7160cb0f322aa5a0f3478bc2957ea704a907c30b Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 19 Feb 2019 11:15:23 +0000 Subject: [PATCH 002/230] decoupled reader test=develop --- paddle/fluid/framework/reader.h | 53 +++++- paddle/fluid/operators/reader/CMakeLists.txt | 7 +- .../fluid/operators/reader/blocking_queue.h | 7 +- .../fluid/operators/reader/buffered_reader.cc | 32 ++-- .../fluid/operators/reader/buffered_reader.h | 6 +- .../fluid/operators/reader/compose_reader.cc | 39 +++++ .../fluid/operators/reader/compose_reader.h | 34 ++++ .../operators/reader/create_py_reader_op.cc | 26 +-- paddle/fluid/operators/reader/py_reader.cc | 78 +++++++++ paddle/fluid/operators/reader/py_reader.h | 62 +++++++ paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/fluid/pybind/pybind.cc | 54 +++++- paddle/fluid/pybind/reader_py.cc | 132 +++++++++++++++ paddle/fluid/pybind/reader_py.h | 25 +++ python/paddle/fluid/compiler.py | 43 +++-- python/paddle/fluid/executor.py | 4 + python/paddle/fluid/framework.py | 36 ++++ python/paddle/fluid/io.py | 4 +- python/paddle/fluid/reader.py | 141 ++++++++++++++++ .../unittests/test_decoupled_py_reader.py | 157 ++++++++++++++++++ 20 files changed, 869 insertions(+), 73 deletions(-) create mode 100644 paddle/fluid/operators/reader/compose_reader.cc create mode 100644 paddle/fluid/operators/reader/compose_reader.h create mode 100644 paddle/fluid/operators/reader/py_reader.cc create mode 100644 paddle/fluid/operators/reader/py_reader.h create mode 100644 paddle/fluid/pybind/reader_py.cc create mode 100644 paddle/fluid/pybind/reader_py.h create mode 100644 python/paddle/fluid/reader.py create mode 100644 python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index 82562bf883..61120dcf12 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -54,6 +54,7 @@ class ReaderBase { private: friend class DecoratedReader; + friend class MultiDecoratedReader; // These methods can be only invoked inside DecoratedReader to record the // decorating chain. void InsertDecoratedReader( @@ -62,15 +63,20 @@ class ReaderBase { std::vector> decorated_readers_; }; -class DecoratedReader : public ReaderBase, +class DecoratedReaderBase : public ReaderBase { + public: + virtual void RegisterDecorateChain() = 0; +}; + +class DecoratedReader : public DecoratedReaderBase, public std::enable_shared_from_this { public: explicit DecoratedReader(const std::shared_ptr& reader) - : ReaderBase(), reader_(reader) { + : DecoratedReaderBase(), reader_(reader) { PADDLE_ENFORCE_NOT_NULL(reader_); } - void RegisterDecorateChain() { + void RegisterDecorateChain() final { reader_->InsertDecoratedReader(shared_from_this()); } @@ -84,6 +90,41 @@ class DecoratedReader : public ReaderBase, std::shared_ptr reader_; }; +class MultiDecoratedReader + : public DecoratedReaderBase, + public std::enable_shared_from_this { + public: + explicit MultiDecoratedReader( + const std::vector>& readers) + : readers_(readers) { + PADDLE_ENFORCE(!readers_.empty()); + for (auto& r : readers_) { + PADDLE_ENFORCE_NOT_NULL(r); + } + } + + void RegisterDecorateChain() final { + for (auto& r : readers_) { + r->InsertDecoratedReader(shared_from_this()); + } + } + + protected: + void ShutdownImpl() override { + for (auto& r : readers_) { + r->Shutdown(); + } + } + + void StartImpl() override { + for (auto& r : readers_) { + r->Start(); + } + } + + std::vector> readers_; +}; + // FileReader is just a conceptual class. class FileReader : public ReaderBase {}; @@ -132,8 +173,10 @@ class ReaderHolder { }; template -inline std::shared_ptr MakeDecoratedReader(ARGS&&... args) { - std::shared_ptr reader(new T(std::forward(args)...)); +inline std::shared_ptr MakeDecoratedReader( + ARGS&&... args) { + std::shared_ptr reader( + new T(std::forward(args)...)); reader->RegisterDecorateChain(); return reader; } diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt index 7c284312df..2701e10b30 100644 --- a/paddle/fluid/operators/reader/CMakeLists.txt +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -17,7 +17,10 @@ function(reader_library TARGET_NAME) PARENT_SCOPE) endfunction() +cc_library(py_reader SRCS py_reader.cc DEPS reader) +cc_library(compose_reader SRCS compose_reader.cc DEPS reader) cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool) + reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader) reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc) reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc) @@ -26,7 +29,7 @@ reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_o reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS buffered_reader) reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc) reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc) -reader_library(create_py_reader_op SRCS create_py_reader_op.cc) +reader_library(create_py_reader_op SRCS create_py_reader_op.cc DEPS py_reader) if (NOT WIN32 AND NOT ON_INFER) cc_library(ctr_reader SRCS ctr_reader.cc DEPS gzstream reader zlib) @@ -38,7 +41,7 @@ cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc) # Export local libraries to parent # set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE) -op_library(read_op) +op_library(read_op DEPS py_reader compose_reader buffered_reader) foreach(src ${LOCAL_READER_LIBS}) set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs") diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 51b980acb5..b76f482c57 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -34,7 +34,7 @@ class BlockingQueue { explicit BlockingQueue(size_t capacity, bool speed_test_mode = false) : capacity_(capacity), speed_test_mode_(speed_test_mode), closed_(false) { PADDLE_ENFORCE_GT( - capacity_, 0, + capacity_, static_cast(0), "The capacity of a reader::BlockingQueue must be greater than 0."); } @@ -114,6 +114,11 @@ class BlockingQueue { return queue_.size(); } + void Clear() { + std::lock_guard lock(mutex_); + queue_.clear(); + } + private: size_t capacity_; bool speed_test_mode_; diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index defc29b91f..b8c98ff5e7 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -28,8 +28,10 @@ BufferedReader::~BufferedReader() { #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { platform::SetDeviceId(boost::get(place_).device); - PADDLE_ENFORCE(cudaStreamDestroy(stream)); - for (auto &event : events) PADDLE_ENFORCE(cudaEventDestroy(event)); + PADDLE_ENFORCE(cudaStreamDestroy(stream_)); + for (auto &event : events_) { + PADDLE_ENFORCE(cudaEventDestroy(event)); + } } #endif } @@ -44,14 +46,15 @@ BufferedReader::BufferedReader( #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { platform::SetDeviceId(boost::get(place_).device); - compute_stream = + compute_stream_ = ((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance() .Get(place_))) ->stream(); - events.resize(buffer_size); - for (auto &event : events) + events_.resize(buffer_size); + for (auto &event : events_) { PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + } + PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); } #endif cpu_buffer_.resize(buffer_size); @@ -70,7 +73,7 @@ void BufferedReader::ReadAsync(size_t i) { #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { platform::SetDeviceId(boost::get(place_).device); - PADDLE_ENFORCE(cudaEventRecord(events[i], compute_stream)); + PADDLE_ENFORCE(cudaEventRecord(events_[i], compute_stream_)); } #endif position_.emplace(thread_pool_.enqueue([this, i]() -> size_t { @@ -86,7 +89,7 @@ void BufferedReader::ReadAsync(size_t i) { // TensorCopySync would block other stream if (platform::is_gpu_place(place_)) { platform::SetDeviceId(boost::get(place_).device); - PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0)); + PADDLE_ENFORCE(cudaStreamWaitEvent(stream_, events_[i], 0)); TensorVec &gpu = gpu_buffer_[i]; gpu.resize(cpu.size()); for (size_t i = 0; i < cpu.size(); ++i) { @@ -97,23 +100,24 @@ void BufferedReader::ReadAsync(size_t i) { auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type()); auto size = cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); - if (platform::is_cuda_pinned_place(cpu_place)) + if (platform::is_cuda_pinned_place(cpu_place)) { memory::Copy(boost::get(place_), gpu_ptr, boost::get(cpu_place), - cpu_ptr, size, stream); - else if ((platform::is_gpu_place(cpu_place))) + cpu_ptr, size, stream_); + } else if ((platform::is_gpu_place(cpu_place))) { memory::Copy(boost::get(place_), gpu_ptr, boost::get(cpu_place), cpu_ptr, - size, stream); - else + size, stream_); + } else { // if cpu place is not pinned, async copy is slower than sync copy, // so we use sync copy instead. memory::Copy(boost::get(place_), gpu_ptr, boost::get(cpu_place), cpu_ptr, size, 0); + } gpu[i].set_lod(cpu[i].lod()); } - PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); } #endif return i; diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 87680da01a..6b21de0949 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -63,9 +63,9 @@ class BufferedReader : public framework::DecoratedReader { std::vector gpu_buffer_; size_t prev_pos_{-1UL}; #ifdef PADDLE_WITH_CUDA - cudaStream_t stream; - cudaStream_t compute_stream; - std::vector events; + cudaStream_t stream_; + cudaStream_t compute_stream_; + std::vector events_; #endif }; diff --git a/paddle/fluid/operators/reader/compose_reader.cc b/paddle/fluid/operators/reader/compose_reader.cc new file mode 100644 index 0000000000..4b88b9331c --- /dev/null +++ b/paddle/fluid/operators/reader/compose_reader.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reader/compose_reader.h" + +namespace paddle { +namespace operators { +namespace reader { + +ComposeReader::ComposeReader( + const std::vector> &readers) + : framework::MultiDecoratedReader(readers) {} + +void ComposeReader::ReadNext(std::vector *out) { + out->clear(); + std::vector each_ret; + for (auto &r : readers_) { + r->ReadNext(&each_ret); + out->reserve(out->size() + each_ret.size()); + for (auto &data : each_ret) { + out->emplace_back(std::move(data)); + } + } +} + +} // namespace reader +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/compose_reader.h b/paddle/fluid/operators/reader/compose_reader.h new file mode 100644 index 0000000000..c9e2a2d72f --- /dev/null +++ b/paddle/fluid/operators/reader/compose_reader.h @@ -0,0 +1,34 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/reader.h" + +namespace paddle { +namespace operators { +namespace reader { + +class ComposeReader : public framework::MultiDecoratedReader { + public: + explicit ComposeReader( + const std::vector> &readers); + + void ReadNext(std::vector *out) override; +}; + +} // namespace reader +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index 901a92ab5b..4a6581bbbd 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -12,37 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/operators/reader/py_reader.h" #include "paddle/fluid/operators/reader/reader_op_registry.h" namespace paddle { namespace operators { namespace reader { -class PyReader : public framework::FileReader { - public: - explicit PyReader(const std::shared_ptr& queue) - : framework::FileReader() { - PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); - queue_ = queue; - } - - void ReadNext(std::vector* out) override { - bool success; - *out = queue_->Pop(&success); - if (!success) out->clear(); - } - - ~PyReader() { queue_->Close(); } - - void Shutdown() override { queue_->Close(); } - - void Start() override { queue_->ReOpen(); } - - private: - std::shared_ptr queue_; -}; - class CreatePyReaderOp : public framework::OperatorBase { public: using framework::OperatorBase::OperatorBase; diff --git a/paddle/fluid/operators/reader/py_reader.cc b/paddle/fluid/operators/reader/py_reader.cc new file mode 100644 index 0000000000..dc84faa974 --- /dev/null +++ b/paddle/fluid/operators/reader/py_reader.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reader/py_reader.h" + +namespace paddle { +namespace operators { +namespace reader { + +PyReader::PyReader(const std::shared_ptr& queue) + : framework::FileReader() { + PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); + queue_ = queue; +} + +void PyReader::ReadNext(std::vector* out) { + bool success; + *out = queue_->Pop(&success); + if (!success) out->clear(); +} + +PyReader::~PyReader() { queue_->Close(); } + +void PyReader::Shutdown() { queue_->Close(); } + +void PyReader::Start() { queue_->ReOpen(); } + +MultiQueuePyReader::MultiQueuePyReader( + const std::vector>& queues) + : queues_(queues) { + PADDLE_ENFORCE(!queues_.empty()); + for (auto& q : queues_) { + PADDLE_ENFORCE_NOT_NULL(q); + } +} + +void MultiQueuePyReader::ReadNext(std::vector* out) { + auto idx = read_out_idx_.fetch_add(1) % queues_.size(); + for (size_t i = 0; i < queues_.size(); ++i) { + *out = queues_[idx]->Pop(); + if (!out->empty()) return; + idx = (idx + 1) % queues_.size(); + } +} + +MultiQueuePyReader::~MultiQueuePyReader() { + for (auto& q : queues_) { + q->Close(); + } +} + +void MultiQueuePyReader::Shutdown() { + for (auto& q : queues_) { + q->Close(); + } + read_out_idx_.store(0, std::memory_order::memory_order_seq_cst); +} + +void MultiQueuePyReader::Start() { + for (auto& q : queues_) { + q->ReOpen(); + } +} + +} // namespace reader +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/py_reader.h b/paddle/fluid/operators/reader/py_reader.h new file mode 100644 index 0000000000..146a2351e5 --- /dev/null +++ b/paddle/fluid/operators/reader/py_reader.h @@ -0,0 +1,62 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" + +namespace paddle { +namespace operators { +namespace reader { + +class PyReader : public framework::FileReader { + public: + explicit PyReader(const std::shared_ptr& queue); + + void ReadNext(std::vector* out) override; + + ~PyReader(); + + void Shutdown() override; + + void Start() override; + + private: + std::shared_ptr queue_; +}; + +class MultiQueuePyReader : public framework::FileReader { + public: + explicit MultiQueuePyReader( + const std::vector>& queues); + + void ReadNext(std::vector* out) override; + + ~MultiQueuePyReader(); + + void Shutdown() override; + + void Start() override; + + private: + std::vector> queues_; + std::atomic read_out_idx_{0}; +}; + +} // namespace reader +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 4ac5b83c56..84da8491d3 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -5,7 +5,7 @@ set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune if(WITH_PYTHON) list(APPEND PYBIND_DEPS py_func_op) endif() -set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc ir.cc inference_api.cc) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc reader_py.cc async_executor_py.cc imperative.cc ir.cc inference_api.cc) if(WITH_PYTHON) if(WITH_AMD_GPU) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index a4a01ad647..f2000cc45e 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -54,6 +54,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/ir.h" #include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/pybind.h" // NOLINT +#include "paddle/fluid/pybind/reader_py.h" #include "paddle/fluid/pybind/recordio.h" #include "paddle/fluid/pybind/tensor_py.h" @@ -106,6 +107,16 @@ bool IsCompiledWithDIST() { #endif } +template +static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) { + return paddle::platform::Place(p1) == paddle::platform::Place(p2); +} + +template +static inline int PlaceIndex(const PlaceType &p) { + return static_cast(paddle::platform::Place(p).which()); +} + PYBIND11_MODULE(core, m) { // Not used, just make sure cpu_info.cc is linked. paddle::platform::CpuTotalPhysicalMemory(); @@ -452,6 +463,7 @@ PYBIND11_MODULE(core, m) { All parameter, weight, gradient are variables in Paddle. )DOC") + .def(py::init<>()) .def("is_int", [](const Variable &var) { return var.IsType(); }) .def("set_int", [](Variable &var, int val) -> void { *var.GetMutable() = val; }) @@ -493,9 +505,7 @@ All parameter, weight, gradient are variables in Paddle. }, py::return_value_policy::reference); - py::class_(m, "Reader", "") - .def("start", &framework::ReaderHolder::Start) - .def("reset", &framework::ReaderHolder::ResetAll); + BindReader(&m); using LoDTensorBlockingQueue = ::paddle::operators::reader::LoDTensorBlockingQueue; @@ -657,29 +667,65 @@ All parameter, weight, gradient are variables in Paddle. PADDLE_THROW("Cannot use CUDAPlace in CPU only version"); #endif }) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("gpu_device_id", + [](platform::CUDAPlace &self) { return self.device; }) .def("__str__", string::to_string); py::class_(m, "CPUPlace") .def(py::init<>()) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) .def("__str__", string::to_string); py::class_(m, "CUDAPinnedPlace") .def("__init__", - [](platform::CUDAPinnedPlace &) { + [](platform::CUDAPinnedPlace &self) { #ifndef PADDLE_WITH_CUDA PADDLE_THROW("Cannot use CUDAPinnedPlace in CPU only version"); #endif + new (&self) platform::CUDAPinnedPlace(); }) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("_equals", + &IsSamePlace) .def("__str__", string::to_string); py::class_(m, "Place") .def(py::init<>()) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) .def("is_gpu_place", [](platform::Place &self) { return platform::is_gpu_place(self); }) + .def("is_cpu_place", + [](platform::Place &self) { return platform::is_cpu_place(self); }) + .def("is_cuda_pinned_place", + [](platform::Place &self) { + return platform::is_cuda_pinned_place(self); + }) .def("gpu_device_id", [](platform::Place &self) { return boost::get(self).device; }) + .def("set_place", [](platform::Place &self, + const platform::Place &other) { self = other; }) .def("set_place", [](platform::Place &self, const platform::CPUPlace &cpu_place) { self = cpu_place; diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc new file mode 100644 index 0000000000..a09d18656f --- /dev/null +++ b/paddle/fluid/pybind/reader_py.cc @@ -0,0 +1,132 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pybind/reader_py.h" +#include +#include +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/operators/reader/buffered_reader.h" +#include "paddle/fluid/operators/reader/compose_reader.h" +#include "paddle/fluid/operators/reader/py_reader.h" +#include "paddle/fluid/platform/place.h" +#include "pybind11/stl.h" + +namespace paddle { +namespace pybind { + +class FeedReader { + using ResultDictList = + std::vector>; + + public: + FeedReader(std::unique_ptr reader, + const std::vector &names, size_t num_places, + bool drop_last = true) + : reader_(std::move(reader)), + names_(names), + num_places_(num_places), + drop_last_(drop_last) {} + + ResultDictList ReadNext() { + std::vector tensors; + reader_->ReadNext(&tensors); + if (tensors.empty()) return ResultDictList(); + + PADDLE_ENFORCE(tensors.size() % names_.size() == 0, + "Tensor size: %d, names size: %d", tensors.size(), + names_.size()); + + size_t read_place_num = tensors.size() / names_.size(); + + if (drop_last_ && read_place_num != num_places_) { + return ResultDictList(); + } + + ResultDictList ret(read_place_num); + for (size_t i = 0; i < tensors.size(); ++i) { + ret[i / names_.size()].emplace(names_[i % names_.size()], + std::move(tensors[i])); + } + return ret; + } + + void Start() { reader_->Start(); } + + void Reset() { reader_->ResetAll(); } + + private: + std::unique_ptr reader_; + std::vector names_; + size_t num_places_; + bool drop_last_; +}; + +static std::unique_ptr CreatePyReader( + const std::vector< + std::shared_ptr> &queues, + const std::vector &dst_places) { + std::shared_ptr reader; + if (queues.size() == 1) { + reader.reset(new operators::reader::PyReader(queues[0])); + } else { + reader.reset(new operators::reader::MultiQueuePyReader(queues)); + } + std::vector> buffered_reader; + buffered_reader.reserve(dst_places.size()); + for (auto &p : dst_places) { + buffered_reader.emplace_back( + framework::MakeDecoratedReader( + reader, p, 2)); + } + reader = framework::MakeDecoratedReader( + buffered_reader); + + auto *holder = new framework::ReaderHolder(); + holder->Reset(reader); + return std::unique_ptr(holder); +} + +namespace py = pybind11; + +void BindReader(py::module *module) { + auto &m = *module; + + namespace reader = ::paddle::operators::reader; + + py::class_(m, "Reader", "") + .def("start", &framework::ReaderHolder::Start) + .def("reset", &framework::ReaderHolder::ResetAll); + + py::class_(m, "FeedReader", "") + .def("read_next", &FeedReader::ReadNext, + py::call_guard()) + .def("start", &FeedReader::Start, + py::call_guard()) + .def("reset", &FeedReader::Reset, + py::call_guard()); + + m.def("create_py_reader", + [](const std::vector< + std::shared_ptr> + queues, + const std::vector &names, + const std::vector &dst_places, bool drop_last) { + return new FeedReader(CreatePyReader(queues, dst_places), names, + dst_places.size(), drop_last); + }, + py::return_value_policy::take_ownership); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/reader_py.h b/paddle/fluid/pybind/reader_py.h new file mode 100644 index 0000000000..472ff65368 --- /dev/null +++ b/paddle/fluid/pybind/reader_py.h @@ -0,0 +1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/pybind11.h" + +namespace paddle { +namespace pybind { + +void BindReader(pybind11::module *module); + +} // namespace pybind +} // namespace paddle diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index ef02429428..523894af7b 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -17,6 +17,7 @@ import os import six import sys from .. import compat as cpt +from .framework import cuda_places, cpu_places from . import core @@ -78,7 +79,8 @@ class CompiledProgram(object): loss_name=None, build_strategy=None, exec_strategy=None, - share_vars_from=None): + share_vars_from=None, + places=None): """Configs the program to run in data parallel way. Args: @@ -97,6 +99,12 @@ class CompiledProgram(object): will share variables from `share_vars_from`. `share_vars_from` must be run by the executor before this CompiledProgram so that vars are ready. + places(list(CUDAPlace)|list(CPUPlace)|None): If provide, only compile + program in the given places. Otherwise, the places used when compiled + is determined by the Executor, and the places used are controlled + by environment variables: FLAGS_selected_gpus or CUDA_VISIBLE_DEVICES + if using GPU; or CPU_NUM if using CPU. + Returns: self """ @@ -110,6 +118,12 @@ class CompiledProgram(object): self._exec_strategy = ExecutionStrategy() if self._build_strategy is None: self._build_strategy = BuildStrategy() + if places is not None: + if not isinstance(places, (list, tuple)): + places = [places] + self._places = [_place_obj(p) for p in places] + else: + self._places = None return self def with_inference_optimize(self, config): @@ -148,19 +162,16 @@ class CompiledProgram(object): self._local_scopes = [] self._exec_strategy.use_cuda = isinstance(self._place, core.CUDAPlace) - if self._exec_strategy.use_cuda: - gpus_env = os.getenv("FLAGS_selected_gpus") - if gpus_env: - gpus = [int(s) for s in gpus_env.split(",")] - else: - gpus = [ - i for i in six.moves.range(core.get_cuda_device_count()) - ] - self._places = [core.CUDAPlace(i) for i in gpus] + has_set_place = (self._places is not None) + if has_set_place: + desire_place = _place_obj(self._place) + for p in self._places: + assert p._type() == desire_place._type(), \ + "Place type not match. You may set the wrong type of places" else: - cpu_num = int( - os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)] + places = cuda_places( + ) if self._exec_strategy.use_cuda else cpu_places() + self._places = [_place_obj(p) for p in places] assert self._places, "no place for execution" if self._exec_strategy.num_threads == 0: @@ -169,9 +180,7 @@ class CompiledProgram(object): # performance. Worth tunning for other models in the future. self._exec_strategy.num_threads = len(self._places) * 4 else: - cpu_num = int( - os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - self._exec_strategy.num_threads = cpu_num * 2 + self._exec_strategy.num_threads = len(self._places) * 2 trainers_endpoints = self._program._trainers_endpoints @@ -217,7 +226,7 @@ class CompiledProgram(object): if self._compiled: if scope and self._scope != scope: raise ValueError("Cannot compile with different scope") - if place and self._place != place: + if place and not self._place._equals(place): raise ValueError("Cannot compile with different place") return self self._compiled = True diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 8815911eae..5454d12e2c 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -554,6 +554,10 @@ class Executor(object): if feed is None: feed = {} + elif isinstance(feed, (list, tuple)): + assert len(feed) == 1, "Not compiled with data parallel" + feed = feed[0] + if not isinstance(feed, dict): raise TypeError( "feed requires dict as its Parameter. But you passed in %s" % diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index ef304b1110..deb837d96c 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -26,6 +26,7 @@ import six import numpy as np import subprocess +import multiprocessing from .. import compat as cpt from .proto import framework_pb2 @@ -63,6 +64,9 @@ __all__ = [ 'default_main_program', 'program_guard', 'name_scope', + 'cuda_places', + 'cpu_places', + 'cuda_pinned_places', ] EMPTY_VAR_NAME = core.kEmptyVarName() @@ -87,6 +91,38 @@ def _current_expected_place(): return _imperative_current_expected_place_ +def _cpu_num(): + return int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + + +def cuda_places(device_ids=None): + assert core.is_compiled_with_cuda(), \ + "Not compiled with CUDA" + if device_ids is None: + gpus_env = os.getenv("FLAGS_selected_gpus") + if gpus_env: + device_ids = [int(s) for s in gpus_env.split(",")] + else: + device_ids = six.moves.range(core.get_cuda_device_count()) + elif not isinstance(device_ids, (list, tuple)): + device_ids = [device_ids] + return [core.CUDAPlace(dev_id) for dev_id in device_ids] + + +def cpu_places(device_count=None): + if device_count is None: + device_count = _cpu_num() + return [core.CPUPlace()] * device_count + + +def cuda_pinned_places(device_count=None): + assert core.is_compiled_with_cuda(), \ + "Not compiled with CUDA" + if device_count is None: + device_count = _cpu_num() + return [core.cuda_pinned_places()] * device_count + + class NameScope(object): def __init__(self, name="", parent=None): self._children = dict() diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index a2abbf36c0..1e3f4f476f 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -26,12 +26,14 @@ from paddle.fluid import layers from paddle.fluid.executor import Executor from paddle.fluid.evaluator import Evaluator from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable, program_guard +from . import reader +from .reader import * from . import core __all__ = [ 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params', 'load_persistables', 'save_inference_model', 'load_inference_model' -] +] + reader.__all__ def is_parameter(var): diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py new file mode 100644 index 0000000000..b765430622 --- /dev/null +++ b/python/paddle/fluid/reader.py @@ -0,0 +1,141 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import core +import six +import threading +from .framework import Program, Variable, program_guard +from .data_feeder import DataFeeder + +__all__ = ['PyReader'] + + +def _convert_places(places): + if not isinstance(places, (list, tuple)): + places = [places] + + ret = [] + for p in places: + if not isinstance(p, core.Place): + tmp = core.Place() + tmp.set_place(p) + p = tmp + + ret.append(p) + return ret + + +class PyReader(object): + def __init__(self, feed_list, places, capacity, multi_queue=True): + self._tensor_reader = None + self._thread = None + + # TODO(zjl): to support drop_last = False + self._drop_last = True + + self._feed_list = feed_list + self._var_names = [v.name for v in feed_list] + + self._queues = [] + + self._places = _convert_places(places) + + self._queue_capacity = capacity + + queue_num = len(self._places) if multi_queue else 1 + for _ in six.moves.range(queue_num): + self._queues.append( + core.init_lod_tensor_blocking_queue(core.Variable(), + self._queue_capacity)) + + self._reader = core.create_py_reader(self._queues, self._var_names, + self._places, self._drop_last) + self._exited = True + + def __call__(self): + assert self._tensor_reader is not None, \ + "Data source of PyReader has not set yet" + + class Iterator(object): + def __init__(self, reader): + self._reader = reader + + def __iter__(self): + return self + + def next(self): + ret = self._reader._reader.read_next() + if len(ret): + return ret + else: + self._reader._restart_reader() + self._reader._reader.reset() + raise StopIteration + + return Iterator(self) + + def _restart_reader(self): + if not self._exited: + for q in self._queues: + q.close() + + self._thread.join() + + def __thread_main__(): + queue_num = len(self._queues) + idx = 0 + for tensors in self._tensor_reader(): + array = core.LoDTensorArray() + for item in tensors: + if not isinstance(item, core.LoDTensor): + tmp = core.LoDTensor() + tmp.set(item, core.CPUPlace()) + item = tmp + + array.append(item) + + if not self._queues[idx].push(array): + break + + idx = (idx + 1) % queue_num + + for q in self._queues: + q.close() + + self._exited = True + + self._thread = threading.Thread(target=__thread_main__) + self._thread.daemon = True + self._exited = False + self._thread.start() + + def set_numpy_reader(self, reader): + assert self._tensor_reader is None, \ + "Cannot reset the data source of PyReader" + with program_guard(Program(), Program()): + feeder = DataFeeder( + feed_list=self._feed_list, place=core.CPUPlace()) + paddle_reader = feeder.decorate_reader(reader, multi_devices=False) + + def __tensor_reader_impl__(): + for slots in paddle_reader(): + yield [slots[var.name] for var in self._feed_list] + + self.set_tensor_reader(__tensor_reader_impl__) + + def set_tensor_reader(self, reader): + assert self._tensor_reader is None, \ + "Cannot reset the data source of PyReader" + self._tensor_reader = reader + self._restart_reader() diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py new file mode 100644 index 0000000000..807cbaf39d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py @@ -0,0 +1,157 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.fluid as fluid +import numpy as np +import time +import six +import unittest + +EPOCH_NUM = 60 +BATCH_SIZE = 32 +CLASS_NUM = 10 + + +def random_reader(): + for i in range(BATCH_SIZE * 40): + image = np.random.random([784]) + label = np.random.random_integers(low=0, high=CLASS_NUM - 1) + yield image, label + + +def simple_fc_net(places, use_legacy_py_reader): + startup_prog = fluid.Program() + main_prog = fluid.Program() + startup_prog.random_seed = 1 + main_prog.random_seed = 1 + reader = paddle.batch(random_reader, batch_size=BATCH_SIZE) + + with fluid.unique_name.guard(): + with fluid.program_guard(main_prog, startup_prog): + if not use_legacy_py_reader: + image = fluid.layers.data( + name='image', shape=[784], dtype='float32') + label = fluid.layers.data( + name='label', shape=[1], dtype='int64') + py_reader = fluid.io.PyReader( + feed_list=[image, label], + places=places, + capacity=4, + multi_queue=False) + py_reader.set_numpy_reader(reader) + else: + py_reader = fluid.layers.py_reader( + capacity=4, + shapes=[(-1, 784), (-1, 1)], + dtypes=['float32', 'int64']) + image, label = fluid.layers.read_file(py_reader) + py_reader.decorate_paddle_reader(reader) + + hidden = image + for hidden_size in [10, 20, 30]: + hidden = fluid.layers.fc( + hidden, + size=hidden_size, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + + predict_label = fluid.layers.fc(hidden, + size=CLASS_NUM, + act='softmax') + loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=predict_label, label=label)) + + optimizer = fluid.optimizer.Adam() + optimizer.minimize(loss) + return startup_prog, main_prog, py_reader, loss + + +class TestBase(unittest.TestCase): + def run_main(self, use_legacy_py_reader, with_data_parallel, places): + with fluid.scope_guard(fluid.Scope()): + startup_prog, main_prog, py_reader, loss = simple_fc_net( + places, use_legacy_py_reader) + exe = fluid.Executor(place=places[0]) + exe.run(startup_prog) + + prog = fluid.CompiledProgram(main_prog) + if with_data_parallel: + prog = prog.with_data_parallel( + loss_name=loss.name, places=places) + + step = 0 + start_t = time.time() + if use_legacy_py_reader: + for _ in six.moves.range(EPOCH_NUM): + py_reader.start() + while True: + try: + L, = exe.run(program=prog, fetch_list=[loss]) + step += 1 + except fluid.core.EOFException: + py_reader.reset() + break + else: + for _ in six.moves.range(EPOCH_NUM): + for d in py_reader(): + ''' + assert len(d) == len(places) + for i, item in enumerate(d): + image = item['image'] + label = item['label'] + assert image.shape() == [BATCH_SIZE, 784] + assert label.shape() == [BATCH_SIZE, 1] + assert image._place()._equals(places[i]) + assert label._place()._equals(places[i]) + ''' + L, = exe.run(program=prog, feed=d, fetch_list=[loss]) + step += 1 + end_t = time.time() + return {"time": end_t - start_t, "step": step} + + def prepare_places(self, with_data_parallel): + places = [[fluid.CPUPlace()], ] + if with_data_parallel: + places.append([fluid.CPUPlace()] * 2) + + if fluid.core.is_compiled_with_cuda(): + tmp = fluid.cuda_places() + assert len(tmp) > 0, "no gpu detected" + if with_data_parallel: + places.append(tmp) + places.append([tmp[0]]) + return places + + def test_main(self): + for with_data_parallel in [True, False]: + for p in self.prepare_places(with_data_parallel): + t = [] + for use_legacy_py_reader in [False, True]: + ret = self.run_main( + use_legacy_py_reader=use_legacy_py_reader, + with_data_parallel=with_data_parallel, + places=p) + ret['legacy'] = use_legacy_py_reader + ret['data_parallel'] = with_data_parallel + ret['places'] = p + t.append(ret) + + print(t) + + +if __name__ == '__main__': + unittest.main() From 27e1a9ee5ac9c2df61f2c1fa7cd3ca96030a3bd1 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 20 Feb 2019 12:58:07 +0000 Subject: [PATCH 003/230] fix hang bug test=develop --- python/paddle/fluid/reader.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index b765430622..8352587f23 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -69,29 +69,32 @@ class PyReader(object): class Iterator(object): def __init__(self, reader): - self._reader = reader + self._reader = reader._reader + self._reset = reader._reset def __iter__(self): return self def next(self): - ret = self._reader._reader.read_next() + ret = self._reader.read_next() if len(ret): return ret else: - self._reader._restart_reader() - self._reader._reader.reset() + self._reset() raise StopIteration return Iterator(self) - def _restart_reader(self): + def _reset(self): if not self._exited: for q in self._queues: q.close() + if self._thread: self._thread.join() + self._reader.reset() + def __thread_main__(): queue_num = len(self._queues) idx = 0 @@ -138,4 +141,4 @@ class PyReader(object): assert self._tensor_reader is None, \ "Cannot reset the data source of PyReader" self._tensor_reader = reader - self._restart_reader() + self._reset() From b17541a9c1fcf424dc5550f581b87a2bb26ad22c Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 25 Feb 2019 08:31:11 +0000 Subject: [PATCH 004/230] fix hang bug --- paddle/fluid/pybind/pybind.cc | 33 +++- paddle/fluid/pybind/reader_py.cc | 154 +++++++++++------- python/paddle/fluid/layers/io.py | 18 +- python/paddle/fluid/reader.py | 36 ++-- .../unittests/test_decoupled_py_reader.py | 52 ++++-- 5 files changed, 181 insertions(+), 112 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c907cb48b8..1b53410d16 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -546,19 +546,39 @@ All parameter, weight, gradient are variables in Paddle. ::paddle::operators::reader::LoDTensorBlockingQueue; using LoDTensorBlockingQueueHolder = ::paddle::operators::reader::LoDTensorBlockingQueueHolder; + + using LockFreeLoDTensorBlockingQueue = + ::paddle::operators::reader::LockFreeLoDTensorBlockingQueue; + using LockFreeLoDTensorBlockingQueueHolder = + ::paddle::operators::reader::LockFreeLoDTensorBlockingQueueHolder; + py::class_>( m, "LoDTensorBlockingQueue", "") .def("push", [](LoDTensorBlockingQueue &self, - const std::vector &lod_tensor_vec) { + std::vector &lod_tensor_vec) { pybind11::gil_scoped_release release; - return self.Push(lod_tensor_vec); + return self.Push(std::move(lod_tensor_vec)); }) .def("size", &LoDTensorBlockingQueue::Size) .def("capacity", &LoDTensorBlockingQueue::Cap) .def("close", &LoDTensorBlockingQueue::Close) .def("is_closed", &LoDTensorBlockingQueue::IsClosed); + py::class_>( + m, "LockFreeLoDTensorBlockingQueue", "") + .def("push", + [](LockFreeLoDTensorBlockingQueue &self, + std::vector &lod_tensor_vec) { + pybind11::gil_scoped_release release; + return self.Push(std::move(lod_tensor_vec)); + }) + .def("size", &LockFreeLoDTensorBlockingQueue::Size) + .def("capacity", &LockFreeLoDTensorBlockingQueue::Cap) + .def("close", &LockFreeLoDTensorBlockingQueue::Close) + .def("is_closed", &LockFreeLoDTensorBlockingQueue::IsClosed); + m.def("init_lod_tensor_blocking_queue", [](Variable &var, size_t capacity) -> std::shared_ptr { @@ -568,6 +588,15 @@ All parameter, weight, gradient are variables in Paddle. }, py::return_value_policy::copy); + m.def("init_lock_free_lod_tensor_blocking_queue", + [](Variable &var, + size_t capacity) -> std::shared_ptr { + auto *holder = var.GetMutable(); + holder->InitOnce(capacity); + return holder->GetQueue(); + }, + py::return_value_policy::copy); + py::class_(m, "_Scope", R"DOC( Scope is an association of a name to Variable. All variables belong to Scope. diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc index a09d18656f..22f67b38bb 100644 --- a/paddle/fluid/pybind/reader_py.cc +++ b/paddle/fluid/pybind/reader_py.cc @@ -25,77 +25,107 @@ namespace paddle { namespace pybind { -class FeedReader { +class MultiDeviceFeedReader { + public: using ResultDictList = std::vector>; - public: - FeedReader(std::unique_ptr reader, - const std::vector &names, size_t num_places, - bool drop_last = true) - : reader_(std::move(reader)), + MultiDeviceFeedReader( + const std::shared_ptr &queue, + const std::vector &names, + const std::vector &dst_places, bool use_double_buffer) + : queue_(queue), names_(names), - num_places_(num_places), - drop_last_(drop_last) {} - - ResultDictList ReadNext() { - std::vector tensors; - reader_->ReadNext(&tensors); - if (tensors.empty()) return ResultDictList(); + pool_(new ::ThreadPool(dst_places.size())) { + std::shared_ptr reader( + new operators::reader::PyReader(queue)); + + readers_.reserve(dst_places.size()); + for (auto &p : dst_places) { + auto *holder = new framework::ReaderHolder(); + if (use_double_buffer) { + holder->Reset( + framework::MakeDecoratedReader( + reader, p, 2)); + } else { + if (platform::is_gpu_place(p)) { + PADDLE_THROW( + "Place cannot be CUDAPlace when use_double_buffer is False"); + } + holder->Reset(reader); + } + readers_.emplace_back(holder); + } - PADDLE_ENFORCE(tensors.size() % names_.size() == 0, - "Tensor size: %d, names size: %d", tensors.size(), - names_.size()); + futures_.resize(dst_places.size()); + ret_.resize(dst_places.size()); + ReadAsync(); + } - size_t read_place_num = tensors.size() / names_.size(); + ResultDictList ReadNext() { + bool success = WaitFutures(); - if (drop_last_ && read_place_num != num_places_) { - return ResultDictList(); + if (!success) { + return {}; } - ResultDictList ret(read_place_num); - for (size_t i = 0; i < tensors.size(); ++i) { - ret[i / names_.size()].emplace(names_[i % names_.size()], - std::move(tensors[i])); + ResultDictList result(ret_.size()); + for (size_t i = 0; i < ret_.size(); ++i) { + for (size_t j = 0; j < names_.size(); ++j) { + result[i].emplace(names_[j], std::move(ret_[i][j])); + } } - return ret; + ReadAsync(); + return result; } - void Start() { reader_->Start(); } + void Reset() { + Shutdown(); + Start(); - void Reset() { reader_->ResetAll(); } + ReadAsync(); + } + + ~MultiDeviceFeedReader() { + queue_->Close(); + pool_.reset(); + } private: - std::unique_ptr reader_; - std::vector names_; - size_t num_places_; - bool drop_last_; -}; + bool WaitFutures() { + bool success = true; + for (auto &f : futures_) { + success &= f.get(); + } + return success; + } -static std::unique_ptr CreatePyReader( - const std::vector< - std::shared_ptr> &queues, - const std::vector &dst_places) { - std::shared_ptr reader; - if (queues.size() == 1) { - reader.reset(new operators::reader::PyReader(queues[0])); - } else { - reader.reset(new operators::reader::MultiQueuePyReader(queues)); + void Shutdown() { + for (auto &r : readers_) r->Shutdown(); } - std::vector> buffered_reader; - buffered_reader.reserve(dst_places.size()); - for (auto &p : dst_places) { - buffered_reader.emplace_back( - framework::MakeDecoratedReader( - reader, p, 2)); + + void Start() { + for (auto &r : readers_) r->Start(); } - reader = framework::MakeDecoratedReader( - buffered_reader); - auto *holder = new framework::ReaderHolder(); - holder->Reset(reader); - return std::unique_ptr(holder); -} + void ReadAsync() { + for (size_t i = 0; i < readers_.size(); ++i) { + futures_[i] = pool_->enqueue([this, i] { + readers_[i]->ReadNext(&ret_[i]); + return !ret_[i].empty(); + }); + } + } + + std::vector names_; + std::unique_ptr<::ThreadPool> pool_; + + std::shared_ptr queue_; + std::vector> readers_; + std::vector> futures_; + std::vector> ret_; + bool drop_last_; +}; namespace py = pybind11; @@ -108,22 +138,20 @@ void BindReader(py::module *module) { .def("start", &framework::ReaderHolder::Start) .def("reset", &framework::ReaderHolder::ResetAll); - py::class_(m, "FeedReader", "") - .def("read_next", &FeedReader::ReadNext, - py::call_guard()) - .def("start", &FeedReader::Start, + py::class_(m, "MultiDeviceFeedReader", "") + .def("read_next", &MultiDeviceFeedReader::ReadNext, py::call_guard()) - .def("reset", &FeedReader::Reset, + .def("reset", &MultiDeviceFeedReader::Reset, py::call_guard()); m.def("create_py_reader", - [](const std::vector< - std::shared_ptr> - queues, + [](const std::shared_ptr + &queue, const std::vector &names, - const std::vector &dst_places, bool drop_last) { - return new FeedReader(CreatePyReader(queues, dst_places), names, - dst_places.size(), drop_last); + const std::vector &dst_places, + bool use_double_buffer) { + return new MultiDeviceFeedReader(queues, names, dst_places, + use_double_buffer); }, py::return_value_policy::take_ownership); } diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index a9b391fd53..639be053b0 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -486,7 +486,8 @@ def _py_reader(capacity, lod_levels=None, name=None, use_double_buffer=True, - feed_list=None): + feed_list=None, + lock_free=False): if feed_list is not None: if not isinstance(feed_list, list): @@ -526,12 +527,17 @@ def _py_reader(capacity, double_buffer_name = "_".join([name, "double_buffer"]) var = global_scope().var(queue_name) - feed_queue = core.init_lod_tensor_blocking_queue(var, capacity) + if not lock_free: + feed_queue = core.init_lod_tensor_blocking_queue(var, capacity) + else: + feed_queue = core.init_lock_free_lod_tensor_blocking_queue(var, + capacity) startup_blk = default_startup_program().current_block() startup_var = startup_blk.create_var(name=reader_name) startup_blk.append_op( - type='create_py_reader', + type='create_py_reader' + if not lock_free else 'create_lock_free_py_reader', inputs={'blocking_queue': [queue_name]}, outputs={'Out': [startup_var]}, attrs={ @@ -638,7 +644,8 @@ def py_reader(capacity, dtypes, lod_levels=None, name=None, - use_double_buffer=True): + use_double_buffer=True, + lock_free=False): """ Create a Python reader for data feeding in Python @@ -763,7 +770,8 @@ def py_reader(capacity, dtypes=dtypes, lod_levels=lod_levels, name=name, - use_double_buffer=use_double_buffer) + use_double_buffer=use_double_buffer, + lock_free=lock_free) def create_py_reader_by_data(capacity, diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 8352587f23..7c95ea20e3 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -17,6 +17,7 @@ import six import threading from .framework import Program, Variable, program_guard from .data_feeder import DataFeeder +import paddle.reader.decorator as decorator __all__ = ['PyReader'] @@ -36,8 +37,8 @@ def _convert_places(places): return ret -class PyReader(object): - def __init__(self, feed_list, places, capacity, multi_queue=True): +class PyReader(Reader): + def __init__(self, feed_list, places, capacity): self._tensor_reader = None self._thread = None @@ -53,15 +54,11 @@ class PyReader(object): self._queue_capacity = capacity - queue_num = len(self._places) if multi_queue else 1 - for _ in six.moves.range(queue_num): - self._queues.append( - core.init_lod_tensor_blocking_queue(core.Variable(), - self._queue_capacity)) + self.queue = core.init_lod_tensor_blocking_queue(core.Variable(), + self._queue_capacity) - self._reader = core.create_py_reader(self._queues, self._var_names, + self._reader = core.create_py_reader(self._queue, self._var_names, self._places, self._drop_last) - self._exited = True def __call__(self): assert self._tensor_reader is not None, \ @@ -77,7 +74,7 @@ class PyReader(object): def next(self): ret = self._reader.read_next() - if len(ret): + if ret: return ret else: self._reset() @@ -86,18 +83,11 @@ class PyReader(object): return Iterator(self) def _reset(self): - if not self._exited: - for q in self._queues: - q.close() - if self._thread: + self._reader.reset() self._thread.join() - self._reader.reset() - def __thread_main__(): - queue_num = len(self._queues) - idx = 0 for tensors in self._tensor_reader(): array = core.LoDTensorArray() for item in tensors: @@ -108,19 +98,13 @@ class PyReader(object): array.append(item) - if not self._queues[idx].push(array): + if not self.queue.push(array): break - idx = (idx + 1) % queue_num - - for q in self._queues: - q.close() - - self._exited = True + self.queue.close() self._thread = threading.Thread(target=__thread_main__) self._thread.daemon = True - self._exited = False self._thread.start() def set_numpy_reader(self, reader): diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py index 807cbaf39d..dd64f10395 100644 --- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py +++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py @@ -31,7 +31,7 @@ def random_reader(): yield image, label -def simple_fc_net(places, use_legacy_py_reader): +def simple_fc_net(places, use_legacy_py_reader, lock_free=False): startup_prog = fluid.Program() main_prog = fluid.Program() startup_prog.random_seed = 1 @@ -55,7 +55,8 @@ def simple_fc_net(places, use_legacy_py_reader): py_reader = fluid.layers.py_reader( capacity=4, shapes=[(-1, 784), (-1, 1)], - dtypes=['float32', 'int64']) + dtypes=['float32', 'int64'], + lock_free=lock_free) image, label = fluid.layers.read_file(py_reader) py_reader.decorate_paddle_reader(reader) @@ -82,7 +83,8 @@ def simple_fc_net(places, use_legacy_py_reader): class TestBase(unittest.TestCase): def run_main(self, use_legacy_py_reader, with_data_parallel, places): - with fluid.scope_guard(fluid.Scope()): + scope = fluid.Scope() + with fluid.scope_guard(scope): startup_prog, main_prog, py_reader, loss = simple_fc_net( places, use_legacy_py_reader) exe = fluid.Executor(place=places[0]) @@ -94,21 +96,29 @@ class TestBase(unittest.TestCase): loss_name=loss.name, places=places) step = 0 + step_list = [] start_t = time.time() if use_legacy_py_reader: for _ in six.moves.range(EPOCH_NUM): + step = 0 py_reader.start() while True: try: - L, = exe.run(program=prog, fetch_list=[loss]) + L, = exe.run(program=prog, + fetch_list=[loss], + use_program_cache=True) + # print('runned', step, py_reader.queue.is_closed(), py_reader.queue.size()) step += 1 except fluid.core.EOFException: + # print('try to reset') py_reader.reset() + # print('reseted') break + step_list.append(step) else: for _ in six.moves.range(EPOCH_NUM): + step = 0 for d in py_reader(): - ''' assert len(d) == len(places) for i, item in enumerate(d): image = item['image'] @@ -117,18 +127,25 @@ class TestBase(unittest.TestCase): assert label.shape() == [BATCH_SIZE, 1] assert image._place()._equals(places[i]) assert label._place()._equals(places[i]) - ''' - L, = exe.run(program=prog, feed=d, fetch_list=[loss]) + L, = exe.run(program=prog, + feed=d, + fetch_list=[loss], + use_program_cache=True) step += 1 + step_list.append(step) end_t = time.time() - return {"time": end_t - start_t, "step": step} - - def prepare_places(self, with_data_parallel): - places = [[fluid.CPUPlace()], ] - if with_data_parallel: - places.append([fluid.CPUPlace()] * 2) + ret = {"time": end_t - start_t, "step": step_list} + scope._remove_from_pool() + return ret + + def prepare_places(self, with_data_parallel, with_cpu=False, with_gpu=True): + places = [] + if with_cpu: + places.append([fluid.CPUPlace()]) + if with_data_parallel: + places.append([fluid.CPUPlace()] * 2) - if fluid.core.is_compiled_with_cuda(): + if with_gpu and fluid.core.is_compiled_with_cuda(): tmp = fluid.cuda_places() assert len(tmp) > 0, "no gpu detected" if with_data_parallel: @@ -140,7 +157,10 @@ class TestBase(unittest.TestCase): for with_data_parallel in [True, False]: for p in self.prepare_places(with_data_parallel): t = [] - for use_legacy_py_reader in [False, True]: + for use_legacy_py_reader in [ + False + ]: #[True, False]: #[False, True]: + print(p, use_legacy_py_reader) ret = self.run_main( use_legacy_py_reader=use_legacy_py_reader, with_data_parallel=with_data_parallel, @@ -148,7 +168,7 @@ class TestBase(unittest.TestCase): ret['legacy'] = use_legacy_py_reader ret['data_parallel'] = with_data_parallel ret['places'] = p - t.append(ret) + t.append([ret['step'], ]) #, ret['places']]) print(t) From c545f1ed8f0aa76053e78c165c441760871c8d03 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 25 Feb 2019 10:38:32 +0000 Subject: [PATCH 005/230] unify API test=develop --- paddle/fluid/API.spec | 10 +- paddle/fluid/framework/reader.h | 53 +------ paddle/fluid/operators/reader/CMakeLists.txt | 3 +- .../fluid/operators/reader/blocking_queue.h | 5 - .../fluid/operators/reader/compose_reader.cc | 39 ----- .../fluid/operators/reader/compose_reader.h | 34 ----- paddle/fluid/operators/reader/py_reader.cc | 37 ----- paddle/fluid/operators/reader/py_reader.h | 18 --- paddle/fluid/pybind/pybind.cc | 30 ---- paddle/fluid/pybind/reader_py.cc | 8 +- python/paddle/fluid/layers/io.py | 15 +- python/paddle/fluid/reader.py | 140 ++++++++++++++---- .../unittests/test_decoupled_py_reader.py | 76 ++++------ 13 files changed, 164 insertions(+), 304 deletions(-) delete mode 100644 paddle/fluid/operators/reader/compose_reader.cc delete mode 100644 paddle/fluid/operators/reader/compose_reader.h diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 2544b7308c..946e264f05 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -10,6 +10,9 @@ paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=Non paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.program_guard ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.name_scope ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.cuda_places ArgSpec(args=['device_ids'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.cpu_places ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.cuda_pinned_places ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)) @@ -44,7 +47,7 @@ paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'fi paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None) -paddle.fluid.CompiledProgram.with_data_parallel ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.CompiledProgram.with_data_parallel ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)) paddle.fluid.CompiledProgram.with_inference_optimize ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None) paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None @@ -58,6 +61,11 @@ paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)) paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.io.PyReader.__init__ ArgSpec(args=['self', 'feed_list', 'capacity', 'use_double_buffer', 'iterable'], varargs=None, keywords=None, defaults=(True, True)) +paddle.fluid.io.PyReader.decorate_paddle_reader ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.io.PyReader.decorate_tensor_provider ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.io.PyReader.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.io.PyReader.start ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)) paddle.fluid.initializer.UniformInitializer.__init__ ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)) paddle.fluid.initializer.NormalInitializer.__init__ ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)) diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index 61120dcf12..82562bf883 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -54,7 +54,6 @@ class ReaderBase { private: friend class DecoratedReader; - friend class MultiDecoratedReader; // These methods can be only invoked inside DecoratedReader to record the // decorating chain. void InsertDecoratedReader( @@ -63,20 +62,15 @@ class ReaderBase { std::vector> decorated_readers_; }; -class DecoratedReaderBase : public ReaderBase { - public: - virtual void RegisterDecorateChain() = 0; -}; - -class DecoratedReader : public DecoratedReaderBase, +class DecoratedReader : public ReaderBase, public std::enable_shared_from_this { public: explicit DecoratedReader(const std::shared_ptr& reader) - : DecoratedReaderBase(), reader_(reader) { + : ReaderBase(), reader_(reader) { PADDLE_ENFORCE_NOT_NULL(reader_); } - void RegisterDecorateChain() final { + void RegisterDecorateChain() { reader_->InsertDecoratedReader(shared_from_this()); } @@ -90,41 +84,6 @@ class DecoratedReader : public DecoratedReaderBase, std::shared_ptr reader_; }; -class MultiDecoratedReader - : public DecoratedReaderBase, - public std::enable_shared_from_this { - public: - explicit MultiDecoratedReader( - const std::vector>& readers) - : readers_(readers) { - PADDLE_ENFORCE(!readers_.empty()); - for (auto& r : readers_) { - PADDLE_ENFORCE_NOT_NULL(r); - } - } - - void RegisterDecorateChain() final { - for (auto& r : readers_) { - r->InsertDecoratedReader(shared_from_this()); - } - } - - protected: - void ShutdownImpl() override { - for (auto& r : readers_) { - r->Shutdown(); - } - } - - void StartImpl() override { - for (auto& r : readers_) { - r->Start(); - } - } - - std::vector> readers_; -}; - // FileReader is just a conceptual class. class FileReader : public ReaderBase {}; @@ -173,10 +132,8 @@ class ReaderHolder { }; template -inline std::shared_ptr MakeDecoratedReader( - ARGS&&... args) { - std::shared_ptr reader( - new T(std::forward(args)...)); +inline std::shared_ptr MakeDecoratedReader(ARGS&&... args) { + std::shared_ptr reader(new T(std::forward(args)...)); reader->RegisterDecorateChain(); return reader; } diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt index 2701e10b30..5ee1206175 100644 --- a/paddle/fluid/operators/reader/CMakeLists.txt +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -18,7 +18,6 @@ function(reader_library TARGET_NAME) endfunction() cc_library(py_reader SRCS py_reader.cc DEPS reader) -cc_library(compose_reader SRCS compose_reader.cc DEPS reader) cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool) reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader) @@ -41,7 +40,7 @@ cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc) # Export local libraries to parent # set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE) -op_library(read_op DEPS py_reader compose_reader buffered_reader) +op_library(read_op DEPS py_reader buffered_reader) foreach(src ${LOCAL_READER_LIBS}) set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs") diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index b76f482c57..7962c0332d 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -114,11 +114,6 @@ class BlockingQueue { return queue_.size(); } - void Clear() { - std::lock_guard lock(mutex_); - queue_.clear(); - } - private: size_t capacity_; bool speed_test_mode_; diff --git a/paddle/fluid/operators/reader/compose_reader.cc b/paddle/fluid/operators/reader/compose_reader.cc deleted file mode 100644 index 4b88b9331c..0000000000 --- a/paddle/fluid/operators/reader/compose_reader.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reader/compose_reader.h" - -namespace paddle { -namespace operators { -namespace reader { - -ComposeReader::ComposeReader( - const std::vector> &readers) - : framework::MultiDecoratedReader(readers) {} - -void ComposeReader::ReadNext(std::vector *out) { - out->clear(); - std::vector each_ret; - for (auto &r : readers_) { - r->ReadNext(&each_ret); - out->reserve(out->size() + each_ret.size()); - for (auto &data : each_ret) { - out->emplace_back(std::move(data)); - } - } -} - -} // namespace reader -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/reader/compose_reader.h b/paddle/fluid/operators/reader/compose_reader.h deleted file mode 100644 index c9e2a2d72f..0000000000 --- a/paddle/fluid/operators/reader/compose_reader.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "paddle/fluid/framework/reader.h" - -namespace paddle { -namespace operators { -namespace reader { - -class ComposeReader : public framework::MultiDecoratedReader { - public: - explicit ComposeReader( - const std::vector> &readers); - - void ReadNext(std::vector *out) override; -}; - -} // namespace reader -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/reader/py_reader.cc b/paddle/fluid/operators/reader/py_reader.cc index dc84faa974..f2c28c1df8 100644 --- a/paddle/fluid/operators/reader/py_reader.cc +++ b/paddle/fluid/operators/reader/py_reader.cc @@ -36,43 +36,6 @@ void PyReader::Shutdown() { queue_->Close(); } void PyReader::Start() { queue_->ReOpen(); } -MultiQueuePyReader::MultiQueuePyReader( - const std::vector>& queues) - : queues_(queues) { - PADDLE_ENFORCE(!queues_.empty()); - for (auto& q : queues_) { - PADDLE_ENFORCE_NOT_NULL(q); - } -} - -void MultiQueuePyReader::ReadNext(std::vector* out) { - auto idx = read_out_idx_.fetch_add(1) % queues_.size(); - for (size_t i = 0; i < queues_.size(); ++i) { - *out = queues_[idx]->Pop(); - if (!out->empty()) return; - idx = (idx + 1) % queues_.size(); - } -} - -MultiQueuePyReader::~MultiQueuePyReader() { - for (auto& q : queues_) { - q->Close(); - } -} - -void MultiQueuePyReader::Shutdown() { - for (auto& q : queues_) { - q->Close(); - } - read_out_idx_.store(0, std::memory_order::memory_order_seq_cst); -} - -void MultiQueuePyReader::Start() { - for (auto& q : queues_) { - q->ReOpen(); - } -} - } // namespace reader } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reader/py_reader.h b/paddle/fluid/operators/reader/py_reader.h index 146a2351e5..7d760eca64 100644 --- a/paddle/fluid/operators/reader/py_reader.h +++ b/paddle/fluid/operators/reader/py_reader.h @@ -39,24 +39,6 @@ class PyReader : public framework::FileReader { std::shared_ptr queue_; }; -class MultiQueuePyReader : public framework::FileReader { - public: - explicit MultiQueuePyReader( - const std::vector>& queues); - - void ReadNext(std::vector* out) override; - - ~MultiQueuePyReader(); - - void Shutdown() override; - - void Start() override; - - private: - std::vector> queues_; - std::atomic read_out_idx_{0}; -}; - } // namespace reader } // namespace operators } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 1b53410d16..2acedca245 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -547,11 +547,6 @@ All parameter, weight, gradient are variables in Paddle. using LoDTensorBlockingQueueHolder = ::paddle::operators::reader::LoDTensorBlockingQueueHolder; - using LockFreeLoDTensorBlockingQueue = - ::paddle::operators::reader::LockFreeLoDTensorBlockingQueue; - using LockFreeLoDTensorBlockingQueueHolder = - ::paddle::operators::reader::LockFreeLoDTensorBlockingQueueHolder; - py::class_>( m, "LoDTensorBlockingQueue", "") .def("push", @@ -565,20 +560,6 @@ All parameter, weight, gradient are variables in Paddle. .def("close", &LoDTensorBlockingQueue::Close) .def("is_closed", &LoDTensorBlockingQueue::IsClosed); - py::class_>( - m, "LockFreeLoDTensorBlockingQueue", "") - .def("push", - [](LockFreeLoDTensorBlockingQueue &self, - std::vector &lod_tensor_vec) { - pybind11::gil_scoped_release release; - return self.Push(std::move(lod_tensor_vec)); - }) - .def("size", &LockFreeLoDTensorBlockingQueue::Size) - .def("capacity", &LockFreeLoDTensorBlockingQueue::Cap) - .def("close", &LockFreeLoDTensorBlockingQueue::Close) - .def("is_closed", &LockFreeLoDTensorBlockingQueue::IsClosed); - m.def("init_lod_tensor_blocking_queue", [](Variable &var, size_t capacity) -> std::shared_ptr { @@ -588,15 +569,6 @@ All parameter, weight, gradient are variables in Paddle. }, py::return_value_policy::copy); - m.def("init_lock_free_lod_tensor_blocking_queue", - [](Variable &var, - size_t capacity) -> std::shared_ptr { - auto *holder = var.GetMutable(); - holder->InitOnce(capacity); - return holder->GetQueue(); - }, - py::return_value_policy::copy); - py::class_(m, "_Scope", R"DOC( Scope is an association of a name to Variable. All variables belong to Scope. @@ -777,8 +749,6 @@ All parameter, weight, gradient are variables in Paddle. .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) - .def("gpu_device_id", - [](platform::CUDAPlace &self) { return self.device; }) .def("__str__", string::to_string); py::class_(m, "CPUPlace") diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc index 22f67b38bb..8af0490310 100644 --- a/paddle/fluid/pybind/reader_py.cc +++ b/paddle/fluid/pybind/reader_py.cc @@ -17,7 +17,6 @@ #include #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/operators/reader/buffered_reader.h" -#include "paddle/fluid/operators/reader/compose_reader.h" #include "paddle/fluid/operators/reader/py_reader.h" #include "paddle/fluid/platform/place.h" #include "pybind11/stl.h" @@ -82,7 +81,6 @@ class MultiDeviceFeedReader { void Reset() { Shutdown(); Start(); - ReadAsync(); } @@ -117,14 +115,14 @@ class MultiDeviceFeedReader { } } + std::shared_ptr queue_; std::vector names_; std::unique_ptr<::ThreadPool> pool_; - std::shared_ptr queue_; std::vector> readers_; + std::vector> futures_; std::vector> ret_; - bool drop_last_; }; namespace py = pybind11; @@ -150,7 +148,7 @@ void BindReader(py::module *module) { const std::vector &names, const std::vector &dst_places, bool use_double_buffer) { - return new MultiDeviceFeedReader(queues, names, dst_places, + return new MultiDeviceFeedReader(queue, names, dst_places, use_double_buffer); }, py::return_value_policy::take_ownership); diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 639be053b0..6b9e000358 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -486,8 +486,7 @@ def _py_reader(capacity, lod_levels=None, name=None, use_double_buffer=True, - feed_list=None, - lock_free=False): + feed_list=None): if feed_list is not None: if not isinstance(feed_list, list): @@ -527,11 +526,7 @@ def _py_reader(capacity, double_buffer_name = "_".join([name, "double_buffer"]) var = global_scope().var(queue_name) - if not lock_free: - feed_queue = core.init_lod_tensor_blocking_queue(var, capacity) - else: - feed_queue = core.init_lock_free_lod_tensor_blocking_queue(var, - capacity) + feed_queue = core.init_lod_tensor_blocking_queue(var, capacity) startup_blk = default_startup_program().current_block() startup_var = startup_blk.create_var(name=reader_name) @@ -644,8 +639,7 @@ def py_reader(capacity, dtypes, lod_levels=None, name=None, - use_double_buffer=True, - lock_free=False): + use_double_buffer=True): """ Create a Python reader for data feeding in Python @@ -770,8 +764,7 @@ def py_reader(capacity, dtypes=dtypes, lod_levels=lod_levels, name=name, - use_double_buffer=use_double_buffer, - lock_free=lock_free) + use_double_buffer=use_double_buffer) def create_py_reader_by_data(capacity, diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 7c95ea20e3..f29231589e 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -15,9 +15,11 @@ import core import six import threading -from .framework import Program, Variable, program_guard +from .framework import Program, Variable, program_guard, default_main_program, default_startup_program +from .executor import global_scope from .data_feeder import DataFeeder -import paddle.reader.decorator as decorator +from .layers.io import monkey_patch_reader_methods, _copy_reader_var_, double_buffer +import unique_name __all__ = ['PyReader'] @@ -37,30 +39,101 @@ def _convert_places(places): return ret -class PyReader(Reader): - def __init__(self, feed_list, places, capacity): +class PyReader(object): + unique_name_generator = unique_name.UniqueNameGenerator() + + def __init__(self, + feed_list, + capacity, + use_double_buffer=True, + iterable=True): self._tensor_reader = None self._thread = None - - # TODO(zjl): to support drop_last = False - self._drop_last = True - + self._iterable = iterable + self._use_double_buffer = use_double_buffer + self._capacity = capacity self._feed_list = feed_list - self._var_names = [v.name for v in feed_list] - - self._queues = [] + self._scope = global_scope() + if not self._iterable: + self._init_non_iterable() + def _init_iterable(self, places): + self._var_names = [v.name for v in self._feed_list] self._places = _convert_places(places) - - self._queue_capacity = capacity - - self.queue = core.init_lod_tensor_blocking_queue(core.Variable(), - self._queue_capacity) - - self._reader = core.create_py_reader(self._queue, self._var_names, - self._places, self._drop_last) + self._queue = core.init_lod_tensor_blocking_queue(core.Variable(), + self._capacity) + self._reader = core.create_py_reader( + self.queue, self._var_names, self._places, self._use_double_buffer) + + def _init_non_iterable(self): + lod_levels = [] + dtypes = [] + shape_concat = [] + ranks = [] + shapes = [] + + for feed_data in self._feed_list: + dtypes.append(feed_data.dtype) + shape_concat.extend(feed_data.shape) + ranks.append(len(feed_data.shape)) + shapes.append(feed_data.shape) + lod_levels.append(feed_data.lod_level) + + queue_name = PyReader.unique_name_generator('lod_tensor_blocking_queue') + reader_name = PyReader.unique_name_generator('create_py_reader') + double_buffer_name = PyReader.unique_name_generator('double_buffer') + + var = self._scope.var(queue_name) + self._queue = core.init_lod_tensor_blocking_queue(var, self._capacity) + + startup_blk = default_startup_program().current_block() + startup_var = startup_blk.create_var(name=reader_name) + + startup_blk.append_op( + type='create_py_reader', + inputs={'blocking_queue': [queue_name]}, + outputs={'Out': [startup_var]}, + attrs={ + 'shape_concat': shape_concat, + 'lod_levels': lod_levels, + 'ranks': ranks + }) + + startup_var.desc.set_dtypes(dtypes) + startup_var.persistable = True + + main_prog_var = _copy_reader_var_( + default_main_program().current_block(), startup_var) + + main_prog_var.stop_gradient = True + main_prog_var.persistable = True + + reader = monkey_patch_reader_methods(main_prog_var) + if self._use_double_buffer: + double_buffer_reader = double_buffer( + reader, name=double_buffer_name) + # we return a double buffer reader. However, the reset method comes from + # py_reader. + double_buffer_reader.reset = reader.reset + reader = double_buffer_reader + + self._reader = reader + + default_main_program().current_block().append_op( + type='read', + inputs={'Reader': [self._reader]}, + outputs={'Out': self._feed_list}) + + @property + def queue(self): + return self._queue + + @property + def iterable(self): + return self._iterable def __call__(self): + assert self.iterable, "PyReader is not iterable" assert self._tensor_reader is not None, \ "Data source of PyReader has not set yet" @@ -80,13 +153,22 @@ class PyReader(Reader): self._reset() raise StopIteration + self._start() return Iterator(self) def _reset(self): - if self._thread: - self._reader.reset() - self._thread.join() + self._reader.reset() + self._thread.join() + + def start(self): + assert not self._iterable, "start() cannot be called when PyReader is iterable" + self._start() + def reset(self): + assert not self._iterable, "reset() cannot be called when PyReader is iterable" + self._reset() + + def _start(self): def __thread_main__(): for tensors in self._tensor_reader(): array = core.LoDTensorArray() @@ -98,16 +180,16 @@ class PyReader(Reader): array.append(item) - if not self.queue.push(array): + if not self._queue.push(array): break - self.queue.close() + self._queue.close() self._thread = threading.Thread(target=__thread_main__) self._thread.daemon = True self._thread.start() - def set_numpy_reader(self, reader): + def decorate_paddle_reader(self, reader, places=None): assert self._tensor_reader is None, \ "Cannot reset the data source of PyReader" with program_guard(Program(), Program()): @@ -119,10 +201,12 @@ class PyReader(Reader): for slots in paddle_reader(): yield [slots[var.name] for var in self._feed_list] - self.set_tensor_reader(__tensor_reader_impl__) + self.decorate_tensor_provider(__tensor_reader_impl__, places) - def set_tensor_reader(self, reader): + def decorate_tensor_provider(self, reader, places=None): assert self._tensor_reader is None, \ "Cannot reset the data source of PyReader" self._tensor_reader = reader - self._reset() + if self._iterable: + assert places is not None, "Places cannot be None when py_reader is iterable" + self._init_iterable(places) diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py index dd64f10395..96a11edd49 100644 --- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py +++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py @@ -31,35 +31,22 @@ def random_reader(): yield image, label -def simple_fc_net(places, use_legacy_py_reader, lock_free=False): +def simple_fc_net(places, use_legacy_py_reader, use_double_buffer): startup_prog = fluid.Program() main_prog = fluid.Program() startup_prog.random_seed = 1 main_prog.random_seed = 1 - reader = paddle.batch(random_reader, batch_size=BATCH_SIZE) with fluid.unique_name.guard(): with fluid.program_guard(main_prog, startup_prog): - if not use_legacy_py_reader: - image = fluid.layers.data( - name='image', shape=[784], dtype='float32') - label = fluid.layers.data( - name='label', shape=[1], dtype='int64') - py_reader = fluid.io.PyReader( - feed_list=[image, label], - places=places, - capacity=4, - multi_queue=False) - py_reader.set_numpy_reader(reader) - else: - py_reader = fluid.layers.py_reader( - capacity=4, - shapes=[(-1, 784), (-1, 1)], - dtypes=['float32', 'int64'], - lock_free=lock_free) - image, label = fluid.layers.read_file(py_reader) - py_reader.decorate_paddle_reader(reader) - + image = fluid.layers.data( + name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + py_reader = fluid.io.PyReader( + feed_list=[image, label], + capacity=4, + iterable=not use_legacy_py_reader, + use_double_buffer=use_double_buffer) hidden = image for hidden_size in [10, 20, 30]: hidden = fluid.layers.fc( @@ -82,11 +69,19 @@ def simple_fc_net(places, use_legacy_py_reader, lock_free=False): class TestBase(unittest.TestCase): - def run_main(self, use_legacy_py_reader, with_data_parallel, places): + def run_main(self, use_legacy_py_reader, with_data_parallel, places, + use_double_buffer): scope = fluid.Scope() with fluid.scope_guard(scope): startup_prog, main_prog, py_reader, loss = simple_fc_net( - places, use_legacy_py_reader) + places, use_legacy_py_reader, use_double_buffer) + + reader = paddle.batch(random_reader, batch_size=BATCH_SIZE) + + ps = places if use_double_buffer else fluid.cpu_places(len(places)) + py_reader.decorate_paddle_reader( + reader, places=ps if py_reader.iterable else None) + exe = fluid.Executor(place=places[0]) exe.run(startup_prog) @@ -98,7 +93,7 @@ class TestBase(unittest.TestCase): step = 0 step_list = [] start_t = time.time() - if use_legacy_py_reader: + if not py_reader.iterable: for _ in six.moves.range(EPOCH_NUM): step = 0 py_reader.start() @@ -107,12 +102,9 @@ class TestBase(unittest.TestCase): L, = exe.run(program=prog, fetch_list=[loss], use_program_cache=True) - # print('runned', step, py_reader.queue.is_closed(), py_reader.queue.size()) step += 1 except fluid.core.EOFException: - # print('try to reset') py_reader.reset() - # print('reseted') break step_list.append(step) else: @@ -125,8 +117,8 @@ class TestBase(unittest.TestCase): label = item['label'] assert image.shape() == [BATCH_SIZE, 784] assert label.shape() == [BATCH_SIZE, 1] - assert image._place()._equals(places[i]) - assert label._place()._equals(places[i]) + assert image._place()._equals(ps[i]) + assert label._place()._equals(ps[i]) L, = exe.run(program=prog, feed=d, fetch_list=[loss], @@ -138,7 +130,7 @@ class TestBase(unittest.TestCase): scope._remove_from_pool() return ret - def prepare_places(self, with_data_parallel, with_cpu=False, with_gpu=True): + def prepare_places(self, with_data_parallel, with_cpu=True, with_gpu=True): places = [] if with_cpu: places.append([fluid.CPUPlace()]) @@ -156,21 +148,13 @@ class TestBase(unittest.TestCase): def test_main(self): for with_data_parallel in [True, False]: for p in self.prepare_places(with_data_parallel): - t = [] - for use_legacy_py_reader in [ - False - ]: #[True, False]: #[False, True]: - print(p, use_legacy_py_reader) - ret = self.run_main( - use_legacy_py_reader=use_legacy_py_reader, - with_data_parallel=with_data_parallel, - places=p) - ret['legacy'] = use_legacy_py_reader - ret['data_parallel'] = with_data_parallel - ret['places'] = p - t.append([ret['step'], ]) #, ret['places']]) - - print(t) + for use_double_buffer in [False, True]: + for use_legacy_py_reader in [False, True]: + ret = self.run_main( + use_legacy_py_reader=use_legacy_py_reader, + with_data_parallel=with_data_parallel, + places=p, + use_double_buffer=use_double_buffer) if __name__ == '__main__': From 2e67f8ae88db689494dff7ba71bfb7a7322d1447 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 26 Feb 2019 02:39:14 +0000 Subject: [PATCH 006/230] add doc test=develop --- python/paddle/fluid/framework.py | 49 +++++++++++++++ python/paddle/fluid/layers/io.py | 3 +- python/paddle/fluid/reader.py | 102 +++++++++++++++++++++++++++++++ 3 files changed, 152 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 98200e2efb..00b6e7afa0 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -96,6 +96,27 @@ def _cpu_num(): def cuda_places(device_ids=None): + ''' + Create a list of :code:`fluid.CUDAPlace` objects. + + If :code:`device_ids` is None, environment variable of + :code:`FLAGS_selected_gpus` would be checked first. If + :code:`FLAGS_selected_gpus=0,1,2`, the returned list would + be [fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)]. + If :code:`FLAGS_selected_gpus` is not set, all visible + gpu places would be returned. + + If :code:`device_ids` is not None, it should be the device + ids of gpus. For example, if :code:`device_ids=[0,1,2]`, + the returned list would be + [fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)]. + + Args: + device_ids (None|list(int)|tuple(int)): gpu device id list. + + Returns: + out (list(fluid.CUDAPlace)): gpu place list. + ''' assert core.is_compiled_with_cuda(), \ "Not compiled with CUDA" if device_ids is None: @@ -110,12 +131,40 @@ def cuda_places(device_ids=None): def cpu_places(device_count=None): + ''' + Create a list of :code:`fluid.CPUPlace` objects. + + If :code:`device_count` is None, the device count would + be determined by environment variable :code:`CPU_NUM`. + If :code:`CPU_NUM` is not set, the device count would + be determined by :code:`multiprocessing.cpu_count()`. + + Args: + device_count (None|int): device number. + + Returns: + out (list(fluid.CPUPlace)): cpu place list. + ''' if device_count is None: device_count = _cpu_num() return [core.CPUPlace()] * device_count def cuda_pinned_places(device_count=None): + ''' + Create a list of :code:`fluid.CUDAPinnedPlace` objects. + + If :code:`device_count` is None, the device count would + be determined by environment variable :code:`CPU_NUM`. + If :code:`CPU_NUM` is not set, the device count would + be determined by :code:`multiprocessing.cpu_count()`. + + Args: + device_count (None|int): device number. + + Returns: + out (list(fluid.CUDAPinnedPlace)): cuda pinned place list. + ''' assert core.is_compiled_with_cuda(), \ "Not compiled with CUDA" if device_count is None: diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 6b9e000358..a9b391fd53 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -531,8 +531,7 @@ def _py_reader(capacity, startup_blk = default_startup_program().current_block() startup_var = startup_blk.create_var(name=reader_name) startup_blk.append_op( - type='create_py_reader' - if not lock_free else 'create_lock_free_py_reader', + type='create_py_reader', inputs={'blocking_queue': [queue_name]}, outputs={'Out': [startup_var]}, attrs={ diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index f29231589e..a0ce914822 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -47,6 +47,76 @@ class PyReader(object): capacity, use_double_buffer=True, iterable=True): + """ + Create a reader object for data feeding in Python. + Data would be prefetched using Python thread and be pushed + into a queue asynchronously. Data in the queue would be extracted + automatically when `Executor.run(...)` is called. + + Args: + feed_list (list(Variable)|tuple(Variable)): feed variable list. + The variables should be created by :code:`fluid.layers.data()`. + capacity (int): capacity of the queue maintained in PyReader object. + use_double_buffer (bool): whether to use double_buffer_reader to + speed up data feeding. + iterable (bool): whether the created reader object is iterable. + + Returns: + reader (Reader): the created reader object. + + Examples: + 1. If iterable = False, the created PyReader object is almost the + same as :code:`fluid.layers.py_reader()`. Operators would be + inserted into the program. User should call :code:`start()` + before each epoch and catch :code:`fluid.core.EOFException` + thrown by :code:`Executor.run()` when epoch ends. Once the + exception is caught, user should call :code:`reset()` to reset + the reader manually. + + .. code-block:: python + + image = fluid.layers.data( + name='image', shape=[784], dtype='float32') + label = fluid.layers.data( + name='label', shape=[1], dtype='int64') + + reader = fluid.io.PyReader(feed_list=[image, label], + capacity=4, iterable=False) + reader.decorate_paddle_reader(user_defined_reader) + ... # definition of network is omitted + executor.run(fluid.default_main_program()) + for _ in range(EPOCH_NUM): + reader.start() + while True: + try: + executor.run(feed=None, ...) + except fluid.core.EOFException: + reader.reset() + break + + 2. If iterable=True, the created PyReader object is decoupled with + the program. No operator would be inserted into the program. + In this case, the created reader is a Python generator, which + is iterable. User should feed the data yielded from PyReader + object into :code:`Executor.run(feed=...)`. + + .. code-block:: python + + image = fluid.layers.data( + name='image', shape=[784], dtype='float32') + label = fluid.layers.data( + name='label', shape=[1], dtype='int64') + + reader = fluid.io.PyReader(feed_list=[image, label], + capacity=4, iterable=True) + reader.decorate_paddle_reader(user_defined_reader, + places=fluid.cuda_places()) + ... # definition of network is omitted + executor.run(fluid.default_main_program()) + for _ in range(EPOCH_NUM): + for data in reader(): + executor.run(feed=data, ...) + """ self._tensor_reader = None self._thread = None self._iterable = iterable @@ -161,10 +231,18 @@ class PyReader(object): self._thread.join() def start(self): + ''' + Start the data feeding thread. + Can only call when the reader object is not iterable. + ''' assert not self._iterable, "start() cannot be called when PyReader is iterable" self._start() def reset(self): + ''' + Reset the reader object when :code:`fluid.core.EOFException` raises. + Can only call when the reader object is not iterable. + ''' assert not self._iterable, "reset() cannot be called when PyReader is iterable" self._reset() @@ -190,6 +268,18 @@ class PyReader(object): self._thread.start() def decorate_paddle_reader(self, reader, places=None): + ''' + Set the data source of the PyReader object. + + The provided :code:`reader` should be a Python generator, + which yields numpy-typed batched data. + + :code:`places` must be set when the PyReader object is iterable. + + Args: + reader (generator): Python generator that yields numpy-typed + batched data. + ''' assert self._tensor_reader is None, \ "Cannot reset the data source of PyReader" with program_guard(Program(), Program()): @@ -204,6 +294,18 @@ class PyReader(object): self.decorate_tensor_provider(__tensor_reader_impl__, places) def decorate_tensor_provider(self, reader, places=None): + ''' + Set the data source of the PyReader object. + + The provided :code:`reader` should be a Python generator, + which yields LoDTensor-typed batched data. + + :code:`places` must be set when the PyReader object is iterable. + + Args: + reader (generator): Python generator that yields LoDTensor-typed + batched data. + ''' assert self._tensor_reader is None, \ "Cannot reset the data source of PyReader" self._tensor_reader = reader From 7b5a9d75d94c3b79d772a31a8185628355901c0d Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 27 Feb 2019 03:54:45 +0000 Subject: [PATCH 007/230] add cache reader test=develop --- paddle/fluid/API.spec | 1 + python/paddle/fluid/reader.py | 6 ++++-- python/paddle/reader/decorator.py | 27 ++++++++++++++++++++++++++- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index dad729bd15..db3739e65b 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -511,6 +511,7 @@ paddle.fluid.unique_name.guard ArgSpec(args=['new_generator'], varargs=None, key paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope +paddle.reader.cache ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None) paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None) paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None) diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index af340c03a4..7d08403d26 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -123,7 +123,6 @@ class PyReader(object): self._use_double_buffer = use_double_buffer self._capacity = capacity self._feed_list = feed_list - self._scope = global_scope() if not self._iterable: self._init_non_iterable() @@ -153,7 +152,7 @@ class PyReader(object): reader_name = PyReader.unique_name_generator('create_py_reader') double_buffer_name = PyReader.unique_name_generator('double_buffer') - var = self._scope.var(queue_name) + var = global_scope().var(queue_name) self._queue = core.init_lod_tensor_blocking_queue(var, self._capacity) startup_blk = default_startup_program().current_block() @@ -215,6 +214,9 @@ class PyReader(object): def __iter__(self): return self + def __next__(self): + return self.next() + def next(self): ret = self._reader.read_next() if ret: diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index b2ef9f7580..7443a6bb19 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -13,7 +13,7 @@ # limitations under the License. __all__ = [ - 'map_readers', 'buffered', 'compose', 'chain', 'shuffle', + 'cache', 'map_readers', 'buffered', 'compose', 'chain', 'shuffle', 'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader', 'multiprocess_reader', 'Fake' ] @@ -33,6 +33,31 @@ import zlib import paddle.compat as cpt +def cache(reader): + """ + Cache the reader data into memory. + + Be careful that this method may take long time to process, + and consume lots of memory. :code:`reader()` would only + call once. + + Args: + reader (generator): a reader object which yields + data each time. + + Returns: + reader (generator): a decorated reader object + which yields data from cached memory. + """ + all_data = tuple(reader()) + + def __impl__(): + for item in all_data: + yield item + + return __impl__ + + def map_readers(func, *readers): """ Creates a data reader that outputs return value of function using From e5fa9c58cff2ea452f9be0169e013b62b8085f30 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Thu, 28 Feb 2019 17:52:23 +0800 Subject: [PATCH 008/230] add travis_ci check api.spec --- .travis.yml | 1 + paddle/scripts/paddle_build.sh | 72 ++++++++++++++++++++++------------ 2 files changed, 49 insertions(+), 24 deletions(-) diff --git a/.travis.yml b/.travis.yml index 87de895dda..3766317166 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,6 +12,7 @@ os: - linux env: - JOB=check_style + - JOB=check_api addons: ssh_known_hosts: 13.229.163.131 before_install: diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 1135caf4f8..c322f3970f 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -87,7 +87,7 @@ function cmake_gen() { PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/python3 -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/ -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib" - WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + pip3.5 uninstall -y protobuf pip3.5 install --user -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 @@ -100,7 +100,7 @@ function cmake_gen() { PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3 -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/ -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib" - WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + pip3.6 uninstall -y protobuf pip3.6 install --user -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 @@ -113,7 +113,7 @@ function cmake_gen() { PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3 -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/ -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib" - WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + pip3.7 uninstall -y protobuf pip3.7 install --user -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 @@ -128,31 +128,44 @@ function cmake_gen() { PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7 -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so" + pip uninstall -y protobuf + pip install -r ${PADDLE_ROOT}/python/requirements.txt elif [ "$1" == "cp27-cp27mu" ]; then export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:} export PATH=/opt/python/cp27-cp27mu/bin/:${PATH} PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7 -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so" + pip uninstall -y protobuf + pip install -r ${PADDLE_ROOT}/python/requirements.txt elif [ "$1" == "cp35-cp35m" ]; then export LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} export PATH=/opt/_internal/cpython-3.5.1/bin/:${PATH} export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3 -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so" + pip3.5 uninstall -y protobuf + pip3.5 install -r ${PADDLE_ROOT}/python/requirements.txt elif [ "$1" == "cp36-cp36m" ]; then export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} export PATH=/opt/_internal/cpython-3.6.0/bin/:${PATH} export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.6.0/bin/python3 -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.6.0/include/python3.6m -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.6.0/lib/libpython3.so" + pip3.6 uninstall -y protobuf + pip3.6 install -r ${PADDLE_ROOT}/python/requirements.txt elif [ "$1" == "cp37-cp37m" ]; then export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH} export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3.7 -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so" + pip3.7 uninstall -y protobuf + pip3.7 install -r ${PADDLE_ROOT}/python/requirements.txt fi + else + pip uninstall -y protobuf + pip install -r ${PADDLE_ROOT}/python/requirements.txt fi fi @@ -186,7 +199,6 @@ function cmake_gen() { -DWITH_TESTING=${WITH_TESTING:-ON} -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON - -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWITH_CONTRIB=${WITH_CONTRIB:-ON} -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} @@ -219,7 +231,6 @@ EOF -DCUDNN_ROOT=/usr/ \ -DWITH_TESTING=${WITH_TESTING:-ON} \ -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ - -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \ @@ -248,6 +259,7 @@ function check_style() { eval "$(GIMME_GO_VERSION=1.8.3 gimme)" fi + pip install cpplint # set up go environment for running gometalinter mkdir -p $GOPATH/src/github.com/PaddlePaddle/ ln -sf ${PADDLE_ROOT} $GOPATH/src/github.com/PaddlePaddle/Paddle @@ -382,9 +394,7 @@ EOF pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl fi - if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then - paddle version - fi + paddle version if [ "$1" == "cp27-cp27m" ]; then pip uninstall -y paddlepaddle @@ -422,8 +432,8 @@ function assert_api_spec_approvals() { BRANCH="develop" fi - API_FILES=("cmake/external" - "paddle/fluid/API.spec" + API_FILES=("paddle/fluid/API.spec" + "python/paddle/fluid/parallel_executor.py" "paddle/fluid/framework/operator.h" "paddle/fluid/framework/tensor.h" "paddle/fluid/framework/lod_tensor.h" @@ -435,6 +445,7 @@ function assert_api_spec_approvals() { "paddle/fluid/framework/ir/node.h" "paddle/fluid/framework/ir/graph.h" "paddle/fluid/framework/framework.proto" + "python/paddle/fluid/compiler.py" "paddle/fluid/operators/distributed/send_recv.proto.in") for API_FILE in ${API_FILES[*]}; do API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" || true` @@ -539,7 +550,6 @@ EOF -DCMAKE_BUILD_TYPE=Release \ -DWITH_GPU=OFF \ -DWITH_MKL=OFF \ - -DWITH_FLUID_ONLY=ON local LIB_TYPE=$1 case $LIB_TYPE in @@ -615,13 +625,8 @@ EOF NCCL_DEPS="true" fi - if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then - PADDLE_VERSION="paddle version" - CMD='"paddle", "version"' - else - PADDLE_VERSION="true" - CMD='"true"' - fi + PADDLE_VERSION="paddle version" + CMD='"paddle", "version"' if [ "$1" == "cp35-cp35m" ]; then cat >> ${PADDLE_ROOT}/build/Dockerfile <> ${PADDLE_ROOT}/build/Dockerfile <> ${PADDLE_ROOT}/build/Dockerfile < Date: Thu, 28 Feb 2019 18:05:17 +0800 Subject: [PATCH 009/230] update paddle_build --- paddle/scripts/paddle_build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index c322f3970f..74d7250471 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -862,6 +862,7 @@ function main() { ;; test_fluid_lib) test_fluid_lib + ;; travis_check_api) traivs_check_api traivs_check_api_py35 From ffbd83947c23cda36a5371d466f3f8a4ea87ac78 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Thu, 28 Feb 2019 19:03:23 +0800 Subject: [PATCH 010/230] update build --- paddle/scripts/paddle_build.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 74d7250471..260a9e4061 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -756,7 +756,9 @@ EOF ./clean.sh } -function traivs_check_api() { +function travis_check_api() { + mkdir -p ${PADDLE_ROOT}/build + cd ${PADDLE_ROOT}/build cmake .. \ -DCMAKE_BUILD_TYPE=Release \ -DWITH_GPU=OFF \ @@ -767,7 +769,9 @@ function traivs_check_api() { pip uninstall paddlepaddle } -function traivs_check_api_py35() { +function travis_check_api_py35() { + mkdir -p ${PADDLE_ROOT}/build + cd ${PADDLE_ROOT}/build cmake .. \ -DPY_VERSION=3.5 \ -DCMAKE_BUILD_TYPE=Release \ @@ -863,7 +867,7 @@ function main() { test_fluid_lib) test_fluid_lib ;; - travis_check_api) + check_api) traivs_check_api traivs_check_api_py35 ;; From 10439e8afc28869d470e1326bf2598f086d30c82 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Thu, 28 Feb 2019 21:55:38 +0800 Subject: [PATCH 011/230] update paddle_build check_api --- paddle/scripts/paddle_build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 260a9e4061..2512c810f4 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -868,8 +868,8 @@ function main() { test_fluid_lib ;; check_api) - traivs_check_api - traivs_check_api_py35 + travis_check_api + travis_check_api_py35 ;; *) print_usage From 3334c279d098c884b915a56dedc7dd69c95d7c7e Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 27 Feb 2019 10:40:26 +0000 Subject: [PATCH 012/230] add sample_generator test=develop --- paddle/fluid/API.spec | 3 +- .../fluid/operators/reader/blocking_queue.h | 1 + .../fluid/operators/reader/buffered_reader.cc | 1 + .../fluid/operators/reader/buffered_reader.h | 1 + paddle/fluid/operators/reader/py_reader.cc | 1 + paddle/fluid/operators/reader/py_reader.h | 1 + paddle/fluid/pybind/reader_py.cc | 3 + python/paddle/fluid/data_feeder.py | 82 ++++++++--- python/paddle/fluid/reader.py | 60 +++++++- .../unittests/test_decoupled_py_reader.py | 1 - .../test_py_reader_sample_generator.py | 137 ++++++++++++++++++ 11 files changed, 264 insertions(+), 27 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_py_reader_sample_generator.py diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index db3739e65b..746b1eecfe 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -61,8 +61,9 @@ paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)) paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.io.PyReader.__init__ ArgSpec(args=['self', 'feed_list', 'capacity', 'use_double_buffer', 'iterable'], varargs=None, keywords=None, defaults=(True, True)) +paddle.fluid.io.PyReader.__init__ ArgSpec(args=['self', 'feed_list', 'capacity', 'use_double_buffer', 'iterable'], varargs=None, keywords=None, defaults=(True, False)) paddle.fluid.io.PyReader.decorate_paddle_reader ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.io.PyReader.decorate_sample_generator ArgSpec(args=['self', 'sample_generator', 'batch_size', 'drop_last', 'places'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.io.PyReader.decorate_tensor_provider ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.io.PyReader.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.io.PyReader.start ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 7962c0332d..78d238aa61 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -16,6 +16,7 @@ #include // NOLINT #include +#include #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index b8c98ff5e7..c9962b4ac2 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/reader/buffered_reader.h" +#include #include #include "paddle/fluid/framework/data_type.h" diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 6b21de0949..5f8b2d47c2 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include "ThreadPool.h" diff --git a/paddle/fluid/operators/reader/py_reader.cc b/paddle/fluid/operators/reader/py_reader.cc index f2c28c1df8..155ae859de 100644 --- a/paddle/fluid/operators/reader/py_reader.cc +++ b/paddle/fluid/operators/reader/py_reader.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/reader/py_reader.h" +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reader/py_reader.h b/paddle/fluid/operators/reader/py_reader.h index 7d760eca64..4307907514 100644 --- a/paddle/fluid/operators/reader/py_reader.h +++ b/paddle/fluid/operators/reader/py_reader.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc index 8af0490310..af7d30552e 100644 --- a/paddle/fluid/pybind/reader_py.cc +++ b/paddle/fluid/pybind/reader_py.cc @@ -13,7 +13,10 @@ // limitations under the License. #include "paddle/fluid/pybind/reader_py.h" +#include #include +#include +#include #include #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/operators/reader/buffered_reader.h" diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index a24e1d1300..83d7cef19c 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -26,6 +26,24 @@ from .framework import Variable, default_main_program __all__ = ['DataFeeder'] +def convert_dtype(dtype): + if dtype == core.VarDesc.VarType.FP32: + return 'float32' + elif dtype == core.VarDesc.VarType.INT64: + return 'int64' + elif dtype == core.VarDesc.VarType.FP64: + return 'float64' + elif dtype == core.VarDesc.VarType.FP16: + return 'float16' + elif dtype == core.VarDesc.VarType.INT32: + return 'int32' + elif dtype == core.VarDesc.VarType.UINT8: + return 'uint8' + else: + raise ValueError("dtype must be any of [int32, float32, int64, " + "float64, uint8]") + + class DataToLoDTensorConverter(object): def __init__(self, place, lod_level, shape, dtype): self.place = place @@ -38,27 +56,12 @@ class DataToLoDTensorConverter(object): if negtive_count > 1: self.shape = None break - if dtype == core.VarDesc.VarType.FP32: - self.dtype = 'float32' - elif dtype == core.VarDesc.VarType.INT64: - self.dtype = 'int64' - elif dtype == core.VarDesc.VarType.FP64: - self.dtype = 'float64' - elif dtype == core.VarDesc.VarType.FP16: - self.dtype = 'float16' - elif dtype == core.VarDesc.VarType.INT32: - self.dtype = 'int32' - elif dtype == core.VarDesc.VarType.UINT8: - self.dtype = 'uint8' - else: - raise ValueError("dtype must be any of [int32, float32, int64, " - "float64, uint8]") + self.dtype = convert_dtype(dtype) + self._reset() + def _reset(self): self.data = [] - self.lod = [] - - for i in six.moves.range(lod_level): - self.lod.append([]) + self.lod = [[] for _ in six.moves.range(self.lod_level)] def feed(self, data): self._feed_impl_(data, self.lod, self.lod_level) @@ -88,15 +91,52 @@ class DataToLoDTensorConverter(object): raise ValueError( "Reshape error. What is defined in data layer is {}, but receive {}" .format(self.shape, arr.shape)) - #else: - # self._check_shape(arr.shape) t = core.LoDTensor() t.set(arr, self.place) if self.lod_level > 0: t.set_recursive_sequence_lengths(self.lod) + self._reset() return t +class BatchedTensorProvider(object): + def __init__(self, feed_list, place, batch_size, generator, drop_last): + self.place = place + self.batch_size = batch_size + self.generator = generator + self.converters = [] + self.drop_last = drop_last + + for var in feed_list: + assert var.lod_level == 0, "lod_level must be 0" + self.converters.append( + DataToLoDTensorConverter( + place=self.place, + lod_level=0, + shape=var.shape, + dtype=var.dtype)) + + def _done(self): + return [c.done() for c in self.converters] + + def __call__(self): + idx = 0 + for each_sample in self.generator(): + for each_slot, each_converter in six.moves.zip(each_sample, + self.converters): + each_converter.data.append(each_slot) + + idx += 1 + if idx == self.batch_size: + idx = 0 + yield self._done() + + if not self.drop_last and idx > 0: + yield self._done() + else: + [c._reset() for c in self.converters] + + class DataFeeder(object): """ DataFeeder converts the data that returned by a reader into a data diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 7d08403d26..49ea1b83b5 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -17,7 +17,7 @@ import six import threading from .framework import Program, Variable, program_guard, default_main_program, default_startup_program from .executor import global_scope -from .data_feeder import DataFeeder +from .data_feeder import DataFeeder, BatchedTensorProvider from .layers.io import monkey_patch_reader_methods, _copy_reader_var_, double_buffer from .unique_name import UniqueNameGenerator @@ -46,7 +46,7 @@ class PyReader(object): feed_list, capacity, use_double_buffer=True, - iterable=True): + iterable=False): """ Create a reader object for data feeding in Python. Data would be prefetched using Python thread and be pushed @@ -269,6 +269,54 @@ class PyReader(object): self._thread.daemon = True self._thread.start() + def decorate_sample_generator(self, + sample_generator, + batch_size, + drop_last=True, + places=None): + ''' + Set the data source of the PyReader object. + + The provided :code:`sample_generator` should be a Python generator, + which yields numpy.ndarray typed data of each sample. + + :code:`places` must be set when the PyReader object is iterable. + + If all inputs have no lods, this method is faster than + :code:`decorate_paddle_reader(paddle.batch(sample_generator, ...))` . + + Args: + sample_generator (generator): Python generator that yields + numpy.ndarray-typed sample data. + batch_size (int): batch size. Must be larger than 0. + drop_last (bool): Whether to drop the last batch when sample number + is less than batch_size. + places (None|list(CUDAPlace)|list(CPUPlace)): place list. Must + be provided when PyReader is iterable. + ''' + assert batch_size > 0, "batch_size must be larger than 0" + has_lod = False + for f in self._feed_list: + if f.lod_level != 0: + has_lod = True + break + + if has_lod: + self.decorate_paddle_reader( + paddle.batch( + sample_generator, + batch_size=batch_size, + drop_last=drop_last), + places=places) + else: + reader = BatchedTensorProvider( + feed_list=self._feed_list, + place=core.CPUPlace(), + batch_size=batch_size, + generator=sample_generator, + drop_last=drop_last) + self.decorate_tensor_provider(reader, places=places) + def decorate_paddle_reader(self, reader, places=None): ''' Set the data source of the PyReader object. @@ -279,8 +327,10 @@ class PyReader(object): :code:`places` must be set when the PyReader object is iterable. Args: - reader (generator): Python generator that yields numpy-typed - batched data. + reader (generator): Python generator that yields + list(numpy.ndarray)-typed batched data. + places (None|list(CUDAPlace)|list(CPUPlace)): place list. Must + be provided when PyReader is iterable. ''' assert self._tensor_reader is None, \ "Cannot reset the data source of PyReader" @@ -307,6 +357,8 @@ class PyReader(object): Args: reader (generator): Python generator that yields LoDTensor-typed batched data. + places (None|list(CUDAPlace)|list(CPUPlace)): place list. Must + be provided when PyReader is iterable. ''' assert self._tensor_reader is None, \ "Cannot reset the data source of PyReader" diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py index 96a11edd49..7112a57743 100644 --- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py +++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py @@ -127,7 +127,6 @@ class TestBase(unittest.TestCase): step_list.append(step) end_t = time.time() ret = {"time": end_t - start_t, "step": step_list} - scope._remove_from_pool() return ret def prepare_places(self, with_data_parallel, with_cpu=True, with_gpu=True): diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_sample_generator.py b/python/paddle/fluid/tests/unittests/test_py_reader_sample_generator.py new file mode 100644 index 0000000000..2f8f0b1b6e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_py_reader_sample_generator.py @@ -0,0 +1,137 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.fluid as fluid +import math +import unittest +import numpy as np +import os + +os.environ['CPU_NUM'] = '1' + + +def random_reader(sample_num): + def __impl__(): + for _ in range(sample_num): + yield np.random.random( + size=[784]).astype('float32'), np.random.random_integers( + low=0, high=9, size=[1]).astype('int64') + + return paddle.reader.cache(__impl__) + + +class TestCaseBase(unittest.TestCase): + def setUp(self): + self.batch_size = 32 + self.epoch_num = 2 + self.sample_num = 165 + + def generate_all_data(self, reader): + ret = [] + for d in reader(): + slots = [[], []] + for item in d: + slots[0].append(item[0]) + slots[1].append(item[1]) + slots = [np.array(slot) for slot in slots] + ret.append(slots) + return ret + + def run_main(self, reader, use_sample_generator, iterable, drop_last): + image = fluid.layers.data(name='image', dtype='float32', shape=[784]) + label = fluid.layers.data(name='label', dtype='int64', shape=[1]) + py_reader = fluid.io.PyReader( + feed_list=[image, label], + capacity=16, + iterable=iterable, + use_double_buffer=False) + + batch_reader = paddle.batch(reader, self.batch_size, drop_last) + all_datas = self.generate_all_data(batch_reader) + + if not use_sample_generator: + py_reader.decorate_paddle_reader( + batch_reader, places=fluid.cpu_places()) + else: + py_reader.decorate_sample_generator( + reader, self.batch_size, drop_last, places=fluid.cpu_places()) + + if drop_last: + batch_num = int(self.sample_num / self.batch_size) + else: + batch_num = math.ceil(float(self.sample_num) / self.batch_size) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + for _ in range(self.epoch_num): + if py_reader.iterable: + step = 0 + for data in py_reader(): + img, lbl = exe.run(feed=data, fetch_list=[image, label]) + self.assertArrayEqual(img, all_datas[step][0]) + self.assertArrayEqual(lbl, all_datas[step][1]) + step += 1 + self.assertEqual(step, len(all_datas)) + else: + step = 0 + try: + py_reader.start() + while True: + img, lbl = exe.run(fetch_list=[image, label]) + self.assertArrayEqual(img, all_datas[step][0]) + self.assertArrayEqual(lbl, all_datas[step][1]) + step += 1 + except fluid.core.EOFException: + py_reader.reset() + self.assertEqual(step, len(all_datas)) + break + + def assertArrayEqual(self, arr1, arr2): + self.assertEqual(arr1.shape, arr2.shape) + self.assertTrue((arr1 == arr2).all()) + + def test_main(self): + reader = random_reader(self.sample_num) + for use_sample_generator in [False, True]: + for iterable in [False, True]: + for drop_last in [False, True]: + with fluid.program_guard(fluid.Program(), fluid.Program()): + self.run_main(reader, use_sample_generator, iterable, + drop_last) + + +class TestCase1(TestCaseBase): + def setUp(self): + self.batch_size = 32 + self.epoch_num = 10 + self.sample_num = 160 + + +class TestCase2(TestCaseBase): + def setUp(self): + self.batch_size = 32 + self.epoch_num = 2 + self.sample_num = 200 + + +class TestCase3(TestCaseBase): + def setUp(self): + self.batch_size = 32 + self.epoch_num = 2 + self.sample_num = 159 + + +if __name__ == '__main__': + unittest.main() From 9773f38f99e0fbb1a19348bb0a1a60d3995afaf6 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Fri, 1 Mar 2019 12:39:49 +0800 Subject: [PATCH 013/230] cache runtime_context test=develop --- paddle/fluid/framework/operator.cc | 21 ++++++++++++++------- paddle/fluid/framework/operator.h | 1 + 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 64592d73e1..c2063b5e6a 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -916,7 +916,14 @@ std::vector* OperatorWithKernel::GetKernelConfig( void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { - RuntimeContext ctx(Inputs(), Outputs(), scope); + if (!runtime_ctx_) { + // RuntimeContext is used to relate input/output names of Operator with + // the corresponding variables in Scope. + // Since the input/output names of Operator do not change in the execution, + // RuntimeContext could be created only at the first iteration of + // the execution to save the elapsed time. + runtime_ctx_ = new RuntimeContext(Inputs(), Outputs(), scope); + } platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -931,7 +938,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, OpKernelMap& kernels = kernels_iter->second; auto expected_kernel_key = this->GetExpectedKernelType( - ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr)); + ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx_, nullptr)); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); @@ -955,8 +962,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // do data transformScope &transfer_scope; std::vector transfered_inplace_vars; - auto* transfer_scope = - PrepareData(scope, expected_kernel_key, &transfered_inplace_vars, &ctx); + auto* transfer_scope = PrepareData(scope, expected_kernel_key, + &transfered_inplace_vars, runtime_ctx_); // exec scope is the scope that kernel actually executed on. const Scope& exec_scope = @@ -966,12 +973,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(expected_kernel_key.place_); } - RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx); + RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx_); this->InferShape(&infer_shape_ctx); // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext // not Scope. Imperative mode only pass inputs and get outputs. - kernel_iter->second( - ExecutionContext(*this, exec_scope, *dev_ctx, ctx, kernel_configs)); + kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, + *runtime_ctx_, kernel_configs)); if (!transfered_inplace_vars.empty()) { // there is inplace variable has been transfered. diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 8a86813e93..e34a0e2141 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -541,6 +541,7 @@ class OperatorWithKernel : public OperatorBase { protected: mutable OpKernelConfigsMap kernel_configs_map_; + mutable RuntimeContext* runtime_ctx_ = nullptr; }; extern bool OpSupportGPU(const std::string& op_type); From 82b0bb9d72b0a023477e2b1361e79a432cf39957 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Fri, 1 Mar 2019 18:23:42 +0800 Subject: [PATCH 014/230] fix cpplint error test=develop --- paddle/fluid/framework/operator.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e34a0e2141..3c3e9096c0 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -16,9 +16,11 @@ limitations under the License. */ #include #include +#include #include #include #include +#include #include #include "glog/logging.h" // For VLOG From 545247d7b4e803a2067c0187b2c3c962ec22629d Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Mon, 4 Mar 2019 17:59:31 +0800 Subject: [PATCH 015/230] add channel wise quantize op. --- paddle/fluid/operators/fake_quantize_op.cc | 62 +++++++++++++++++++ paddle/fluid/operators/fake_quantize_op.cu | 2 + paddle/fluid/operators/fake_quantize_op.h | 33 ++++++++++ .../tests/unittests/test_fake_quantize_op.py | 24 +++++++ 4 files changed, 121 insertions(+) diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index 3bb07d3835..c873ee6718 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -134,6 +134,61 @@ $$Out = round(X/scale * range)$$ } }; +class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of FakeChannelWiseQuantizeOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FakeChannelWiseQuantizeOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("OutScales"), + "Output(Scales) of FakeChannelWiseQuantizeOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->SetOutputDim("OutScales", {ctx->GetInputDim("X")[0]}); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); + } +}; + +class FakeChannelWiseQuantizeAbsMaxOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) Input is float data type."); + AddOutput("Out", + "(Tensor) Output of quantized low level tensor, " + "but also saved as float data type."); + AddOutput("OutScales", "(Tensor) Current channel wise scale"); + AddAttr("bit_length", "(int, default 8)") + .SetDefault(8) + .AddCustomChecker([](const int& bit_length) { + PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, + "'bit_length' should be between 1 and 16."); + }); + AddComment(R"DOC( +The scale of FakeChannelWiseQuantize operator is a vector. +In detail, each channel of the input X has a scale value. + +$$scale_c = max(abs(X_c))$$ +$$range = 2^{bit_length - 1} - 1$$ +$$Out_c = round(X_c / scale_c * range)$$ + +In above three formulas, the range value of c is as follow: +$$0 \leq c \leq \ the\ channel\ number\ of\ X$$ +)DOC"); + } +}; + class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel { public: FakeQuantizeRangeAbsMaxOp(const std::string& type, @@ -218,3 +273,10 @@ REGISTER_OPERATOR(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp, paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxKernel); + +REGISTER_OPERATOR(fake_channel_wise_quantize_abs_max, + ops::FakeChannelWiseQuantizeAbsMaxOp, + ops::FakeChannelWiseQuantizeAbsMaxOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(fake_channel_wise_quantize_abs_max, + ops::FakeChannelWiseQuantizeAbsMaxKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index a0ff639621..5da16a7c73 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -174,5 +174,7 @@ namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max, ops::FakeQuantizeAbsMaxKernel); +REGISTER_OP_CUDA_KERNEL(fake_channel_wise_quantize_abs_max, + ops::FakeChannelWiseQuantizeAbsMaxKernel); REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 7ace7573ec..8b47600e7d 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -63,6 +63,39 @@ class FakeQuantizeAbsMaxKernel : public framework::OpKernel { } }; +template +class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + + auto* out = context.Output("Out"); + auto* out_scales = context.Output("OutScales"); + T* out_scales_data = out_scales->mutable_data(context.GetPlace()); + out->mutable_data(context.GetPlace()); + + int bit_length = context.Attr("bit_length"); + int bin_cnt = std::pow(2, bit_length - 1) - 1; + + auto& dev_ctx = context.template device_context(); + auto find_abs_max = FindAbsMaxFunctor(); + for (int64_t i = 0; i < in->dims()[0]; i++) { + framework::Tensor one_channel = in->Slice(i, i + 1); + const T* one_channel_data = one_channel.data(); + find_abs_max(dev_ctx, one_channel_data, one_channel.numel(), + &out_scales_data[i]); + } + auto clip_quant = ClipAndFakeQuantFunctor(); + for (int64_t i = 0; i < in->dims()[0]; i++) { + framework::Tensor one_channel_in = in->Slice(i, i + 1); + framework::Tensor one_channel_out = out->Slice(i, i + 1); + framework::Tensor one_channel_scale = out_scales->Slice(i, i + 1); + clip_quant(dev_ctx, one_channel_in, one_channel_scale, bin_cnt, + &one_channel_out); + } + } +}; + template class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel { public: diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py index 4582b2a0ee..90a90112bd 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py @@ -35,6 +35,30 @@ class TestFakeQuantizeOp(OpTest): self.check_output() +class TestFakeChannelWiseQuantizeOp(OpTest): + def setUp(self): + self.op_type = "fake_channel_wise_quantize_abs_max" + self.attrs = {'bit_length': 8} + self.inputs = { + 'X': np.random.random((4, 3, 64, 64)).astype("float32"), + } + scales = [] + for i in range(self.inputs['X'].shape[0]): + scales.append(np.max(np.abs(self.inputs['X'][i])).astype("float32")) + outputs = self.inputs['X'].copy() + for i, scale in enumerate(scales): + outputs[i] = np.round(outputs[i] / scale * ( + (1 << (self.attrs['bit_length'] - 1)) - 1)) + + self.outputs = { + 'Out': outputs, + 'OutScales': np.array(scales).astype("float32"), + } + + def test_check_output(self): + self.check_output() + + class TestFakeQuantizeRangeAbsMaxOp(OpTest): def setUp(self): self.op_type = "fake_quantize_range_abs_max" From 784826a4f507e6045d582b2cdf2332af44a46b1a Mon Sep 17 00:00:00 2001 From: luotao1 Date: Mon, 4 Mar 2019 13:49:38 +0800 Subject: [PATCH 016/230] enhance cache runtime_context for different scope test=develop --- paddle/fluid/framework/operator.cc | 20 +++++++++++++------- paddle/fluid/framework/operator.h | 3 ++- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 0b436f4c8b..ef0a4779dc 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/data_transform.h" +#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_proto_maker.h" @@ -917,13 +918,18 @@ std::vector* OperatorWithKernel::GetKernelConfig( void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { - if (!runtime_ctx_) { + const Scope* cur_scope = &scope; + if (!runtime_ctx_ || pre_scope_ != cur_scope || + scope.FindVar(details::kLocalExecScopeName)) { // RuntimeContext is used to relate input/output names of Operator with // the corresponding variables in Scope. - // Since the input/output names of Operator do not change in the execution, - // RuntimeContext could be created only at the first iteration of - // the execution to save the elapsed time. - runtime_ctx_ = new RuntimeContext(Inputs(), Outputs(), scope); + // In a same Scope, since the input/output names of Operator do not change + // in the execution, RuntimeContext could be created only at the first + // iteration of the execution to save the elapsed time. + // Note that the Scope should not be the local scope, since local scope + // would be cleaned regularly. + runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope)); + pre_scope_ = cur_scope; } platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -963,8 +969,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // do data transformScope &transfer_scope; std::vector transfered_inplace_vars; - auto* transfer_scope = PrepareData(scope, expected_kernel_key, - &transfered_inplace_vars, runtime_ctx_); + auto* transfer_scope = PrepareData( + scope, expected_kernel_key, &transfered_inplace_vars, runtime_ctx_.get()); // exec scope is the scope that kernel actually executed on. const Scope& exec_scope = diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 3c3e9096c0..6d21d0c749 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -543,7 +543,8 @@ class OperatorWithKernel : public OperatorBase { protected: mutable OpKernelConfigsMap kernel_configs_map_; - mutable RuntimeContext* runtime_ctx_ = nullptr; + mutable std::unique_ptr runtime_ctx_; + mutable const Scope* pre_scope_ = nullptr; }; extern bool OpSupportGPU(const std::string& op_type); From c0b240aa433939081730f31563f38fc2f410847d Mon Sep 17 00:00:00 2001 From: luotao1 Date: Tue, 5 Mar 2019 15:33:53 +0800 Subject: [PATCH 017/230] try to fix distributed unit-test test=develop --- paddle/fluid/framework/operator.cc | 19 ++++++++++--------- paddle/fluid/framework/scope.cc | 4 ++++ paddle/fluid/framework/scope.h | 4 ++++ 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ef0a4779dc..3959728a20 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -919,15 +919,16 @@ std::vector* OperatorWithKernel::GetKernelConfig( void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { const Scope* cur_scope = &scope; - if (!runtime_ctx_ || pre_scope_ != cur_scope || - scope.FindVar(details::kLocalExecScopeName)) { - // RuntimeContext is used to relate input/output names of Operator with - // the corresponding variables in Scope. - // In a same Scope, since the input/output names of Operator do not change - // in the execution, RuntimeContext could be created only at the first - // iteration of the execution to save the elapsed time. - // Note that the Scope should not be the local scope, since local scope - // would be cleaned regularly. + // RuntimeContext is used to relate input/output names of Operator with + // the corresponding variables in Scope. + // In a same Scope, since the input/output names of Operator do not change + // in the execution, RuntimeContext could be created only at the first + // iteration of the execution to save the elapsed time. + // Note that the Scope should not be the local scope, since local scope + // would be cleaned regularly. + if (scope.FindVar(details::kLocalExecScopeName)) { + runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope)); + } else if (!runtime_ctx_ || pre_scope_ != cur_scope) { runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope)); pre_scope_ = cur_scope; } diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 87f0f307d3..e6de477171 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -107,6 +107,10 @@ const Scope* Scope::FindScope(const Variable* var) const { return FindScopeInternal(var); } +bool Scope::HasLocalVar(const std::string& name) const { + return vars_.find(name) != vars_.end(); +} + void Scope::DropKids() { SCOPE_KIDS_WRITER_LOCK for (Scope* s : kids_) delete s; diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index f0915d2eee..38d3b4d6ce 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -75,6 +75,10 @@ class Scope { /// Caller doesn't own the returned Variable. Variable* FindLocalVar(const std::string& name) const; + /// Find whether a variable in the current scope. + /// Return false if cannot find. + bool HasLocalVar(const std::string& name) const; + const Scope* parent() const { return parent_; } /// Find the scope or an ancestor scope that contains the given variable. From 89dee160d18d699075c2bfbfce6d7311dfa4f59f Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Tue, 5 Mar 2019 16:41:46 +0800 Subject: [PATCH 018/230] add channel wise dequantize op. --- paddle/fluid/operators/fake_dequantize_op.cc | 72 +++++++++++++++++++ paddle/fluid/operators/fake_dequantize_op.cu | 4 ++ paddle/fluid/operators/fake_dequantize_op.h | 51 +++++++++++++ paddle/fluid/operators/fake_quantize_op.cc | 7 +- .../unittests/test_fake_dequantize_op.py | 71 ++++++++++++++++++ 5 files changed, 201 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc index 5d6488c67e..73ffaae6a5 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cc +++ b/paddle/fluid/operators/fake_dequantize_op.cc @@ -76,6 +76,70 @@ $$Out = \frac{scale*X}{ max_range }$$ } }; +class FakeChannelWiseDequantizeMaxAbsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("X"), + "Input(X) of FakeChannelWiseDequantizeMaxAbsOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("WeightScales"), + "Input(WeightScales) of FakeChannelWiseDequantizeMaxAbsOp " + "should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FakeChannelWiseDequantizeMaxAbsOp should not be null."); + + ctx->ShareDim("X", /*->*/ "Out"); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class FakeChannelWiseDequantizeMaxAbsOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor) The input with float-32/64 type is the " + "low precision tensor."); + AddInput("ActivationScale", + "(float) The activation scale in quantization stage.") + .AsDispensable(); + AddInput("WeightScales", + "(float array) The weight scales in quantization stage."); + AddOutput("Out", + "(Tensor) The output is the dequantized high " + "precision tensor."); + AddAttr("activation_bits", "Quantization bit number for activation.") + .SetDefault(8) + .AddCustomChecker([](const int& bit_length) { + PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, + "'activation_bits' should be between 1 and 16."); + }); + AddAttr("weight_bits", "Quantization bit number for weights.") + .SetDefault(8) + .AddCustomChecker([](const int& bit_length) { + PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, + "'weight_bits' should be between 1 and 16."); + }); + + AddComment(R"DOC( +FakeChannelWiseDequantizeMaxAbsOp operator. + +This calculation is an opposite operation of FakeChannelWiseQuantizeMaxAbsOp: + +$$Out_c = \frac{ActivationScale*WeightScale_c*X_c}{(2^{weight\_bits-1}-1)*(2^{activation\_bits-1}-1)}$$ + +In the above formula, the range value of c is as follow: +$$0 \leq c \lt \ the\ channel\ number\ of\ X$$ + +Notes: Tha per-channel quantization is only applied to weights(channel size scale). +And the activations use per-layer quantization(only one scale). +)DOC"); + } +}; + } // namespace operators } // namespace paddle @@ -88,3 +152,11 @@ REGISTER_OPERATOR(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsOp, REGISTER_OP_CPU_KERNEL(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsKernel, ops::FakeDequantizeMaxAbsKernel); + +REGISTER_OPERATOR(fake_channel_wise_dequantize_max_abs, + ops::FakeChannelWiseDequantizeMaxAbsOp, + ops::FakeChannelWiseDequantizeMaxAbsOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(fake_channel_wise_dequantize_max_abs, + ops::FakeChannelWiseDequantizeMaxAbsKernel, + ops::FakeChannelWiseDequantizeMaxAbsKernel); diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu index 225bcc45bc..35dcc69279 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu +++ b/paddle/fluid/operators/fake_dequantize_op.cu @@ -55,3 +55,7 @@ using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsKernel, ops::FakeDequantizeMaxAbsKernel); +REGISTER_OP_CUDA_KERNEL( + fake_channel_wise_dequantize_max_abs, + ops::FakeChannelWiseDequantizeMaxAbsKernel, + ops::FakeChannelWiseDequantizeMaxAbsKernel); diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h index d9923a10da..c26dfa8332 100644 --- a/paddle/fluid/operators/fake_dequantize_op.h +++ b/paddle/fluid/operators/fake_dequantize_op.h @@ -45,5 +45,56 @@ class FakeDequantizeMaxAbsKernel : public framework::OpKernel { } }; +template +class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& ctx) const { + auto* in = ctx.Input("X"); + auto* weight_scales = ctx.Input("WeightScales"); + auto* out = ctx.Output("Out"); + + PADDLE_ENFORCE_EQ(weight_scales->numel(), in->dims()[0], + "The weight uses the per-channel quantization type, so " + "the number of weight scale values must be the same with " + "first dimension value of Input(X)."); + + int ativation_bits = ctx.Attr("activation_bits"); + int weight_bits = ctx.Attr("weight_bits"); + int range = std::pow(2, weight_bits - 1) - 1; + + auto& dev_ctx = ctx.template device_context(); + out->mutable_data(dev_ctx.GetPlace()); + + auto dequant = DequantizeFunctor(); + if (ctx.HasInput("ActivationScale")) { + auto* activation_scale = ctx.Input("ActivationScale"); + PADDLE_ENFORCE_EQ(activation_scale->numel(), 1, + "The activation uses per-layer quantization type, so " + "it must have only one value."); + framework::Tensor cpu_weigth_scales; + framework::TensorCopy(*weight_scales, platform::CPUPlace(), + &cpu_weigth_scales); + dev_ctx.Wait(); + const T* weight_scales_data = cpu_weigth_scales.data(); + range *= (std::pow(2, ativation_bits - 1) - 1); + for (int64_t i = 0; i < in->dims()[0]; i++) { + framework::Tensor one_channel_in = in->Slice(i, i + 1); + framework::Tensor one_channel_out = out->Slice(i, i + 1); + auto max_range = range / weight_scales_data[i]; + dequant(dev_ctx, &one_channel_in, activation_scale, + static_cast(max_range), &one_channel_out); + } + } else { + for (int64_t i = 0; i < in->dims()[0]; i++) { + framework::Tensor one_channel_in = in->Slice(i, i + 1); + framework::Tensor one_channel_out = out->Slice(i, i + 1); + framework::Tensor one_channel_scale = weight_scales->Slice(i, i + 1); + dequant(dev_ctx, &one_channel_in, &one_channel_scale, + static_cast(range), &one_channel_out); + } + } + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index c873ee6718..70186e5efa 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -180,11 +180,10 @@ The scale of FakeChannelWiseQuantize operator is a vector. In detail, each channel of the input X has a scale value. $$scale_c = max(abs(X_c))$$ -$$range = 2^{bit_length - 1} - 1$$ -$$Out_c = round(X_c / scale_c * range)$$ - +$$range = 2^{bit\_length - 1} - 1$$ +$$Out_c = round(\frac{X_c * range} {scale_c})$$ In above three formulas, the range value of c is as follow: -$$0 \leq c \leq \ the\ channel\ number\ of\ X$$ +$$0 \leq c \lt \ the\ channel\ number\ of\ X$$ )DOC"); } }; diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py index 1bb4662e8d..bd8dad4d59 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py @@ -31,6 +31,77 @@ def dequantize_max_abs(x, scale, max_range): return y +def channel_wise_quantize_max_abs(x, max_range): + scales = [] + for i in range(x.shape[0]): + scales.append(np.max(np.abs(x[i])).astype("float32")) + + y = x.copy() + for i, scale in enumerate(scales): + y[i] = np.round(y[i] / scale * max_range) + return y, scales + + +def channel_wise_dequantize_max_abs(x, scales, max_range): + y = x.copy() + for i in range(x.shape[0]): + y[i] = (scales[i] / max_range) * y[i] + return y + + +class TestFakeChannelWiseDequantizeMaxAbsOp(OpTest): + def set_args(self): + self.weight_bits = 8 + self.activation_bits = 2 + self.data_type = "float32" + + def setUp(self): + self.set_args() + self.op_type = "fake_channel_wise_dequantize_max_abs" + x = np.random.randn(4, 3, 64, 64).astype(self.data_type) + max_range = math.pow(2, self.weight_bits - 1) - 1 + yq, scales = channel_wise_quantize_max_abs(x, max_range) + ydq = channel_wise_dequantize_max_abs(yq, scales, max_range) + + self.inputs = { + 'X': yq, + 'ActivationScale': np.array(1.0).astype(self.data_type), + 'WeightScales': np.array(scales).astype(self.data_type) + } + self.attrs = { + 'weight_bits': self.weight_bits, + 'activation_bits': self.activation_bits + } + self.outputs = {'Out': ydq} + + def test_check_output(self): + self.check_output() + + +class TestFakeChannelWiseDequantizeMaxAbsOpNoActivationScale(OpTest): + def set_args(self): + self.weight_bits = 8 + self.data_type = "float32" + + def setUp(self): + self.set_args() + self.op_type = "fake_channel_wise_dequantize_max_abs" + x = np.random.randn(4, 3, 64, 64).astype(self.data_type) + max_range = math.pow(2, self.weight_bits - 1) - 1 + yq, scales = channel_wise_quantize_max_abs(x, max_range) + ydq = channel_wise_dequantize_max_abs(yq, scales, max_range) + + self.inputs = { + 'X': yq, + 'WeightScales': np.array(scales).astype(self.data_type) + } + self.attrs = {'weight_bits': self.weight_bits} + self.outputs = {'Out': ydq} + + def test_check_output(self): + self.check_output() + + class TestFakeDequantizeMaxAbsOp(OpTest): def set_args(self): self.num_bits = 8 From 597dc65e76669e381cf4c52dfeeb671d5eecc4e6 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 5 Mar 2019 10:11:10 +0000 Subject: [PATCH 019/230] enhance gc test=develop --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 3 +- .../framework/details/computation_op_handle.h | 2 + .../details/eager_deletion_op_handle.cc | 5 +- .../framework/details/eager_deletion_pass.cc | 165 ++++++---- .../framework/details/eager_deletion_pass.h | 32 -- .../details/while_op_eager_deletion_pass.cc | 62 ++++ paddle/fluid/framework/executor.cc | 8 +- .../operators/controlflow/CMakeLists.txt | 1 + .../fluid/operators/controlflow/while_op.cc | 9 +- .../operators/controlflow/while_op_helper.cc | 292 ++++++++++++++++++ .../operators/controlflow/while_op_helper.h | 43 +++ .../unittests/test_eager_deletion_while_op.py | 153 +++++++++ ...test_partial_eager_deletion_transformer.py | 26 ++ 14 files changed, 692 insertions(+), 111 deletions(-) delete mode 100644 paddle/fluid/framework/details/eager_deletion_pass.h create mode 100644 paddle/fluid/framework/details/while_op_eager_deletion_pass.cc create mode 100644 paddle/fluid/operators/controlflow/while_op_helper.cc create mode 100644 paddle/fluid/operators/controlflow/while_op_helper.h create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index b9491c953f..ad19d729eb 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -174,7 +174,7 @@ else() cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() -target_link_libraries(executor garbage_collector) +target_link_libraries(executor garbage_collector while_op_helper) cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index dc308fd259..9f06455ea5 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -61,7 +61,8 @@ cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_ cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) -cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) +cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle) +cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass) cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 1e3dbb1e44..67f7cb738f 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -31,6 +31,8 @@ class ComputationOpHandle : public OpHandleBase { ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place, size_t scope_idx); + OperatorBase *GetOp() { return op_.get(); } + std::string Name() const override; const Scope *GetScope() const { return scope_; } diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 9faef8a186..e58e501e6d 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -25,8 +25,6 @@ namespace paddle { namespace framework { namespace details { -static const std::string kEagerDeletionOpName{"eager_deletion"}; // NOLINT - EagerDeletionOpHandle::EagerDeletionOpHandle( ir::Node *node, const Scope *scope, const platform::Place &place, const std::unordered_set &var_names, GarbageCollector *gc, @@ -61,10 +59,9 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { #endif } -std::string EagerDeletionOpHandle::Name() const { return kEagerDeletionOpName; } +std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } void EagerDeletionOpHandle::RunImpl() { - platform::RecordEvent event(kEagerDeletionOpName, nullptr); Scope *exec_scope = nullptr; std::deque> garbages; for (auto &name : var_names_) { diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index 6c8cb66b10..566bc15c17 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -21,35 +21,42 @@ #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" -#include "paddle/fluid/framework/details/eager_deletion_pass.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" -DEFINE_double(fraction_of_eager_deletion, 1.0, "Fraction of eager deletion"); -DEFINE_bool(eager_delete_tensor_only, false, ""); +DEFINE_double(memory_fraction_of_eager_deletion, 1.0, + "Fraction of eager deletion"); namespace paddle { namespace framework { namespace details { -namespace { // NOLINT +// op -> variables which can be deleted after op runs using OpToVarNameSetMap = std::unordered_map>; -} // NOLINT +// Check whether the variable is LoDTensor based on static VarDesc info static bool IsLoDTensor(VarDesc *var) { return var->Proto()->type().type() == proto::VarType::LOD_TENSOR; } -static int64_t GetNumel(const GraphVars &vars, const std::string &var_name, - size_t scope_idx) { - auto *var_desc = TryGetLatestVarDesc(vars[scope_idx].at(var_name)); +// Get memory size of LoDTensor +static int64_t GetMemorySize( + const std::unordered_map> &vars, + const std::string &var_name) { + auto *var_desc = TryGetLatestVarDesc(vars.at(var_name)); + PADDLE_ENFORCE_NOT_NULL(var_desc); PADDLE_ENFORCE(IsLoDTensor(var_desc)); auto dims = var_desc->GetShape(); - return std::accumulate(dims.begin(), dims.end(), static_cast(1), + return SizeOfType(var_desc->GetDataType()) * + std::accumulate(dims.begin(), dims.end(), static_cast(1), std::multiplies()); } +// Split all variables in the graph into LoDTensor and Non-LoDTensor (e.g. +// SelectedRows, LoDTensorArray) +// Since partial GC is based on static analysis of memory size of each variable +// So we should skip SelectedRows and LoDTensorArray here static void SplitIntoLoDTensorAndNonLoDTensorVars( const OpToVarNameSetMap &m, const GraphVars &vars, OpToVarNameSetMap *lod_tensors, OpToVarNameSetMap *other_vars) { @@ -69,76 +76,106 @@ static void SplitIntoLoDTensorAndNonLoDTensorVars( } } -static OpToVarNameSetMap ShrinkGCVars(const OpToVarNameSetMap &m, - const GraphVars &vars, - double fraction_of_memory_size, - bool delete_lod_tensor_only = false) { - // Do not perform gc +struct GCVarInfo { + GCVarInfo(const std::string &name, int64_t memory_size, + ComputationOpHandle *op, size_t scope_idx) + : name_(name), + memory_size_(memory_size), + op_(op), + scope_idx_(scope_idx) {} + + std::string name_; // variable name + int64_t memory_size_; // memory size + ComputationOpHandle *op_; // op after which the variable could be deleted + size_t scope_idx_; // scope index where the variable locates + + int64_t AbsMemorySize() const { return std::abs(memory_size_); } +}; + +// Delete delete_lod_tensor_only is not used currently +static OpToVarNameSetMap ShrinkGCVars( + const OpToVarNameSetMap &m, const GraphVars &vars, + const std::vector &places, double fraction_of_memory_size, + bool delete_lod_tensor_only = false) { + // Do not perform gc when fraction_of_memory_size = 0 if (fraction_of_memory_size <= 0.0) return {}; - // Perform complete gc + /** + * Step 1: Split all variables into LoDTensor and Non-LoDTensor. + * We can only calculate memory size of LoDTensors + */ + OpToVarNameSetMap lod_tensors, other_vars; + SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars); + + // Perform complete gc when fraction_of_memory_size >= 1 if (fraction_of_memory_size >= 1.0) { - if (delete_lod_tensor_only) { - OpToVarNameSetMap lod_tensors, other_vars; - SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars); - return lod_tensors; - } else { - return m; - } + return delete_lod_tensor_only ? lod_tensors : m; } - // Perform partial gc - OpToVarNameSetMap lod_tensors, other_vars; - SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars); + /** + * Step 2: build GCVarInfos, and calculate total memory sizes of each device + */ - using TupleType = std::tuple; + // place -> variable info (name, memory size, place, scope_idx) + std::map> place_to_vars; - std::unordered_map> place_to_vars; - std::unordered_map total_memory_size; + // place -> total memory sizes + std::map place_to_size; for (auto &op_vars_pair : lod_tensors) { - auto scope_idx = op_vars_pair.first->GetScopeIdx(); - int64_t size = 0; - for (auto &var_name : op_vars_pair.second) { - auto var_size = GetNumel(vars, var_name, scope_idx); - size += std::abs(var_size); - place_to_vars[scope_idx].emplace_back(var_name, op_vars_pair.first, - var_size); + auto *op = op_vars_pair.first; + auto &var_names = op_vars_pair.second; + auto scope_idx = op->GetScopeIdx(); + auto &place = places[scope_idx]; + + for (auto &var_name : var_names) { + auto var_size = GetMemorySize(vars[scope_idx], var_name); + GCVarInfo var_info(var_name, var_size, op, scope_idx); + place_to_size[place] += var_info.AbsMemorySize(); + place_to_vars[place].emplace_back(std::move(var_info)); } - total_memory_size.emplace(scope_idx, size); } - for (auto &pair : place_to_vars) { - std::sort(pair.second.begin(), pair.second.end(), - [](const TupleType &t1, const TupleType &t2) { - return std::abs(std::get<2>(t1)) > std::abs(std::get<2>(t2)); + /** + * Step 3: sort GCVarInfos, and only delete the largest variables. + */ + OpToVarNameSetMap partial_vars; + for (auto &place_to_var_pair : place_to_vars) { + auto &place = place_to_var_pair.first; + auto &gc_vars = place_to_var_pair.second; + std::sort(gc_vars.begin(), gc_vars.end(), + [](const GCVarInfo &var1, const GCVarInfo &var2) { + return var1.AbsMemorySize() > var2.AbsMemorySize(); }); - } - OpToVarNameSetMap ret; - for (auto &pair : place_to_vars) { - auto desired_delete_size = static_cast( - fraction_of_memory_size * total_memory_size.at(pair.first)); - int64_t cur_size = 0; - for (size_t i = 0; i < pair.second.size() && cur_size < desired_delete_size; + int64_t accumulated_size = 0; + int64_t size_threshold = + static_cast(fraction_of_memory_size * place_to_size[place]); + for (size_t i = 0; i < gc_vars.size() && accumulated_size < size_threshold; ++i) { - auto &var_name = std::get<0>(pair.second[i]); - auto *op = std::get<1>(pair.second[i]); - cur_size += std::get<2>(pair.second[i]); - ret[op].insert(var_name); + partial_vars[gc_vars[i].op_].insert(gc_vars[i].name_); + accumulated_size += gc_vars[i].AbsMemorySize(); } } + /** + * Step 4: Combine other vars (SelectedRows, LoDTensorArray) + */ if (!delete_lod_tensor_only) { for (auto &op_vars_pair : other_vars) { - for (auto &var_name : op_vars_pair.second) { - ret[op_vars_pair.first].insert(var_name); - } + partial_vars[op_vars_pair.first].insert(op_vars_pair.second.begin(), + op_vars_pair.second.end()); } } - return ret; + return partial_vars; } +class EagerDeletionPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + std::unique_ptr EagerDeletionPass::ApplyImpl( std::unique_ptr graph) const { auto &ref_cnts = @@ -166,9 +203,8 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( } } - op_vars_map = - ShrinkGCVars(op_vars_map, vars, FLAGS_fraction_of_eager_deletion, - FLAGS_eager_delete_tensor_only); + op_vars_map = ShrinkGCVars(op_vars_map, vars, places, + FLAGS_memory_fraction_of_eager_deletion); for (auto &pair : op_vars_map) { auto *op = pair.first; @@ -200,12 +236,13 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( eager_deletion_op->AddOutput(dummy_leaf); } - VLOG(10) << "FLAGS_fraction_of_eager_deletion = " - << FLAGS_fraction_of_eager_deletion; - VLOG(10) << "FLAGS_eager_delete_tensor_only = " - << FLAGS_eager_delete_tensor_only; + VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = " + << FLAGS_memory_fraction_of_eager_deletion; VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)"; - return graph; + + auto while_op_eager_deletion_pass = + ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass"); + return while_op_eager_deletion_pass->Apply(std::move(graph)); } } // namespace details @@ -218,3 +255,5 @@ REGISTER_PASS(eager_deletion_pass, .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars) .RequirePassAttr(paddle::framework::details::kAllPlaces) .RequirePassAttr(paddle::framework::details::kGarbageCollector); + +USE_PASS(while_op_eager_deletion_pass); diff --git a/paddle/fluid/framework/details/eager_deletion_pass.h b/paddle/fluid/framework/details/eager_deletion_pass.h deleted file mode 100644 index d7a7a9709d..0000000000 --- a/paddle/fluid/framework/details/eager_deletion_pass.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" - -namespace paddle { -namespace framework { -namespace details { - -class EagerDeletionPass : public ir::Pass { - protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc b/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc new file mode 100644 index 0000000000..fd6b6dd227 --- /dev/null +++ b/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/operators/controlflow/while_op_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +class WhileOpEagerDeletionPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override { + auto all_ops = ir::FilterByNodeWrapper(*graph); + + // Find all while_op and while_grad_op + std::unordered_map, + std::vector>> + target_ops; + for (auto *op : all_ops) { + auto compute_op = dynamic_cast(op); + if (compute_op == nullptr) continue; + + if (compute_op->Name() == "while") { + target_ops[compute_op->GetScopeIdx()].first.emplace_back( + compute_op->GetOp()); + } else if (compute_op->Name() == "while_grad") { + target_ops[compute_op->GetScopeIdx()].second.emplace_back( + compute_op->GetOp()); + } + } + + for (auto &ops_pair : target_ops) { + auto &while_ops = ops_pair.second.first; + auto &while_grad_ops = ops_pair.second.second; + operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + while_ops, while_grad_ops); + } + return graph; + } +}; + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(while_op_eager_deletion_pass, + paddle::framework::details::WhileOpEagerDeletionPass); diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index c31d0beec3..5555679412 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" @@ -409,8 +410,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr gc; - // skip while_op and while_grad_op temporarily - if (max_memory_size >= 0 && !keep_kids) { + if (max_memory_size >= 0) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { @@ -428,6 +428,10 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, #ifdef PADDLE_WITH_CUDA } #endif + if (gc) { + operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(ctx->block_id_, + ctx->ops_); + } } for (auto& op : ctx->ops_) { diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index b614e9b035..7aa1c44eaa 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -1,4 +1,5 @@ include(operators) register_operators(DEPS naive_executor) +cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator) file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 0360cf5273..8352ba4f2b 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/operators/detail/safe_ref.h" namespace paddle { @@ -26,14 +27,6 @@ namespace operators { using StepScopeVar = std::vector; using LoDTensor = framework::LoDTensor; -static constexpr char kStepBlock[] = "sub_block"; -static constexpr char kCondition[] = "Condition"; -static constexpr char kStepScopes[] = "StepScopes"; -static constexpr char kX[] = "X"; -static constexpr char kXGRAD[] = "X@GRAD"; -static constexpr char kOutputs[] = "Out"; -static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; - namespace { // NOLINT static std::string GetSkipEagerDeletionVarsDebugString( const std::vector &vars) { diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc new file mode 100644 index 0000000000..0324a1586a --- /dev/null +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -0,0 +1,292 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/controlflow/while_op_helper.h" +#include +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace operators { + +// OpVariant is a wrapper class of OpDesc and OperatorBase +// So that API would be the same. +class OpVariant { + struct InputsVisitor + : public boost::static_visitor { + template + const framework::VariableNameMap *operator()(const OpType *op) const { + return &(op->Inputs()); + } + }; + + struct OutputsVisitor + : public boost::static_visitor { + template + const framework::VariableNameMap *operator()(const OpType *op) const { + return &(op->Outputs()); + } + }; + + struct AttributeMapVisitor + : public boost::static_visitor { + const framework::AttributeMap *operator()( + const framework::OpDesc *op) const { + return &(op->GetAttrMap()); + } + + const framework::AttributeMap *operator()( + const framework::OperatorBase *op) const { + return &(op->Attrs()); + } + }; + + struct RawPointerVisitor : public boost::static_visitor { + template + const void *operator()(const OpType *op) const { + return op; + } + }; + + public: + OpVariant(const framework::OperatorBase *op) : op_(op) {} // NOLINT + + OpVariant(const framework::OpDesc *op) : op_(op) {} // NOLINT + + const framework::VariableNameMap &Inputs() const { + return *boost::apply_visitor(InputsVisitor(), op_); + } + + const framework::VariableNameMap &Outputs() const { + return *boost::apply_visitor(OutputsVisitor(), op_); + } + + const framework::AttributeMap &Attrs() const { + return *boost::apply_visitor(AttributeMapVisitor(), op_); + } + + template + const AttrType &Attr(const std::string &name) const { + auto &attrs = Attrs(); + auto it = attrs.find(name); + PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name); + return boost::get(it->second); + } + + bool operator==(const OpVariant &other) const { + return RawPointer() == other.RawPointer(); + } + + const void *RawPointer() const { + return boost::apply_visitor(RawPointerVisitor(), op_); + } + + int which() const { return static_cast(op_.which()); } + + struct Hasher { + size_t operator()(const OpVariant &op) const { + return reinterpret_cast(op.RawPointer()); + } + }; + + private: + const boost::variant + op_; +}; + +static std::string GetDebugString(const std::vector &names) { + if (names.empty()) return ""; + std::string ret = names[0]; + for (size_t i = 1; i < names.size(); ++i) { + ret += (" " + names[i]); + } + return ret; +} + +// Set skip variables of while_op and while_grad_op +// These variables should be skipped when eager deletion enables. +// It is because: +// 1. while_grad_op needs some variables defined in while_op. +// 2. while_grad_op needs variables from the previous time step. +static void SetSkipVars(const OpVariant &op, std::vector attr) { + auto &attrs = const_cast(op.Attrs()); + VLOG(2) << "Prepare to skip " << attr.size() + << " var(s): " << GetDebugString(attr); + attrs[kSkipEagerDeletionVars] = std::move(attr); +} + +// Check whether the forward while_op and while_grad_op match +// The program may have many while_ops. +static bool IsMatchedWhileOpAndWhileGradOp(const OpVariant &fwd_op, + const OpVariant &grad_op) { + return fwd_op.Inputs().at(kX) == grad_op.Inputs().at(kX) && + fwd_op.Outputs().at(kOutputs) == grad_op.Inputs().at(kOutputs); +} + +// Test whether the variable is skippable in forward while_op +// The variable is skippable in while_op when the variable used in while_grad +// is not from grad_block. +static bool IsSkippableVar(const std::string &name, + framework::BlockDesc *grad_block) { + return name != framework::kEmptyVarName && !grad_block->HasVar(name); +} + +static void ModifyWhileOpAndWhileGradOpAttr(const OpVariant &fwd_op, + const OpVariant &bwd_op) { + auto *grad_block = bwd_op.Attr(kStepBlock); + + // Find all skippable variables in forward while_op + std::unordered_set forward_skip_vars; + for (auto *op_desc : grad_block->AllOps()) { + for (auto &in_arg_name : op_desc->InputArgumentNames()) { + if (IsSkippableVar(in_arg_name, grad_block)) { + forward_skip_vars.insert(in_arg_name); + } + } + + for (auto &out_arg_name : op_desc->OutputArgumentNames()) { + if (IsSkippableVar(out_arg_name, grad_block)) { + forward_skip_vars.insert(out_arg_name); + } + } + } + + SetSkipVars(fwd_op, std::vector(forward_skip_vars.begin(), + forward_skip_vars.end())); + + // Find all skippable variables in while_grad_op + // The skipped variables are those which would be used across time steps. + auto &fwd_input = fwd_op.Inputs().at(kX); + auto &in_grads = bwd_op.Outputs().at(framework::GradVarName(kX)); + PADDLE_ENFORCE_EQ( + fwd_input.size(), in_grads.size(), + "Backward input gradient number does not match forward input number."); + + std::unordered_set backward_skip_vars; + for (size_t i = 0; i < in_grads.size(); ++i) { + if (in_grads[i] == framework::kEmptyVarName) { + continue; + } + backward_skip_vars.insert(in_grads[i]); + backward_skip_vars.insert(framework::GradVarName(fwd_input[i])); + } + + SetSkipVars(bwd_op, std::vector(backward_skip_vars.begin(), + backward_skip_vars.end())); +} + +// Find all while_ops and while_grad_ops in the graph or program +// The while_grad_op and while_op may located in different blocks +// So we should traverse all blocks in the program and find them out. +static void FindAllWhileAndWhileGradOp(std::vector *while_ops, + std::vector *while_grad_ops) { + PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size()); + + if (while_ops->empty()) return; + + const auto *program = + while_ops->front().Attr(kStepBlock)->Program(); + for (size_t i = 1; i < program->Size(); ++i) { + auto &block = program->Block(i); + for (size_t j = 0; j < block.OpSize(); ++j) { + auto *op = block.Op(j); + if (op->Type() == "while") { + while_ops->emplace_back(op); + } else if (op->Type() == "while_grad") { + while_grad_ops->emplace_back(op); + } + } + } + + PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size(), + "There are extra while_grad ops in the graph or program"); +} + +static void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl( + std::vector *while_ops, std::vector *while_grad_ops) { + FindAllWhileAndWhileGradOp(while_ops, while_grad_ops); + + VLOG(2) << "Found while op num: " << while_ops->size() + << ", while grad op num: " << while_grad_ops->size(); + + if (while_grad_ops->empty()) { + return; + } + + std::unordered_set while_op_set( + while_ops->begin(), while_ops->end()); + + for (auto &bwd_op : *while_grad_ops) { + const OpVariant *matched_fwd_op = nullptr; + for (auto &fwd_op : while_op_set) { + if (IsMatchedWhileOpAndWhileGradOp(fwd_op, bwd_op)) { + PADDLE_ENFORCE(matched_fwd_op == nullptr, + "Found multiple matched while ops"); + matched_fwd_op = &fwd_op; + } + } + PADDLE_ENFORCE_NOT_NULL(matched_fwd_op, + "Cannot find matched forward while op."); + ModifyWhileOpAndWhileGradOpAttr(*matched_fwd_op, bwd_op); + while_op_set.erase(*matched_fwd_op); + } + + PADDLE_ENFORCE(while_op_set.empty(), + "There are not matched while_grad op in graph."); +} + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + int block_id, + const std::vector> &all_ops) { + // If block_id is not 0, returns + // This is because all while_ops and while_grad_ops in the whole program + // would be processed when block_id is 0 (i.e. when Executor::Run() or + // ParallelExecutor constructs). + + // What's more, all while_ops and while_grad_ops must be processed when + // block_id is zero. If not, while_op may run first and erase variables + // used in while_grad_op, and in this moment, while_grad_ops may be not + // constructed yet. + if (block_id != 0) return; + + std::vector fwd_ops, bwd_ops; + for (auto &op : all_ops) { + if (op->Type() == "while") { + fwd_ops.emplace_back(op.get()); + } else if (op->Type() == "while_grad") { + bwd_ops.emplace_back(op.get()); + } + } + PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops); +} + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + const std::vector &while_ops, + const std::vector &while_grad_ops) { + std::vector fwd_ops, bwd_ops; + fwd_ops.reserve(while_ops.size()); + for (auto *op : while_ops) { + fwd_ops.emplace_back(op); + } + + bwd_ops.reserve(while_grad_ops.size()); + for (auto *op : while_grad_ops) { + bwd_ops.emplace_back(op); + } + + PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h new file mode 100644 index 0000000000..456ba8642b --- /dev/null +++ b/paddle/fluid/operators/controlflow/while_op_helper.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace operators { + +static constexpr char kStepBlock[] = "sub_block"; +static constexpr char kCondition[] = "Condition"; +static constexpr char kStepScopes[] = "StepScopes"; +static constexpr char kX[] = "X"; +static constexpr char kXGRAD[] = "X@GRAD"; +static constexpr char kOutputs[] = "Out"; +static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + int block_id, + const std::vector> &all_ops); + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + const std::vector &while_ops, + const std::vector &while_grad_ops); + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py new file mode 100644 index 0000000000..7fa1636579 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py @@ -0,0 +1,153 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +os.environ['CPU_NUM'] = '2' +os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0' +os.environ['FLAGS_fast_eager_deletion_mode'] = '1' + +import unittest +import paddle.fluid as fluid +import paddle.fluid.layers as layers +from paddle.fluid.executor import Executor +import paddle.fluid.core as core +from paddle.fluid.backward import append_backward +import paddle.fluid.compiler as compiler +import numpy +import multiprocessing + + +class TestEagerDeletionWhileOpBase(unittest.TestCase): + def test_main(self): + places = [core.CPUPlace(), ] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for p in places: + for with_data_parallel in [False, True]: + with fluid.program_guard(fluid.Program(), fluid.Program()): + with fluid.scope_guard(fluid.Scope()): + self.run_main(p, with_data_parallel) + + def run_main(self, place, with_data_parallel): + self.place = place + self.with_data_parallel = with_data_parallel + + if not core.is_compiled_with_cuda() and isinstance(self.place, + core.CUDPlace): + return + + if isinstance(self.place, core.CUDAPlace): + device_cnt = core.get_cuda_device_count( + ) if self.with_data_parallel else 1 + else: + device_cnt = int( + os.environ['CPU_NUM'], + multiprocessing.cpu_count()) if self.with_data_parallel else 1 + + d0 = layers.data( + "d0", shape=[10], append_batch_size=False, dtype='float32') + d1 = layers.data( + "d1", shape=[10], append_batch_size=False, dtype='float32') + d2 = layers.data( + "d2", shape=[10], append_batch_size=False, dtype='float32') + + i = layers.zeros(shape=[1], dtype='int64') + i.stop_gradient = True + + init = layers.zeros(shape=[10], dtype='float32') + mem_array = layers.array_write(x=init, i=i) + data_array = layers.array_write(x=d0, i=i) + + i = layers.increment(i) + layers.array_write(d1, i, array=data_array) + + i = layers.increment(i) + layers.array_write(d2, i, array=data_array) + + i = layers.zeros(shape=[1], dtype='int64') + i.stop_gradient = True + + array_len = layers.fill_constant(shape=[1], dtype='int64', value=1) + array_len.stop_gradient = True + cond = layers.less_than(x=i, y=array_len) + + j = layers.fill_constant(shape=[1], dtype='int64', value=1) + j.stop_gradient = True + + array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3) + array_len2.stop_gradient = True + cond2 = layers.less_than(x=j, y=array_len2) + + while_op = layers.While(cond=cond) + while_op2 = layers.While(cond=cond2) + with while_op.block(): + d = layers.array_read(array=data_array, i=i) + prev = layers.array_read(array=mem_array, i=i) + d = layers.reshape(d, shape=[10]) + prev = layers.reshape(prev, shape=[10]) + result = layers.sums(input=[d, prev]) + + i = layers.increment(x=i, in_place=True) + layers.array_write(result, i=i, array=mem_array) + layers.less_than(x=i, y=array_len, cond=cond) + with while_op2.block(): + d2 = layers.array_read(array=data_array, i=j) + prev2 = layers.array_read(array=mem_array, i=j) + d2 = layers.reshape(d2, shape=[10]) + prev2 = layers.reshape(prev2, shape=[10]) + result2 = layers.sums(input=[d2, prev2]) + + j = layers.increment(x=j, in_place=True) + layers.array_write(result2, i=j, array=mem_array) + layers.less_than(x=j, y=array_len2, cond=cond2) + + sum_result = layers.array_read(array=mem_array, i=j) + sum_result.persistable = True + tmp = layers.unsqueeze(sum_result, axes=[0]) + tmp = layers.expand(tmp, expand_times=[10, 1]) + fc = layers.fc(tmp, size=256) + loss = layers.mean(sum_result) + + optim = fluid.optimizer.Adam(learning_rate=1e-3) + optim.minimize(loss) + + exe = Executor(self.place) + exe.run(fluid.default_startup_program()) + + prog = compiler.CompiledProgram(fluid.default_main_program()) + if self.with_data_parallel: + prog = prog.with_data_parallel() + + for _ in range(5): + d = [] + for i in range(3): + tmp = numpy.random.random(size=[10]).astype('float32') + if not self.with_data_parallel: + d.append(tmp) + else: + d.append(numpy.array([tmp] * device_cnt)) + + outs = exe.run(program=prog, + feed={'d0': d[0], + 'd1': d[1], + 'd2': d[2]}, + fetch_list=[sum_result]) + self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py new file mode 100644 index 0000000000..d44e4627d8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py @@ -0,0 +1,26 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" +os.environ['FLAGS_memory_fraction_of_eager_deletion'] = "0.55" + +os.environ[ + 'RECORDIO_FILENAME'] = '/tmp/eager_deletion_transformer.wmt16.recordio' + +from test_parallel_executor_transformer import TestTransformer + +if __name__ == '__main__': + unittest.main() From 806832e09163500fa01b8e9eabb871424dc26dbd Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Tue, 5 Mar 2019 20:15:41 +0800 Subject: [PATCH 020/230] update the input format of channel wise dequantize op. --- paddle/fluid/operators/fake_dequantize_op.cc | 42 ++++++++----------- paddle/fluid/operators/fake_dequantize_op.h | 38 +++++++---------- .../unittests/test_fake_dequantize_op.py | 27 ++++++------ 3 files changed, 46 insertions(+), 61 deletions(-) diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc index 73ffaae6a5..68c7227e5a 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cc +++ b/paddle/fluid/operators/fake_dequantize_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fake_dequantize_op.h" #include +#include namespace paddle { namespace operators { @@ -84,8 +85,8 @@ class FakeChannelWiseDequantizeMaxAbsOp : public framework::OperatorWithKernel { PADDLE_ENFORCE( ctx->HasInput("X"), "Input(X) of FakeChannelWiseDequantizeMaxAbsOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("WeightScales"), - "Input(WeightScales) of FakeChannelWiseDequantizeMaxAbsOp " + PADDLE_ENFORCE(ctx->HasInputs("Scales"), + "Input(Scales) of FakeChannelWiseDequantizeMaxAbsOp " "should not be null."); PADDLE_ENFORCE( ctx->HasOutput("Out"), @@ -103,39 +104,32 @@ class FakeChannelWiseDequantizeMaxAbsOpMaker AddInput("X", "(Tensor) The input with float-32/64 type is the " "low precision tensor."); - AddInput("ActivationScale", - "(float) The activation scale in quantization stage.") - .AsDispensable(); - AddInput("WeightScales", - "(float array) The weight scales in quantization stage."); + AddInput("Scales", + "(Tensors) The scales in quantization stage. " + "Now, `Scales` is a vector with at most two tensors. " + "If Scales has two elements, the second tensor should only have " + "one value.") + .AsDuplicable(); AddOutput("Out", "(Tensor) The output is the dequantized high " "precision tensor."); - AddAttr("activation_bits", "Quantization bit number for activation.") - .SetDefault(8) - .AddCustomChecker([](const int& bit_length) { - PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, - "'activation_bits' should be between 1 and 16."); - }); - AddAttr("weight_bits", "Quantization bit number for weights.") - .SetDefault(8) - .AddCustomChecker([](const int& bit_length) { - PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, - "'weight_bits' should be between 1 and 16."); - }); + AddAttr>( + "quant_bits", + "Quantization bit numbers in quantization stage. " + "The size of `quant_bits` should be equal to the size of `Scales`.") + .SetDefault({8}); AddComment(R"DOC( FakeChannelWiseDequantizeMaxAbsOp operator. This calculation is an opposite operation of FakeChannelWiseQuantizeMaxAbsOp: -$$Out_c = \frac{ActivationScale*WeightScale_c*X_c}{(2^{weight\_bits-1}-1)*(2^{activation\_bits-1}-1)}$$ +$$Out_c = \frac{X_c\prod_{i=1}^{n}Scales_{ic}}{\prod_{i=1}^{n}(2^{quant\_bits_i-1}-1)}$$ -In the above formula, the range value of c is as follow: -$$0 \leq c \lt \ the\ channel\ number\ of\ X$$ +In the above formula, the range value of $c$ can be represented as $0 \leq c \lt \ the\ channel\ number\ of\ X$. +Besides, the size of $quant\_bits$ should be equal to the size of $Scales$, and it is called $n$ in the formula. -Notes: Tha per-channel quantization is only applied to weights(channel size scale). -And the activations use per-layer quantization(only one scale). +Notes: In general, the per-channel quantization is only applied to weights and the activations use per-layer quantization. )DOC"); } }; diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h index c26dfa8332..549f5039f4 100644 --- a/paddle/fluid/operators/fake_dequantize_op.h +++ b/paddle/fluid/operators/fake_dequantize_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -50,47 +51,40 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& ctx) const { auto* in = ctx.Input("X"); - auto* weight_scales = ctx.Input("WeightScales"); + auto scales = ctx.MultiInput("Scales"); auto* out = ctx.Output("Out"); - PADDLE_ENFORCE_EQ(weight_scales->numel(), in->dims()[0], - "The weight uses the per-channel quantization type, so " - "the number of weight scale values must be the same with " + PADDLE_ENFORCE_EQ(scales[0]->numel(), in->dims()[0], + "The number of first scale values must be the same with " "first dimension value of Input(X)."); - int ativation_bits = ctx.Attr("activation_bits"); - int weight_bits = ctx.Attr("weight_bits"); - int range = std::pow(2, weight_bits - 1) - 1; + auto quant_bits = ctx.Attr>("quant_bits"); + int max_range = std::pow(2, quant_bits[0] - 1) - 1; auto& dev_ctx = ctx.template device_context(); out->mutable_data(dev_ctx.GetPlace()); auto dequant = DequantizeFunctor(); - if (ctx.HasInput("ActivationScale")) { - auto* activation_scale = ctx.Input("ActivationScale"); - PADDLE_ENFORCE_EQ(activation_scale->numel(), 1, - "The activation uses per-layer quantization type, so " - "it must have only one value."); - framework::Tensor cpu_weigth_scales; - framework::TensorCopy(*weight_scales, platform::CPUPlace(), - &cpu_weigth_scales); - dev_ctx.Wait(); - const T* weight_scales_data = cpu_weigth_scales.data(); - range *= (std::pow(2, ativation_bits - 1) - 1); + if (scales.size() == 2) { + PADDLE_ENFORCE_EQ( + scales[1]->numel(), 1, + "The second scale tensor should only have one value at now."); for (int64_t i = 0; i < in->dims()[0]; i++) { framework::Tensor one_channel_in = in->Slice(i, i + 1); framework::Tensor one_channel_out = out->Slice(i, i + 1); - auto max_range = range / weight_scales_data[i]; - dequant(dev_ctx, &one_channel_in, activation_scale, + framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1); + max_range *= (std::pow(2, quant_bits[1] - 1) - 1); + dequant(dev_ctx, &one_channel_in, &one_channel_scale, static_cast(max_range), &one_channel_out); } + dequant(dev_ctx, out, scales[1], static_cast(1), out); } else { for (int64_t i = 0; i < in->dims()[0]; i++) { framework::Tensor one_channel_in = in->Slice(i, i + 1); framework::Tensor one_channel_out = out->Slice(i, i + 1); - framework::Tensor one_channel_scale = weight_scales->Slice(i, i + 1); + framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1); dequant(dev_ctx, &one_channel_in, &one_channel_scale, - static_cast(range), &one_channel_out); + static_cast(max_range), &one_channel_out); } } } diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py index bd8dad4d59..8d91d8fd1d 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py @@ -49,53 +49,50 @@ def channel_wise_dequantize_max_abs(x, scales, max_range): return y -class TestFakeChannelWiseDequantizeMaxAbsOp(OpTest): +class TestFakeChannelWiseDequantizeMaxAbsOpTwoScales(OpTest): def set_args(self): - self.weight_bits = 8 - self.activation_bits = 2 + self.quant_bits = [8, 2] self.data_type = "float32" def setUp(self): self.set_args() self.op_type = "fake_channel_wise_dequantize_max_abs" x = np.random.randn(4, 3, 64, 64).astype(self.data_type) - max_range = math.pow(2, self.weight_bits - 1) - 1 + max_range = math.pow(2, self.quant_bits[0] - 1) - 1 + max_range *= (math.pow(2, self.quant_bits[1] - 1) - 1) yq, scales = channel_wise_quantize_max_abs(x, max_range) ydq = channel_wise_dequantize_max_abs(yq, scales, max_range) self.inputs = { 'X': yq, - 'ActivationScale': np.array(1.0).astype(self.data_type), - 'WeightScales': np.array(scales).astype(self.data_type) - } - self.attrs = { - 'weight_bits': self.weight_bits, - 'activation_bits': self.activation_bits + 'Scales': [("scales0", np.array(scales).astype(self.data_type)), + ("scales1", np.array([1.0]).astype(self.data_type))] } + self.attrs = {'quant_bits': self.quant_bits} self.outputs = {'Out': ydq} def test_check_output(self): self.check_output() -class TestFakeChannelWiseDequantizeMaxAbsOpNoActivationScale(OpTest): +class TestFakeChannelWiseDequantizeMaxAbsOpOneScale(OpTest): def set_args(self): - self.weight_bits = 8 + self.quant_bits = [8] self.data_type = "float32" def setUp(self): self.set_args() self.op_type = "fake_channel_wise_dequantize_max_abs" x = np.random.randn(4, 3, 64, 64).astype(self.data_type) - max_range = math.pow(2, self.weight_bits - 1) - 1 + max_range = math.pow(2, self.quant_bits[0] - 1) - 1 yq, scales = channel_wise_quantize_max_abs(x, max_range) ydq = channel_wise_dequantize_max_abs(yq, scales, max_range) self.inputs = { 'X': yq, - 'WeightScales': np.array(scales).astype(self.data_type) + 'Scales': [("scales0", np.array(scales).astype(self.data_type))] } - self.attrs = {'weight_bits': self.weight_bits} + self.attrs = {'quant_bits': self.quant_bits} self.outputs = {'Out': ydq} def test_check_output(self): From 8063b31e2d485b665303a2010e63909ba53d1664 Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Tue, 5 Mar 2019 22:54:22 +0800 Subject: [PATCH 021/230] Reduce redundant code for channel wise dequant op. test=develop --- paddle/fluid/operators/fake_dequantize_op.h | 27 +++++++---------- .../unittests/test_fake_dequantize_op.py | 30 +++++++++++-------- 2 files changed, 28 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h index 549f5039f4..d05f203853 100644 --- a/paddle/fluid/operators/fake_dequantize_op.h +++ b/paddle/fluid/operators/fake_dequantize_op.h @@ -65,27 +65,20 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel { out->mutable_data(dev_ctx.GetPlace()); auto dequant = DequantizeFunctor(); + for (int64_t i = 0; i < in->dims()[0]; i++) { + framework::Tensor one_channel_in = in->Slice(i, i + 1); + framework::Tensor one_channel_out = out->Slice(i, i + 1); + framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1); + dequant(dev_ctx, &one_channel_in, &one_channel_scale, + static_cast(max_range), &one_channel_out); + } + if (scales.size() == 2) { PADDLE_ENFORCE_EQ( scales[1]->numel(), 1, "The second scale tensor should only have one value at now."); - for (int64_t i = 0; i < in->dims()[0]; i++) { - framework::Tensor one_channel_in = in->Slice(i, i + 1); - framework::Tensor one_channel_out = out->Slice(i, i + 1); - framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1); - max_range *= (std::pow(2, quant_bits[1] - 1) - 1); - dequant(dev_ctx, &one_channel_in, &one_channel_scale, - static_cast(max_range), &one_channel_out); - } - dequant(dev_ctx, out, scales[1], static_cast(1), out); - } else { - for (int64_t i = 0; i < in->dims()[0]; i++) { - framework::Tensor one_channel_in = in->Slice(i, i + 1); - framework::Tensor one_channel_out = out->Slice(i, i + 1); - framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1); - dequant(dev_ctx, &one_channel_in, &one_channel_scale, - static_cast(max_range), &one_channel_out); - } + max_range = std::pow(2, quant_bits[1] - 1) - 1; + dequant(dev_ctx, out, scales[1], static_cast(max_range), out); } } }; diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py index 8d91d8fd1d..32cb23cbfa 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py @@ -31,42 +31,49 @@ def dequantize_max_abs(x, scale, max_range): return y -def channel_wise_quantize_max_abs(x, max_range): +def channel_wise_quantize_max_abs(x, quant_bit=8): scales = [] for i in range(x.shape[0]): scales.append(np.max(np.abs(x[i])).astype("float32")) y = x.copy() + max_range = math.pow(2, quant_bit - 1) - 1 for i, scale in enumerate(scales): y[i] = np.round(y[i] / scale * max_range) return y, scales -def channel_wise_dequantize_max_abs(x, scales, max_range): +def channel_wise_dequantize_max_abs(x, + scales, + quant_bits, + activation_scale=None): y = x.copy() for i in range(x.shape[0]): - y[i] = (scales[i] / max_range) * y[i] + y[i] = (scales[i] / (math.pow(2, quant_bits[0] - 1) - 1)) * y[i] + if activation_scale is not None: + y *= activation_scale / (math.pow(2, quant_bits[1] - 1) - 1) return y class TestFakeChannelWiseDequantizeMaxAbsOpTwoScales(OpTest): def set_args(self): - self.quant_bits = [8, 2] + self.quant_bits = [8, 8] self.data_type = "float32" + self.activation_scale = 0.7861 def setUp(self): self.set_args() self.op_type = "fake_channel_wise_dequantize_max_abs" x = np.random.randn(4, 3, 64, 64).astype(self.data_type) - max_range = math.pow(2, self.quant_bits[0] - 1) - 1 - max_range *= (math.pow(2, self.quant_bits[1] - 1) - 1) - yq, scales = channel_wise_quantize_max_abs(x, max_range) - ydq = channel_wise_dequantize_max_abs(yq, scales, max_range) + yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0]) + ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits, + self.activation_scale) self.inputs = { 'X': yq, 'Scales': [("scales0", np.array(scales).astype(self.data_type)), - ("scales1", np.array([1.0]).astype(self.data_type))] + ("scales1", np.array( + [self.activation_scale]).astype(self.data_type))] } self.attrs = {'quant_bits': self.quant_bits} self.outputs = {'Out': ydq} @@ -84,9 +91,8 @@ class TestFakeChannelWiseDequantizeMaxAbsOpOneScale(OpTest): self.set_args() self.op_type = "fake_channel_wise_dequantize_max_abs" x = np.random.randn(4, 3, 64, 64).astype(self.data_type) - max_range = math.pow(2, self.quant_bits[0] - 1) - 1 - yq, scales = channel_wise_quantize_max_abs(x, max_range) - ydq = channel_wise_dequantize_max_abs(yq, scales, max_range) + yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0]) + ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits) self.inputs = { 'X': yq, From 33138a421d72912d0b51ddaf9d1e32f0c8b808d5 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 5 Mar 2019 11:32:23 +0000 Subject: [PATCH 022/230] remove match check test=develop --- paddle/fluid/operators/controlflow/while_op_helper.cc | 3 --- .../fluid/tests/unittests/test_eager_deletion_while_op.py | 6 +++--- .../unittests/test_partial_eager_deletion_transformer.py | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc index 0324a1586a..848ff5e8f1 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -241,9 +241,6 @@ static void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl( ModifyWhileOpAndWhileGradOpAttr(*matched_fwd_op, bwd_op); while_op_set.erase(*matched_fwd_op); } - - PADDLE_ENFORCE(while_op_set.empty(), - "There are not matched while_grad op in graph."); } void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py index 7fa1636579..898d04ebe1 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py @@ -47,7 +47,7 @@ class TestEagerDeletionWhileOpBase(unittest.TestCase): self.with_data_parallel = with_data_parallel if not core.is_compiled_with_cuda() and isinstance(self.place, - core.CUDPlace): + core.CUDAPlace): return if isinstance(self.place, core.CUDAPlace): @@ -55,8 +55,8 @@ class TestEagerDeletionWhileOpBase(unittest.TestCase): ) if self.with_data_parallel else 1 else: device_cnt = int( - os.environ['CPU_NUM'], - multiprocessing.cpu_count()) if self.with_data_parallel else 1 + os.environ.get('CPU_NUM', multiprocessing.cpu_count( + ))) if self.with_data_parallel else 1 d0 = layers.data( "d0", shape=[10], append_batch_size=False, dtype='float32') diff --git a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py index d44e4627d8..ba3b275c7e 100644 --- a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py @@ -18,7 +18,7 @@ os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" os.environ['FLAGS_memory_fraction_of_eager_deletion'] = "0.55" os.environ[ - 'RECORDIO_FILENAME'] = '/tmp/eager_deletion_transformer.wmt16.recordio' + 'RECORDIO_FILENAME'] = '/tmp/partial_eager_deletion_transformer.wmt16.recordio' from test_parallel_executor_transformer import TestTransformer From 7b608396feb0106733295b6ff00cef8d87e505d1 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 6 Mar 2019 07:11:34 +0000 Subject: [PATCH 023/230] fix travis-ci format check test=develop --- paddle/fluid/framework/details/computation_op_handle.h | 1 + paddle/fluid/framework/details/eager_deletion_op_handle.cc | 5 ++++- paddle/fluid/framework/details/eager_deletion_pass.cc | 5 ++++- paddle/fluid/framework/details/reference_count_pass.cc | 4 ++++ .../fluid/framework/details/reference_count_pass_helper.h | 1 + paddle/fluid/framework/executor.cc | 6 +++++- paddle/fluid/operators/controlflow/while_op_helper.cc | 2 ++ .../tests/unittests/test_eager_deletion_transformer.py | 3 +-- .../unittests/test_partial_eager_deletion_transformer.py | 2 +- 9 files changed, 23 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 67f7cb738f..e98b16e6b3 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index e58e501e6d..dbc90737f2 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -12,11 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include +#include + #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/platform/profiler.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" #endif diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index 566bc15c17..377bb915e0 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -25,7 +25,10 @@ #include "paddle/fluid/framework/ir/graph_helper.h" DEFINE_double(memory_fraction_of_eager_deletion, 1.0, - "Fraction of eager deletion"); + "Fraction of eager deletion. If less than 1.0, all variables in " + "the program would be sorted according to its memory size, and " + "only the FLAGS_memory_fraction_of_eager_deletion of the largest " + "variables would be deleted."); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 892f638f1f..6092143449 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -12,9 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include +#include +#include +#include #include #include "paddle/fluid/framework/details/computation_op_handle.h" diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h index d9e8776d7e..ce700119c5 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.h +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 5555679412..7eef9ec504 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -14,6 +14,10 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" #include +#include +#include +#include +#include #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/lod_rank_table.h" @@ -428,7 +432,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, #ifdef PADDLE_WITH_CUDA } #endif - if (gc) { + if (gc && keep_kids) { operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(ctx->block_id_, ctx->ops_); } diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc index 848ff5e8f1..2cbd94a061 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/operators/controlflow/while_op_helper.h" #include +#include +#include #include "paddle/fluid/framework/program_desc.h" namespace paddle { diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py index 603c8e7488..05cc41b96f 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py @@ -16,8 +16,7 @@ import os import unittest os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" -os.environ[ - 'RECORDIO_FILENAME'] = '/tmp/eager_deletion_transformer.wmt16.recordio' +os.environ['RECORDIO_FILENAME'] = './eager_deletion_transformer.wmt16.recordio' from test_parallel_executor_transformer import TestTransformer diff --git a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py index ba3b275c7e..fc1d762ec9 100644 --- a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py @@ -18,7 +18,7 @@ os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" os.environ['FLAGS_memory_fraction_of_eager_deletion'] = "0.55" os.environ[ - 'RECORDIO_FILENAME'] = '/tmp/partial_eager_deletion_transformer.wmt16.recordio' + 'RECORDIO_FILENAME'] = './partial_eager_deletion_transformer.wmt16.recordio' from test_parallel_executor_transformer import TestTransformer From f49d9b393c6b333d741b2df722fa8b41ab923d5e Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 7 Mar 2019 10:15:49 +0800 Subject: [PATCH 024/230] Transfer GRU unit test=develop --- python/paddle/fluid/imperative/nn.py | 137 ++++++++++++++++++++++++++- 1 file changed, 136 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py index 6c5961cc63..bad2c325be 100644 --- a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/imperative/nn.py @@ -22,7 +22,7 @@ from . import layers from ..framework import Variable, OpProtoHolder from ..param_attr import ParamAttr from ..initializer import Normal, Constant -__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding'] +__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit'] class Conv2D(layers.Layer): @@ -496,3 +496,138 @@ class Embedding(layers.Layer): }) return out + + +class GRUUnit(layers.Layer): + """ + **GRU unit layer** + + if origin_mode is True, then the equation of a gru step is from paper + `Learning Phrase Representations using RNN Encoder-Decoder for Statistical + Machine Translation `_ + + .. math:: + u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u) + + r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r) + + m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m) + + h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t) + + if origin_mode is False, then the equation of a gru step is from paper + `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence + Modeling `_ + + .. math:: + u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u) + + r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r) + + m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m) + + h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t) + + + The inputs of gru unit includes :math:`z_t`, :math:`h_{t-1}`. In terms + of the equation above, the :math:`z_t` is split into 3 parts - + :math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to + implement a full GRU unit operator for an input, a fully + connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`. + + The terms :math:`u_t` and :math:`r_t` represent the update and reset gates + of the GRU cell. Unlike LSTM, GRU has one lesser gate. However, there is + an intermediate candidate hidden output, which is denoted by :math:`m_t`. + This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t-1})` + and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`. + + Args: + input (Variable): The fc transformed input value of current step. + hidden (Variable): The hidden value of gru unit from previous step. + size (integer): The input dimension value. + param_attr(ParamAttr|None): The parameter attribute for the learnable + hidden-hidden weight matrix. Note: + + - The shape of the weight matrix is :math:`(T \\times 3D)`, where + :math:`D` is the hidden size. + - All elements in the weight matrix can be divided into two parts. + The first part are weights of the update gate and reset gate with + shape :math:`(D \\times 2D)`, and the second part are weights for + candidate hidden state with shape :math:`(D \\times D)`. + + If it is set to None or one attribute of ParamAttr, gru_unit will + create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias + of GRU.Note that the bias with :math:`(1 \\times 3D)` concatenates + the bias in the update gate, reset gate and candidate calculations. + If it is set to False, no bias will be applied to the update gate, + reset gate and candidate calculations. If it is set to None or one + attribute of ParamAttr, gru_unit will create ParamAttr as + bias_attr. If the Initializer of the bias_attr is not set, the bias + is initialized zero. Default: None. + activation (string): The activation type for cell (actNode). + Default: 'tanh' + gate_activation (string): The activation type for gates (actGate). + Default: 'sigmoid' + + Returns: + tuple: The hidden value, reset-hidden value and gate values. + """ + + def __init__(self, + hidden, + size, + param_attr=None, + bias_attr=None, + activation='tanh', + gate_activation='sigmoid', + origin_mode=False, + dtype='float32'): + + super(GRUUnit, self).__init__() + activation_dict = dict( + identity=0, + sigmoid=1, + tanh=2, + relu=3, ) + activation = activation_dict[activation] + gate_activation = activation_dict[gate_activation] + + helper = LayerHelper('gru_unit', **locals()) + dtype = helper.input_dtype() + size = size // 3 + + # create weight + weight = helper.create_parameter( + attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype) + + gate = helper.create_variable_for_type_inference(dtype) + reset_hidden_pre = helper.create_variable_for_type_inference(dtype) + updated_hidden = helper.create_variable_for_type_inference(dtype) + inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': weight} + # create bias + if helper.bias_attr: + bias_size = [1, 3 * size] + bias = helper.create_parameter( + attr=helper.bias_attr, + shape=bias_size, + dtype=dtype, + is_bias=True) + inputs['Bias'] = bias + + def forward(self, input): + self._helper.append_op( + type='gru_unit', + inputs=inputs, + outputs={ + 'Gate': gate, + 'ResetHiddenPrev': reset_hidden_pre, + 'Hidden': updated_hidden, + }, + attrs={ + 'activation': 2, # tanh + 'gate_activation': 1, # sigmoid + }) + + return updated_hidden, reset_hidden_pre, gate From fad06cb92868a3a3bf78ae00d7bcc499dad7cbce Mon Sep 17 00:00:00 2001 From: luotao1 Date: Thu, 7 Mar 2019 11:57:44 +0800 Subject: [PATCH 025/230] unify ZeroCopy in analysis_test --- paddle/fluid/inference/api/helper.h | 15 +- .../tests/api/analyzer_pyramid_dnn_tester.cc | 3 + .../fluid/inference/tests/api/tester_helper.h | 147 ++++++++++-------- 3 files changed, 96 insertions(+), 69 deletions(-) diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index b92781e4f2..8114754a27 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -127,9 +127,8 @@ static void TensorAssignData(PaddleTensor *tensor, } template -static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, - const std::vector> &data) { - int size{0}; +static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, + const std::vector> &data) { auto *ptr = tensor->mutable_data(PaddlePlace::kCPU); int c = 0; for (const auto &f : data) { @@ -137,7 +136,15 @@ static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, ptr[c++] = v; } } - return size; +} + +template +static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, + const PaddleBuf &data) { + auto *ptr = tensor->mutable_data(PaddlePlace::kCPU); + for (size_t i = 0; i < data.length() / sizeof(T); i++) { + ptr[i] = *(reinterpret_cast(data.data()) + i); + } } static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) { diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc index 3f6c933f2b..df834e75df 100644 --- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc @@ -107,6 +107,9 @@ void SetConfig(AnalysisConfig *cfg) { cfg->DisableGpu(); cfg->SwitchSpecifyInputNames(); cfg->SwitchIrOptim(); + if (FLAGS_zero_copy) { + cfg->SwitchUseFeedFetchOps(false); + } } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 2e53fddfe7..3becb4bf68 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -51,6 +51,7 @@ DEFINE_bool(use_analysis, true, DEFINE_bool(record_benchmark, false, "Record benchmark after profiling the model"); DEFINE_double(accuracy, 1e-3, "Result Accuracy."); +DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch."); DECLARE_bool(profile); DECLARE_int32(paddle_num_threads); @@ -198,61 +199,104 @@ void GetInputPerBatch(const std::vector> &in, } } -void TestOneThreadPrediction( - const PaddlePredictor::Config *config, - const std::vector> &inputs, - std::vector *outputs, bool use_analysis = true) { - int batch_size = FLAGS_batch_size; - int num_times = FLAGS_repeat; - auto predictor = CreateTestPredictor(config, use_analysis); +void ConvertPaddleTensorToZeroCopyTensor( + PaddlePredictor *predictor, const std::vector &inputs) { + for (size_t i = 0; i < inputs.size(); i++) { + auto input = inputs[i]; + auto tensor = predictor->GetInputTensor(input.name); + tensor->Reshape(input.shape); + tensor->SetLoD({input.lod}); + if (input.dtype == PaddleDType::INT64) { + ZeroCopyTensorAssignData(tensor.get(), input.data); + } else if (input.dtype == PaddleDType::FLOAT32) { + ZeroCopyTensorAssignData(tensor.get(), input.data); + } else { + LOG(ERROR) << "unsupported feed type " << input.dtype; + } + } +} - // warmup run - LOG(INFO) << "Warm up run..."; - { - Timer warmup_timer; - warmup_timer.tic(); +void PredictionWarmUp(PaddlePredictor *predictor, + const std::vector> &inputs, + std::vector *outputs, int num_threads, + int tid) { + int batch_size = FLAGS_batch_size; + LOG(INFO) << "Running thread " << tid << ", warm up run..."; + if (FLAGS_zero_copy) { + ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]); + } + Timer warmup_timer; + warmup_timer.tic(); + if (!FLAGS_zero_copy) { predictor->Run(inputs[0], outputs, batch_size); - PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1); - if (FLAGS_profile) { - paddle::platform::ResetProfiler(); - } + } else { + predictor->ZeroCopyRun(); } + PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1); + if (FLAGS_profile) { + paddle::platform::ResetProfiler(); + } +} - LOG(INFO) << "Run " << num_times << " times..."; - { - Timer run_timer; - run_timer.tic(); +void PredictionRun(PaddlePredictor *predictor, + const std::vector> &inputs, + std::vector *outputs, int num_threads, + int tid) { + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; + LOG(INFO) << "Thread " << tid << " run " << num_times << " times..."; + Timer run_timer; + double elapsed_time = 0; #ifdef WITH_GPERFTOOLS - ProfilerStart("paddle_inference.prof"); + ProfilerStart("paddle_inference.prof"); #endif - for (int i = 0; i < num_times; i++) { - for (size_t j = 0; j < inputs.size(); j++) { - predictor->Run(inputs[j], outputs, batch_size); + if (!FLAGS_zero_copy) { + run_timer.tic(); + for (size_t i = 0; i < inputs.size(); i++) { + for (int j = 0; j < num_times; j++) { + predictor->Run(inputs[i], outputs, batch_size); } } + elapsed_time = run_timer.toc(); + } else { + for (size_t i = 0; i < inputs.size(); i++) { + ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]); + run_timer.tic(); + for (int j = 0; j < num_times; j++) { + predictor->ZeroCopyRun(); + } + elapsed_time += run_timer.toc(); + } + } #ifdef WITH_GPERFTOOLS - ProfilerStop(); + ProfilerStop(); #endif - double latency = run_timer.toc() / (num_times > 1 ? num_times : 1); - PrintTime(batch_size, num_times, 1, 0, latency, inputs.size()); - if (FLAGS_record_benchmark) { - Benchmark benchmark; - benchmark.SetName(FLAGS_model_name); - benchmark.SetBatchSize(batch_size); - benchmark.SetLatency(latency); - benchmark.PersistToFile("benchmark_record.txt"); - } + PrintTime(batch_size, num_times, num_threads, tid, elapsed_time / num_times, + inputs.size()); + if (FLAGS_record_benchmark) { + Benchmark benchmark; + benchmark.SetName(FLAGS_model_name); + benchmark.SetBatchSize(batch_size); + benchmark.SetLatency(elapsed_time / num_times); + benchmark.PersistToFile("benchmark_record.txt"); } } +void TestOneThreadPrediction( + const PaddlePredictor::Config *config, + const std::vector> &inputs, + std::vector *outputs, bool use_analysis = true) { + auto predictor = CreateTestPredictor(config, use_analysis); + PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0); + PredictionRun(predictor.get(), inputs, outputs, 1, 0); +} + void TestMultiThreadPrediction( const PaddlePredictor::Config *config, const std::vector> &inputs, std::vector *outputs, int num_threads, bool use_analysis = true) { - int batch_size = FLAGS_batch_size; - int num_times = FLAGS_repeat; std::vector threads; std::vector> predictors; predictors.emplace_back(CreateTestPredictor(config, use_analysis)); @@ -260,7 +304,6 @@ void TestMultiThreadPrediction( predictors.emplace_back(predictors.front()->Clone()); } - size_t total_time{0}; for (int tid = 0; tid < num_threads; ++tid) { threads.emplace_back([&, tid]() { // Each thread should have local inputs and outputs. @@ -273,34 +316,8 @@ void TestMultiThreadPrediction( ->SetMkldnnThreadID(static_cast(tid) + 1); } #endif - - // warmup run - LOG(INFO) << "Running thread " << tid << ", warm up run..."; - { - Timer warmup_timer; - warmup_timer.tic(); - predictor->Run(inputs[0], outputs, batch_size); - PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1); - if (FLAGS_profile) { - paddle::platform::ResetProfiler(); - } - } - - LOG(INFO) << "Thread " << tid << " run " << num_times << " times..."; - { - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - for (const auto &input : inputs) { - ASSERT_TRUE(predictor->Run(input, &outputs_tid)); - } - } - - auto time = timer.toc(); - total_time += time; - PrintTime(batch_size, num_times, num_threads, tid, time / num_times, - inputs.size()); - } + PredictionWarmUp(predictor.get(), inputs, outputs, num_threads, tid); + PredictionRun(predictor.get(), inputs, outputs, num_threads, tid); }); } for (int i = 0; i < num_threads; ++i) { From d0f8d94ca4c81d765ba0bc47235e25d3456381b1 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 7 Mar 2019 05:09:16 +0000 Subject: [PATCH 026/230] try to fix unittest test=develop --- .../tests/unittests/test_partial_eager_deletion_transformer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py index fc1d762ec9..7607189454 100644 --- a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py @@ -17,8 +17,7 @@ import unittest os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" os.environ['FLAGS_memory_fraction_of_eager_deletion'] = "0.55" -os.environ[ - 'RECORDIO_FILENAME'] = './partial_eager_deletion_transformer.wmt16.recordio' +os.environ['RECORDIO_FILENAME'] = './p_gc_transformer.wmt16.recordio' from test_parallel_executor_transformer import TestTransformer From 802f362ac44e99a06e3c499893f3c8279a367f0d Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 7 Mar 2019 07:21:35 +0000 Subject: [PATCH 027/230] unify the kernelfuncs cache and add unit test test=develop --- paddle/fluid/operators/crf_decoding_op.h | 5 +- .../mkldnn/elementwise_mul_mkldnn_op.cc | 6 ++- .../fused/fused_embedding_seq_pool_op.h | 12 +++-- paddle/fluid/operators/fused/fusion_gru_op.cc | 49 +++++++++-------- .../fluid/operators/fused/fusion_lstm_op.cc | 54 ++++++++++--------- .../fused/fusion_repeated_fc_relu_op.cc | 10 ++-- .../fused/fusion_seqpool_concat_op.cc | 6 +-- .../fused/fusion_squared_mat_sub_op.cc | 32 ++++++----- paddle/fluid/operators/jit/CMakeLists.txt | 2 +- paddle/fluid/operators/jit/benchmark.cc | 2 +- paddle/fluid/operators/jit/helper.h | 34 ++++++++---- paddle/fluid/operators/jit/test.cc | 30 ++++++++--- paddle/fluid/operators/layer_norm_op.h | 6 +-- .../fluid/operators/math/sequence_pooling.cc | 6 +-- paddle/fluid/operators/optimizers/sgd_op.h | 10 ++-- 15 files changed, 158 insertions(+), 106 deletions(-) diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index 72774a878d..3d98790a4d 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -82,8 +82,9 @@ class CRFDecodingOpKernel : public framework::OpKernel { Tensor track; int* track_value = track.mutable_data(emission_dims, platform::CPUPlace()); - auto ker = jit::Get, - platform::CPUPlace>(tag_num); + auto ker = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(tag_num); ker(static_cast(seq_len), x, w, alpha_value, track_value, tag_num); T max_score = -std::numeric_limits::max(); int max_i = 0; diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index 04e8800bbc..e37bbd2837 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -110,8 +110,10 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { constexpr int simd_width = 16; int C = c / simd_width; - auto multiply = jit::Get, - platform::CPUPlace>(0); + auto multiply = + jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(0); #pragma omp parallel for collapse(2) for (int ni = 0; ni < n; ni++) { for (int ci = 0; ci < C; ci++) { diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index f13c020386..fe43545e60 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -52,8 +52,10 @@ struct EmbeddingVSumFunctor { out_width, jit::SeqPoolType::kSum); for (size_t i = 0; i != ids_lod.size() - 1; ++i) { attr.index_height = ids_lod[i + 1] - ids_lod[i]; - auto emb_seqpool = jit::Get, - platform::CPUPlace>(attr); + auto emb_seqpool = + jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr); emb_seqpool(table, ids + ids_lod[i] * idx_width, output + i * out_width, &attr); } @@ -135,8 +137,10 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { T *d_table_data = d_table_value->mutable_data(context.GetPlace()); const T *d_output_data = d_output->data(); - auto vbroadcast = jit::Get, - platform::CPUPlace>(out_width); + auto vbroadcast = + jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(out_width); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { int64_t h = static_cast(lod[i + 1] - lod[i]); const T *src = d_output_data + i * out_width; diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 66acba49e5..cd8a6a55d4 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -182,29 +182,32 @@ class FusionGRUKernel : public framework::OpKernel { const int total_T = x_dims[0]; \ const int D3 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - auto* h0 = ctx.Input("H0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* bias = ctx.Input("Bias"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const jit::gru_attr_t attr( \ - D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ - jit::to_kerneltype(ctx.Attr("activation"))); \ - jit::gru_t one_step; \ - auto ComputeH1 = \ - jit::Get, platform::CPUPlace>(attr); \ - auto ComputeHtPart1 = \ - jit::Get, platform::CPUPlace>(attr); \ - auto ComputeHtPart2 = \ - jit::Get, platform::CPUPlace>(attr); \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - auto place = ctx.GetPlace(); \ +#define INIT_OTHER_DEFINES \ + auto* h0 = ctx.Input("H0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* bias = ctx.Input("Bias"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const jit::gru_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("activation"))); \ + jit::gru_t one_step; \ + auto ComputeH1 = jit::KernelFuncs, \ + platform::CPUPlace>::Cache() \ + .At(attr); \ + auto ComputeHtPart1 = jit::KernelFuncs, \ + platform::CPUPlace>::Cache() \ + .At(attr); \ + auto ComputeHtPart2 = jit::KernelFuncs, \ + platform::CPUPlace>::Cache() \ + .At(attr); \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + auto place = ctx.GetPlace(); \ T* xx_data = xx->mutable_data(place) void SeqCompute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index b11b7c11bf..d7d12df4bf 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -235,32 +235,34 @@ class FuisonLSTMKernel : public framework::OpKernel { const int D = wh_dims[0]; \ const int D4 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wp_data = bias->data() + D4; \ - /* for peephole only*/ \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - auto* checked_cell = ctx.Output("CheckedCell"); \ - checked_cell_data = checked_cell->mutable_data(place); \ - } \ - const jit::lstm_attr_t attr( \ - D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ - jit::to_kerneltype(ctx.Attr("candidate_activation")), \ - jit::to_kerneltype(ctx.Attr("cell_activation")), \ - use_peepholes); \ - jit::lstm_t one_step; \ - one_step.wp = wp_data; \ - one_step.checked = checked_cell_data; \ - auto ComputeC1H1 = \ - jit::Get, platform::CPUPlace>(attr); \ - auto ComputeCtHt = \ - jit::Get, platform::CPUPlace>(attr) +#define INIT_OTHER_DEFINES \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wp_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ + } \ + const jit::lstm_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("candidate_activation")), \ + jit::to_kerneltype(ctx.Attr("cell_activation")), \ + use_peepholes); \ + jit::lstm_t one_step; \ + one_step.wp = wp_data; \ + one_step.checked = checked_cell_data; \ + auto ComputeC1H1 = jit::KernelFuncs, \ + platform::CPUPlace>::Cache() \ + .At(attr); \ + auto ComputeCtHt = jit::KernelFuncs, \ + platform::CPUPlace>::Cache() \ + .At(attr) // Wh GEMM #define GEMM_WH_ADDON(bs, prev, out) \ diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc index 8ecdf2ed9d..e057724b5a 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc @@ -81,10 +81,12 @@ void FusionRepeatedFCReluOpMaker::Make() { template static void fc_relu(const T* x, const T* w, const T* b, T* y, const jit::matmul_attr_t& attr) { - auto matmul = - jit::Get, platform::CPUPlace>(attr); - auto addbias_relu = - jit::Get, platform::CPUPlace>(attr.n); + auto matmul = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr); + auto addbias_relu = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr.n); matmul(x, w, y, &attr); T* dst = y; for (int i = 0; i < attr.m; ++i) { diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc index d48bdafe0a..7aeeabc512 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc @@ -97,9 +97,9 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel { } else if (pooltype == "SQRT") { attr.type = jit::SeqPoolType::kSqrt; } - auto seqpool = - jit::Get, platform::CPUPlace>( - attr); + auto seqpool = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr); size_t n = ins.size(); size_t dst_step_size = n * w; for (size_t i = 0; i < n; ++i) { diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc index 8493f4468f..9382bf0ebb 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc @@ -93,20 +93,24 @@ class FusionSquaredMatSubKernel : public framework::OpKernel { attr.n = y_dims[1]; int o_numel = attr.m * attr.n; - auto vsquare_x = - jit::Get, platform::CPUPlace>(attr.m * - attr.k); - auto vsquare_y = - jit::Get, platform::CPUPlace>(attr.k * - attr.n); - auto vsquare_xy = - jit::Get, platform::CPUPlace>(o_numel); - auto vsub = - jit::Get, platform::CPUPlace>(o_numel); - auto vscal = - jit::Get, platform::CPUPlace>(o_numel); - auto matmul = - jit::Get, platform::CPUPlace>(attr); + auto vsquare_x = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr.m * attr.k); + auto vsquare_y = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr.k * attr.n); + auto vsquare_xy = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(o_numel); + auto vsub = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(o_numel); + auto vscal = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(o_numel); + auto matmul = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr); const T* x_data = x->data(); const T* y_data = y->data(); diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt index 35775d7ec9..47d6c83f2a 100644 --- a/paddle/fluid/operators/jit/CMakeLists.txt +++ b/paddle/fluid/operators/jit/CMakeLists.txt @@ -5,7 +5,7 @@ file(APPEND ${jit_file} "\#pragma once\n") file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n") file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n") -set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place) +set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place xxhash) file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 3088280bb9..deb96ee6cd 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -142,7 +142,7 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { } } // Test result from Get function - auto tgt = jit::Get(attr); + auto tgt = jit::KernelFuncs::Cache().At(attr); if (!tgt) { LOG(FATAL) << "Target can not be empty!"; } diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index d85c719c1c..1af1add3ee 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -14,6 +14,9 @@ #pragma once +extern "C" { +#include +} #include #include #include @@ -127,23 +130,36 @@ class KernelFuncs { return g_func_cache; } - bool Has(int key) const { return funcs_.find(key) != funcs_.end(); } - - void Insert(int key, typename KernelTuples::func_type func) { - funcs_.emplace(key, func); - } - - typename KernelTuples::func_type At(int key) { + // the exposed interface to use + typename KernelTuples::func_type At( + const typename KernelTuples::attr_type& attr) { + // XXH64: 13.8 GB/s + int64_t key = XXH64(&attr, sizeof(typename KernelTuples::attr_type), 0); if (Has(key)) { return funcs_.at(key); } - auto func = Get(key); + // If do not have this attr in cache, + // then could run some runtime benchmark of this attr and save the best one. + // Here just get the offline benchmarked best one. + auto func = Get(attr); Insert(key, func); return func; } + typename KernelTuples::func_type operator[]( + const typename KernelTuples::attr_type& attr) { + return At(attr); + } + + protected: + bool Has(int64_t key) const { return funcs_.find(key) != funcs_.end(); } + + void Insert(int64_t key, typename KernelTuples::func_type func) { + funcs_.emplace(key, func); + } + private: - std::unordered_map funcs_; + std::unordered_map funcs_; DISABLE_COPY_AND_ASSIGN(KernelFuncs); }; diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index cdec14dc43..18f8c09f14 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -462,7 +462,7 @@ void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { } // test result from Get function // VLOG(10) << "Test Get function "; - auto tgt = jit::Get(attr); + auto tgt = jit::KernelFuncs::Cache().At(attr); test(tgt, args...); } @@ -845,7 +845,9 @@ void TestKernelNCHW16CMulNCTuples() { T* zjit_data = zjit.data(); constexpr int simd_width = ZMM_FLOAT_BLOCK; int C = c / simd_width; - auto tgt = jit::Get, PlaceType>(0); + auto tgt = + jit::KernelFuncs, PlaceType>::Cache().At( + 0); auto jitcode = jit::GetJitCode, PlaceType>(0); EXPECT_TRUE(tgt != nullptr); @@ -967,10 +969,10 @@ void TestKernelVBroadcastTuples() { } } -#define TEST_CPU_KERNEL(test_tuple, kernel_type) \ - TEST(JITKernel, kernel_type) { \ - TestKernel##test_tuple(); \ - TestKernel##test_tuple(); \ +#define TEST_CPU_KERNEL(test_tuple, kernel_type) \ + TEST(JITKernel, kernel_type) { \ + TestKernel##test_tuple(); \ + TestKernel##test_tuple(); \ } TEST_CPU_KERNEL(XYZNTuples, kVMul); @@ -1041,4 +1043,18 @@ TEST(JITKernel_key, gru) { EXPECT_TRUE(key2 == key3); EXPECT_TRUE(key3 != key4); } -// TODO(TJ): add more test about key and pool + +TEST(JITKernel, kernel_func) { + auto f1 = + jit::KernelFuncs, CPUPlace>::Cache() + .At(3); + auto f2 = jit::KernelFuncs, + CPUPlace>::Cache()[3]; + EXPECT_TRUE(f1 == f2); + + f1 = jit::KernelFuncs, CPUPlace>::Cache() + .At(3); + f2 = jit::KernelFuncs, CPUPlace>::Cache() + .At(4); + EXPECT_TRUE(f1 != f2); +} diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h index f564a10396..f0c3064d41 100644 --- a/paddle/fluid/operators/layer_norm_op.h +++ b/paddle/fluid/operators/layer_norm_op.h @@ -229,9 +229,9 @@ class LayerNormKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(scale->numel(), right); PADDLE_ENFORCE_EQ(bias->numel(), right); - auto ker = - jit::Get, platform::CPUPlace>( - right); + auto ker = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(right); ker(x.data(), out.data(), mean->data(), var->data(), scale->data(), bias->data(), static_cast(left), static_cast(epsilon), right); diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 2a47502614..db103e5fab 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -255,9 +255,9 @@ class SequencePoolFunctor { jit::seq_pool_attr_t attr( static_cast(input.numel() / input.dims()[0]), jit::SeqPoolType::kSum); - auto seqpool = - jit::Get, platform::CPUPlace>( - attr); + auto seqpool = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { attr.h = static_cast(lod[i + 1] - lod[i]); seqpool(src, dst, &attr); diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index c9c9f530fe..0425a3d194 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -47,8 +47,9 @@ class SGDOpKernel : public framework::OpKernel { int64_t rows_idx = 0; T *out_data = param_out->mutable_data(ctx.GetPlace()); - auto sgd = - jit::Get, platform::CPUPlace>(attr); + auto sgd = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr); sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr); } else if (grad_var->IsType()) { // TODO(qijun): In Sparse SGD operator, in-place update is enforced. @@ -81,8 +82,9 @@ class SGDOpKernel : public framework::OpKernel { attr.selected_rows_size = grad_rows.size(); PADDLE_ENFORCE_EQ(attr.grad_width, attr.param_width); - auto sgd = - jit::Get, platform::CPUPlace>(attr); + auto sgd = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(attr); sgd(lr, param_data, grad_data, rows_data, out_data, &attr); } else { PADDLE_THROW("Unsupported Variable Type of Grad"); From 14a764c930c4ff895168d482db51c21b6338f283 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 8 Mar 2019 08:04:04 +0000 Subject: [PATCH 028/230] simplify the jitkernel templates and tests test=develop --- paddle/fluid/operators/crf_decoding_op.h | 6 +- .../mkldnn/elementwise_mul_mkldnn_op.cc | 7 +- .../fused/fused_embedding_seq_pool_op.h | 6 +- paddle/fluid/operators/fused/fusion_gru_op.cc | 52 +- .../fluid/operators/fused/fusion_lstm_op.cc | 56 +- .../fused/fusion_repeated_fc_relu_op.cc | 12 +- .../fused/fusion_seqpool_concat_op.cc | 6 +- .../fused/fusion_squared_mat_sub_op.cc | 36 +- paddle/fluid/operators/jit/benchmark.cc | 269 ++-- paddle/fluid/operators/jit/helper.h | 71 +- paddle/fluid/operators/jit/kernel_base.h | 93 +- .../jit/more/intrinsic/crf_decoding.h | 5 +- .../operators/jit/more/intrinsic/layer_norm.h | 4 +- paddle/fluid/operators/jit/more/mix/mix.cc | 68 +- paddle/fluid/operators/jit/more/mix/mix.h | 28 +- paddle/fluid/operators/jit/more/mkl/mkl.cc | 32 +- paddle/fluid/operators/jit/more/mkl/mkl.h | 49 +- paddle/fluid/operators/jit/refer/refer.cc | 80 +- paddle/fluid/operators/jit/refer/refer.h | 80 +- paddle/fluid/operators/jit/test.cc | 1282 ++++++++--------- paddle/fluid/operators/layer_norm_op.h | 6 +- paddle/fluid/operators/math/fc_compute.h | 11 +- .../fluid/operators/math/sequence_pooling.cc | 6 +- paddle/fluid/operators/math/softmax_impl.h | 3 +- paddle/fluid/operators/optimizers/sgd_op.h | 12 +- 25 files changed, 1135 insertions(+), 1145 deletions(-) diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index 3d98790a4d..d6b54038ec 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -82,9 +82,9 @@ class CRFDecodingOpKernel : public framework::OpKernel { Tensor track; int* track_value = track.mutable_data(emission_dims, platform::CPUPlace()); - auto ker = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(tag_num); + auto ker = + jit::KernelFuncs, platform::CPUPlace>::Cache() + .At(tag_num); ker(static_cast(seq_len), x, w, alpha_value, track_value, tag_num); T max_score = -std::numeric_limits::max(); int max_i = 0; diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index e37bbd2837..f2f4d3fee0 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -110,10 +110,9 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { constexpr int simd_width = 16; int C = c / simd_width; - auto multiply = - jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(0); + auto multiply = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(0); #pragma omp parallel for collapse(2) for (int ni = 0; ni < n; ni++) { for (int ci = 0; ci < C; ci++) { diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index fe43545e60..5e2e336e71 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -53,8 +53,7 @@ struct EmbeddingVSumFunctor { for (size_t i = 0; i != ids_lod.size() - 1; ++i) { attr.index_height = ids_lod[i + 1] - ids_lod[i]; auto emb_seqpool = - jit::KernelFuncs, - platform::CPUPlace>::Cache() + jit::KernelFuncs, platform::CPUPlace>::Cache() .At(attr); emb_seqpool(table, ids + ids_lod[i] * idx_width, output + i * out_width, &attr); @@ -138,8 +137,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { const T *d_output_data = d_output->data(); auto vbroadcast = - jit::KernelFuncs, - platform::CPUPlace>::Cache() + jit::KernelFuncs, platform::CPUPlace>::Cache() .At(out_width); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { int64_t h = static_cast(lod[i + 1] - lod[i]); diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index cd8a6a55d4..ba5f0747c4 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -182,32 +182,32 @@ class FusionGRUKernel : public framework::OpKernel { const int total_T = x_dims[0]; \ const int D3 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - auto* h0 = ctx.Input("H0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* bias = ctx.Input("Bias"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const jit::gru_attr_t attr( \ - D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ - jit::to_kerneltype(ctx.Attr("activation"))); \ - jit::gru_t one_step; \ - auto ComputeH1 = jit::KernelFuncs, \ - platform::CPUPlace>::Cache() \ - .At(attr); \ - auto ComputeHtPart1 = jit::KernelFuncs, \ - platform::CPUPlace>::Cache() \ - .At(attr); \ - auto ComputeHtPart2 = jit::KernelFuncs, \ - platform::CPUPlace>::Cache() \ - .At(attr); \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - auto place = ctx.GetPlace(); \ +#define INIT_OTHER_DEFINES \ + auto* h0 = ctx.Input("H0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* bias = ctx.Input("Bias"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const jit::gru_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("activation"))); \ + jit::gru_t one_step; \ + auto ComputeH1 = \ + jit::KernelFuncs, platform::CPUPlace>::Cache().At( \ + attr); \ + auto ComputeHtPart1 = \ + jit::KernelFuncs, platform::CPUPlace>::Cache() \ + .At(attr); \ + auto ComputeHtPart2 = \ + jit::KernelFuncs, platform::CPUPlace>::Cache() \ + .At(attr); \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + auto place = ctx.GetPlace(); \ T* xx_data = xx->mutable_data(place) void SeqCompute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index d7d12df4bf..c8c07bd126 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -235,34 +235,34 @@ class FuisonLSTMKernel : public framework::OpKernel { const int D = wh_dims[0]; \ const int D4 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wp_data = bias->data() + D4; \ - /* for peephole only*/ \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - auto* checked_cell = ctx.Output("CheckedCell"); \ - checked_cell_data = checked_cell->mutable_data(place); \ - } \ - const jit::lstm_attr_t attr( \ - D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ - jit::to_kerneltype(ctx.Attr("candidate_activation")), \ - jit::to_kerneltype(ctx.Attr("cell_activation")), \ - use_peepholes); \ - jit::lstm_t one_step; \ - one_step.wp = wp_data; \ - one_step.checked = checked_cell_data; \ - auto ComputeC1H1 = jit::KernelFuncs, \ - platform::CPUPlace>::Cache() \ - .At(attr); \ - auto ComputeCtHt = jit::KernelFuncs, \ - platform::CPUPlace>::Cache() \ - .At(attr) +#define INIT_OTHER_DEFINES \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wp_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ + } \ + const jit::lstm_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("candidate_activation")), \ + jit::to_kerneltype(ctx.Attr("cell_activation")), \ + use_peepholes); \ + jit::lstm_t one_step; \ + one_step.wp = wp_data; \ + one_step.checked = checked_cell_data; \ + auto ComputeC1H1 = \ + jit::KernelFuncs, platform::CPUPlace>::Cache().At( \ + attr); \ + auto ComputeCtHt = \ + jit::KernelFuncs, platform::CPUPlace>::Cache().At( \ + attr) // Wh GEMM #define GEMM_WH_ADDON(bs, prev, out) \ diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc index e057724b5a..6be35de65f 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc @@ -81,12 +81,12 @@ void FusionRepeatedFCReluOpMaker::Make() { template static void fc_relu(const T* x, const T* w, const T* b, T* y, const jit::matmul_attr_t& attr) { - auto matmul = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(attr); - auto addbias_relu = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(attr.n); + auto matmul = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); + auto addbias_relu = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr.n); matmul(x, w, y, &attr); T* dst = y; for (int i = 0; i < attr.m; ++i) { diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc index 7aeeabc512..25916768c0 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc @@ -97,9 +97,9 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel { } else if (pooltype == "SQRT") { attr.type = jit::SeqPoolType::kSqrt; } - auto seqpool = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(attr); + auto seqpool = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); size_t n = ins.size(); size_t dst_step_size = n * w; for (size_t i = 0; i < n; ++i) { diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc index 9382bf0ebb..53679ebdde 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc @@ -93,24 +93,24 @@ class FusionSquaredMatSubKernel : public framework::OpKernel { attr.n = y_dims[1]; int o_numel = attr.m * attr.n; - auto vsquare_x = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(attr.m * attr.k); - auto vsquare_y = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(attr.k * attr.n); - auto vsquare_xy = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(o_numel); - auto vsub = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(o_numel); - auto vscal = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(o_numel); - auto matmul = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(attr); + auto vsquare_x = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr.m * attr.k); + auto vsquare_y = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr.k * attr.n); + auto vsquare_xy = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + o_numel); + auto vsub = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + o_numel); + auto vscal = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + o_numel); + auto matmul = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); const T* x_data = x->data(); const T* y_data = y->data(); diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index deb96ee6cd..773cf38eb9 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -59,8 +59,6 @@ BenchJITKernel* InsertBenchmark(BenchJITKernel* b) { InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_()); \ void BenchJITKernel_##name##_##dtype##_##place##_::Run() -#define BENCH_FP32_CPU(name) BENCH_JITKERNEL(name, FP32, CPU) - void RUN_ALL_BENCHMARK() { for (auto p : g_all_benchmarks) { if (!FLAGS_filter.empty() && FLAGS_filter != p->Name()) { @@ -90,11 +88,11 @@ std::vector TestSizes() { return s; } -template +template struct BenchFunc { // return this function avg time // TODO(TJ): clear cache every time - double operator()(const typename KernelTuples::func_type tgt, Args... args) { + double operator()(const typename KernelTuple::func_type tgt, Args... args) { for (int i = 0; i < FLAGS_burning; ++i) { tgt(args...); } @@ -109,31 +107,30 @@ struct BenchFunc { namespace jit = paddle::operators::jit; -template -void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { - BenchFunc benchmark; +template +void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) { + BenchFunc benchmark; std::vector> infos; // test refer - auto refer = jit::GetRefer(); + auto refer = jit::GetRefer(); if (!refer) { LOG(FATAL) << "Refer can not be empty!"; } infos.push_back(std::make_pair("Refer", benchmark(refer, args...))); // test jitcode - auto jitcode = jit::GetJitCode(attr); + auto jitcode = jit::GetJitCode(attr); if (jitcode) { infos.push_back(std::make_pair("JitCode", benchmark(jitcode, args...))); } // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); + jit::KernelKey kkey(KernelTuple::kernel_type, PlaceType()); auto& pool = jit::KernelPool().Instance().AllKernels(); auto iter = pool.find(kkey); if (iter != pool.end()) { auto& impls = iter->second; for (auto& impl : impls) { - auto i = dynamic_cast*>(impl.get()); + auto i = dynamic_cast*>(impl.get()); if (i && i->UseMe(attr)) { auto more = i->GetFunc(); infos.push_back( @@ -142,7 +139,7 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { } } // Test result from Get function - auto tgt = jit::KernelFuncs::Cache().At(attr); + auto tgt = jit::KernelFuncs::Cache().At(attr); if (!tgt) { LOG(FATAL) << "Target can not be empty!"; } @@ -150,7 +147,8 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { // print std::ostringstream loginfos; - loginfos << "Kernel Type " << jit::to_string(KT) << ": " << attr << ": "; + loginfos << "Kernel Type " << jit::to_string(KernelTuple::kernel_type) << ": " + << attr << ": "; for (auto pair : infos) { loginfos << pair.first << " takes " << pair.second << " us; "; } @@ -159,8 +157,9 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { using Tensor = paddle::framework::Tensor; -template -void BenchXYZNKernel() { +template +void BenchKernelXYZN() { + using T = typename KernelTuple::data_type; for (int d : TestSizes()) { Tensor x, y, z; x.Resize({d}); @@ -171,16 +170,16 @@ void BenchXYZNKernel() { T* z_data = z.mutable_data(PlaceType()); RandomVec(d, x_data); RandomVec(d, y_data); - BenchAllImpls, PlaceType>(d, x.data(), - y.data(), z_data, d); + BenchAllImpls(d, x.data(), y.data(), z_data, + d); // test inplace - BenchAllImpls, PlaceType>(d, x.data(), z_data, - z_data, d); + BenchAllImpls(d, x.data(), z_data, z_data, d); } } -template -void BenchAXYNKernel() { +template +void BenchKernelAXYN() { + using T = typename KernelTuple::data_type; for (int d : TestSizes()) { const T a = static_cast(3); Tensor x, y; @@ -189,26 +188,26 @@ void BenchAXYNKernel() { T* x_data = x.mutable_data(PlaceType()); T* y_data = y.mutable_data(PlaceType()); RandomVec(d, x_data); - BenchAllImpls, PlaceType>(d, &a, x.data(), y_data, - d); + BenchAllImpls(d, &a, x.data(), y_data, d); // test inplace - BenchAllImpls, PlaceType>(d, &a, x.data(), x_data, - d); + BenchAllImpls(d, &a, x.data(), x_data, d); } } -template -void BenchXRNKernel() { +template +void BenchKernelXRN() { + using T = typename KernelTuple::data_type; for (int d : TestSizes()) { Tensor x; RandomVec(d, x.mutable_data({d}, PlaceType())); T res; - BenchAllImpls, PlaceType>(d, x.data(), &res, d); + BenchAllImpls(d, x.data(), &res, d); } } -template -void BenchXYNKernel() { +template +void BenchKernelXYN() { + using T = typename KernelTuple::data_type; for (int d : TestSizes()) { Tensor x, y; x.Resize({d}); @@ -216,12 +215,13 @@ void BenchXYNKernel() { T* x_data = x.mutable_data(PlaceType()); T* y_data = y.mutable_data(PlaceType()); RandomVec(d, x_data); - BenchAllImpls, PlaceType>(d, x.data(), y_data, d); + BenchAllImpls(d, x.data(), y_data, d); } } -template -void BenchLSTMKernel() { +template +void BenchKernelLSTM() { + using T = typename KernelTuple::data_type; for (bool use_peephole : {true, false}) { for (int d : TestSizes()) { const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh, @@ -252,13 +252,14 @@ void BenchLSTMKernel() { step.wp = wp_data; step.checked = checked_data; } - BenchAllImpls, PlaceType>(attr, &step, &attr); + BenchAllImpls(attr, &step, &attr); } } } -template -void BenchGRUKernel() { +template +void BenchKernelGRU() { + using T = typename KernelTuple::data_type; for (int d : TestSizes()) { const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh); auto place = PlaceType(); @@ -275,12 +276,13 @@ void BenchGRUKernel() { step.gates = x_data; step.ht_1 = ht_1_data; step.ht = ht_data; - BenchAllImpls, PlaceType>(attr, &step, &attr); + BenchAllImpls(attr, &step, &attr); } } -template -void BenchSeqPoolKernel() { +template +void BenchKernelSeqPool() { + using T = typename KernelTuple::data_type; std::vector pool_types = { jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; for (auto type : pool_types) { @@ -294,15 +296,15 @@ void BenchSeqPoolKernel() { RandomVec(h * w, x.mutable_data(PlaceType()), -2.f, 2.f); const T* x_data = x.data(); T* y_data = y.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>(attr, x_data, - y_data, &attr); + BenchAllImpls(attr, x_data, y_data, &attr); } } } } -template -void BenchEmbSeqPoolKernel() { +template +void BenchKernelEmbSeqPool() { + using T = typename KernelTuple::data_type; std::vector pool_types = {jit::SeqPoolType::kSum}; int64_t tbl_h = 1e4; for (int tbl_w : {10, 16, 256}) { @@ -324,16 +326,17 @@ void BenchEmbSeqPoolKernel() { tbl_h - 1); const int64_t* idx_data = idx.data(); T* o_data = out.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>( - attr, table_data, idx_data, o_data, &attr); + BenchAllImpls(attr, table_data, idx_data, + o_data, &attr); } } } } } -template -void BenchSgdKernel() { +template +void BenchKernelSgd() { + using T = typename KernelTuple::data_type; const T lr = 0.1; auto UnDuplicatedRandomVec = [](int n, const int64_t lower, const int64_t upper) -> std::vector { @@ -364,15 +367,16 @@ void BenchSgdKernel() { const T* grad_data = grad.data(); const int64_t* rows_data = rows.data(); jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size); - BenchAllImpls, PlaceType>( - attr, &lr, param_data, grad_data, rows_data, param_data, &attr); + BenchAllImpls(attr, &lr, param_data, grad_data, + rows_data, param_data, &attr); } } } } -template -void BenchMatMulKernel() { +template +void BenchKernelMatMul() { + using T = typename KernelTuple::data_type; for (int m : {1, 2, 3, 4}) { for (int n : TestSizes()) { for (int k : TestSizes()) { @@ -386,15 +390,16 @@ void BenchMatMulKernel() { const T* b_data = b.data(); T* c_data = c.mutable_data(PlaceType()); const jit::matmul_attr_t attr{m, n, k}; - BenchAllImpls, PlaceType>(attr, a_data, b_data, - c_data, &attr); + BenchAllImpls(attr, a_data, b_data, c_data, + &attr); } } } } -template -void BenchSoftmaxKernel() { +template +void BenchKernelSoftmax() { + using T = typename KernelTuple::data_type; for (int bs : {1, 2, 10}) { for (int n : TestSizes()) { Tensor x, y; @@ -403,14 +408,14 @@ void BenchSoftmaxKernel() { RandomVec(bs * n, x.mutable_data(PlaceType()), -2.f, 2.f); const T* x_data = x.data(); T* y_data = y.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>(n, x_data, y_data, n, - bs); + BenchAllImpls(n, x_data, y_data, n, bs); } } } -template -void BenchLayerNormKernel() { +template +void BenchKernelLayerNorm() { + using T = typename KernelTuple::data_type; const T epsilon = 9.99999975e-06; for (int n : {1, 2, 10}) { for (int x_dim_0 : {1, 9, 17, 50}) { @@ -439,16 +444,17 @@ void BenchLayerNormKernel() { T* var_data = var.data(); T* out_data = out.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>( - right, x_data, out_data, mean_data, var_data, scale_data, bias_data, - left, epsilon, right); + BenchAllImpls(right, x_data, out_data, + mean_data, var_data, scale_data, + bias_data, left, epsilon, right); } } } } -template -void BenchCRFDecodingKernel() { +template +void BenchKernelCRFDecoding() { + using T = typename KernelTuple::data_type; constexpr int state_trans_base_idx = 2; for (int seq_len : {1, 11, 17, 50}) { for (int tag_num : TestSizes()) { @@ -468,14 +474,15 @@ void BenchCRFDecodingKernel() { T* alpha_data = alpha.mutable_data(PlaceType()); int* track_data = track.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>( - tag_num, seq_len, x_data, w_data, alpha_data, track_data, tag_num); + BenchAllImpls(tag_num, seq_len, x_data, w_data, + alpha_data, track_data, tag_num); } } } -template -void BenchVBroadcastKernel() { +template +void BenchKernelVBroadcast() { + using T = typename KernelTuple::data_type; for (int64_t w : {1, 16, 64, 100, 256}) { Tensor x; x.Resize({w}); @@ -485,78 +492,86 @@ void BenchVBroadcastKernel() { Tensor y; y.Resize({h * w}); T* y_data = y.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>( - w, x_data, y_data, static_cast(h), w); + BenchAllImpls(w, x_data, y_data, + static_cast(h), w); } } } -using T = float; -using CPUPlace = paddle::platform::CPUPlace; +#define BenchKernelVMul BenchKernelXYZN +#define BenchKernelVAdd BenchKernelXYZN +#define BenchKernelVAddRelu BenchKernelXYZN +#define BenchKernelVSub BenchKernelXYZN -// xyzn -BENCH_FP32_CPU(kVMul) { BenchXYZNKernel(); } -BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel(); } -BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel(); } -BENCH_FP32_CPU(kVSub) { BenchXYZNKernel(); } +#define BenchKernelVScal BenchKernelAXYN +#define BenchKernelVAddBias BenchKernelAXYN -// axyn -BENCH_FP32_CPU(kVScal) { BenchAXYNKernel(); } -BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel(); } +#define BenchKernelVRelu BenchKernelXYN +#define BenchKernelVIdentity BenchKernelXYN +#define BenchKernelVSquare BenchKernelXYN +#define BenchKernelVExp BenchKernelXYN +#define BenchKernelVSigmoid BenchKernelXYN +#define BenchKernelVTanh BenchKernelXYN +#define BenchKernelVCopy BenchKernelXYN -// xrn -BENCH_FP32_CPU(kHSum) { BenchXRNKernel(); } -BENCH_FP32_CPU(kHMax) { BenchXRNKernel(); } +#define BenchKernelHMax BenchKernelXRN +#define BenchKernelHSum BenchKernelXRN -// xyn -BENCH_FP32_CPU(kVRelu) { BenchXYNKernel(); } -BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel(); } -BENCH_FP32_CPU(kVSquare) { BenchXYNKernel(); } -BENCH_FP32_CPU(kVExp) { BenchXYNKernel(); } -BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel(); } -BENCH_FP32_CPU(kVTanh) { BenchXYNKernel(); } -BENCH_FP32_CPU(kVCopy) { BenchXYNKernel(); } - -// lstm and peephole -BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel(); } -BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel(); } - -// gru functions -BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel(); } -BENCH_FP32_CPU(kGRUHtPart1) { BenchGRUKernel(); } -BENCH_FP32_CPU(kGRUHtPart2) { BenchGRUKernel(); } - -// seq pool function -BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel(); } - -// embedding seq pool function -BENCH_FP32_CPU(kEmbSeqPool) { - BenchEmbSeqPoolKernel(); -} +#define BenchKernelLSTMCtHt BenchKernelLSTM +#define BenchKernelLSTMC1H1 BenchKernelLSTM -// sgd function -BENCH_FP32_CPU(kSgd) { BenchSgdKernel(); } +#define BenchKernelGRUH1 BenchKernelGRU +#define BenchKernelGRUHtPart1 BenchKernelGRU +#define BenchKernelGRUHtPart2 BenchKernelGRU -// matmul -BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel(); } +using CPUPlace = paddle::platform::CPUPlace; -// softmax -BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel(); } +#define BENCH_FP32_CPU(name) \ + BENCH_JITKERNEL(name, FP32, CPU) { \ + BenchKernel##name, CPUPlace>(); \ + } -// layernorm -BENCH_FP32_CPU(kLayerNorm) { - BenchLayerNormKernel(); -} +// xyzn +BENCH_FP32_CPU(VMul); +BENCH_FP32_CPU(VAdd); +BENCH_FP32_CPU(VAddRelu); +BENCH_FP32_CPU(VSub); -// crfdecoding -BENCH_FP32_CPU(kCRFDecoding) { - BenchCRFDecodingKernel(); -} +// axyn +BENCH_FP32_CPU(VScal); +BENCH_FP32_CPU(VAddBias); -// vbroadcast function -BENCH_FP32_CPU(kVBroadcast) { - BenchVBroadcastKernel(); -} +// xyn +BENCH_FP32_CPU(VRelu); +BENCH_FP32_CPU(VIdentity); +BENCH_FP32_CPU(VSquare); +BENCH_FP32_CPU(VExp); +BENCH_FP32_CPU(VSigmoid); +BENCH_FP32_CPU(VTanh); +BENCH_FP32_CPU(VCopy); + +// xrn +BENCH_FP32_CPU(HMax); +BENCH_FP32_CPU(HSum); + +// LSTM +BENCH_FP32_CPU(LSTMCtHt); +BENCH_FP32_CPU(LSTMC1H1); + +// GRU +BENCH_FP32_CPU(GRUH1); +BENCH_FP32_CPU(GRUHtPart1); +BENCH_FP32_CPU(GRUHtPart2); + +BENCH_FP32_CPU(LayerNorm); +BENCH_FP32_CPU(CRFDecoding); + +BENCH_FP32_CPU(SeqPool); +BENCH_FP32_CPU(EmbSeqPool); +BENCH_FP32_CPU(MatMul); +BENCH_FP32_CPU(Softmax); +BENCH_FP32_CPU(Sgd); +BENCH_FP32_CPU(VBroadcast); // Benchmark all jit kernels including jitcode, mkl and refer. // To use this tool, run command: ./benchmark [options...] diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 1af1add3ee..85f4072dd3 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -19,6 +19,8 @@ extern "C" { } #include #include +#include +#include // for std::move #include #include "paddle/fluid/operators/jit/gen_base.h" #include "paddle/fluid/operators/jit/kernel_base.h" @@ -30,22 +32,22 @@ namespace paddle { namespace operators { namespace jit { -template +template inline typename std::enable_if< - std::is_same::value && + std::is_same::value && std::is_same::value, - typename KernelTuples::func_type>::type -GetJitCode(const typename KernelTuples::attr_type& attr) { - using Func = typename KernelTuples::func_type; - using Attr = typename KernelTuples::attr_type; + typename KernelTuple::func_type>::type +GetJitCode(const typename KernelTuple::attr_type& attr) { + using Func = typename KernelTuple::func_type; + using Attr = typename KernelTuple::attr_type; size_t key = JitCodeKey(attr); - auto& codes = JitCodePool().Instance(); + auto& codes = JitCodePool().Instance(); if (codes.Has(key)) { return codes.AllKernels().at(key)->template getCode(); } // creator is not related with attr, so can use KernelKey as key - KernelKey kkey(KT, PlaceType()); + KernelKey kkey(KernelTuple::kernel_type, PlaceType()); // pool: (KernelKey(type, place), vector) auto& creator_map = JitCodeCreatorPool().Instance().AllCreators(); auto iter = creator_map.find(kkey); @@ -66,27 +68,27 @@ GetJitCode(const typename KernelTuples::attr_type& attr) { return nullptr; } -template +template inline typename std::enable_if< - !std::is_same::value || + !std::is_same::value || !std::is_same::value, - typename KernelTuples::func_type>::type -GetJitCode(const typename KernelTuples::attr_type& attr) { + typename KernelTuple::func_type>::type +GetJitCode(const typename KernelTuple::attr_type& attr) { return nullptr; } // Refer code do not related with attr, which is just for cast // Refer is always on CPUPlace -template -inline typename KernelTuples::func_type GetRefer() { +template +inline typename KernelTuple::func_type GetRefer() { auto& ref_pool = ReferKernelPool().Instance().AllKernels(); - KernelKey kkey(KT, platform::CPUPlace()); + KernelKey kkey(KernelTuple::kernel_type, platform::CPUPlace()); auto ref_iter = ref_pool.find(kkey); PADDLE_ENFORCE(ref_iter != ref_pool.end(), "Every Kernel should have reference function."); auto& ref_impls = ref_iter->second; for (auto& impl : ref_impls) { - auto i = dynamic_cast*>(impl.get()); + auto i = dynamic_cast*>(impl.get()); if (i) { return i->GetFunc(); } @@ -94,23 +96,22 @@ inline typename KernelTuples::func_type GetRefer() { return nullptr; } -template -typename KernelTuples::func_type Get( - const typename KernelTuples::attr_type& attr) { - auto jitfunc = GetJitCode(attr); +template +typename KernelTuple::func_type Get( + const typename KernelTuple::attr_type& attr) { + auto jitfunc = GetJitCode(attr); if (jitfunc) { return jitfunc; } // pool: (KernelKey(type, place), vector) - KernelKey kkey(KT, PlaceType()); + KernelKey kkey(KernelTuple::kernel_type, PlaceType()); auto& pool = KernelPool().Instance().AllKernels(); auto iter = pool.find(kkey); if (iter != pool.end()) { auto& impls = iter->second; for (auto& impl : impls) { - auto i = dynamic_cast*>(impl.get()); + auto i = dynamic_cast*>(impl.get()); if (i && i->UseMe(attr)) { return i->GetFunc(); } @@ -118,48 +119,50 @@ typename KernelTuples::func_type Get( } // The last implementation should be reference function on CPUPlace. - return GetRefer(); + return GetRefer(); } -template +template class KernelFuncs { public: KernelFuncs() = default; static KernelFuncs& Cache() { - static thread_local KernelFuncs g_func_cache; + static thread_local KernelFuncs g_func_cache; return g_func_cache; } // the exposed interface to use - typename KernelTuples::func_type At( - const typename KernelTuples::attr_type& attr) { + typename KernelTuple::func_type At( + const typename KernelTuple::attr_type& attr) { // XXH64: 13.8 GB/s - int64_t key = XXH64(&attr, sizeof(typename KernelTuples::attr_type), 0); + // TODO(TJ): change me, maybe not all attr change need one key, should be + // attrkey + int64_t key = XXH64(&attr, sizeof(typename KernelTuple::attr_type), 0); if (Has(key)) { return funcs_.at(key); } // If do not have this attr in cache, // then could run some runtime benchmark of this attr and save the best one. // Here just get the offline benchmarked best one. - auto func = Get(attr); + auto func = Get(attr); Insert(key, func); return func; } - typename KernelTuples::func_type operator[]( - const typename KernelTuples::attr_type& attr) { + typename KernelTuple::func_type operator[]( + const typename KernelTuple::attr_type& attr) { return At(attr); } protected: bool Has(int64_t key) const { return funcs_.find(key) != funcs_.end(); } - void Insert(int64_t key, typename KernelTuples::func_type func) { + void Insert(int64_t key, typename KernelTuple::func_type func) { funcs_.emplace(key, func); } private: - std::unordered_map funcs_; + std::unordered_map funcs_; DISABLE_COPY_AND_ASSIGN(KernelFuncs); }; diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 96e162a21b..e8dbcced4f 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -62,26 +62,55 @@ typedef enum { kSqrt, } SeqPoolType; +// x, y, z, n template -struct XYZNTuples { +struct XYZNTuple { typedef T data_type; typedef int attr_type; typedef void (*func_type)(const T*, const T*, T*, int); }; +// a, x, y, n template -struct AXYNTuples : public XYZNTuples {}; +struct AXYNTuple : public XYZNTuple {}; +// x, y, n template -struct XYNTuples { +struct XYNTuple { typedef T data_type; typedef int attr_type; typedef void (*func_type)(const T*, T*, int); }; -// x, return and int +// x, returned value, n template -struct XRNTuples : public XYNTuples {}; +struct XRNTuple : public XYNTuple {}; + +#define DECLARE_KERNELTUPLE(kernel_tuple, type) \ + template \ + struct type##Tuple : public kernel_tuple { \ + static constexpr KernelType kernel_type = k##type; \ + } + +// Tuple should be corresponding to the KernelType +DECLARE_KERNELTUPLE(XYZNTuple, VMul); +DECLARE_KERNELTUPLE(XYZNTuple, VAdd); +DECLARE_KERNELTUPLE(XYZNTuple, VAddRelu); +DECLARE_KERNELTUPLE(XYZNTuple, VSub); + +DECLARE_KERNELTUPLE(AXYNTuple, VScal); +DECLARE_KERNELTUPLE(AXYNTuple, VAddBias); + +DECLARE_KERNELTUPLE(XYNTuple, VRelu); +DECLARE_KERNELTUPLE(XYNTuple, VIdentity); +DECLARE_KERNELTUPLE(XYNTuple, VSquare); +DECLARE_KERNELTUPLE(XYNTuple, VExp); +DECLARE_KERNELTUPLE(XYNTuple, VSigmoid); +DECLARE_KERNELTUPLE(XYNTuple, VTanh); +DECLARE_KERNELTUPLE(XYNTuple, VCopy); + +DECLARE_KERNELTUPLE(XRNTuple, HMax); +DECLARE_KERNELTUPLE(XRNTuple, HSum); typedef struct { void* gates; // gates: x_ch, x_ih, x_fh, x_oh @@ -122,21 +151,31 @@ typedef struct rnn_attr_s gru_attr_t; typedef struct lstm_attr_s lstm_attr_t; template -struct LSTMTuples { +struct LSTMTuple { typedef T data_type; typedef lstm_attr_t attr_type; typedef void (*func_type)(lstm_t*, const lstm_attr_t*); }; template -struct GRUTuples { +struct GRUTuple { typedef T data_type; typedef gru_attr_t attr_type; typedef void (*func_type)(gru_t*, const gru_attr_t*); }; +DECLARE_KERNELTUPLE(LSTMTuple, LSTMCtHt); +DECLARE_KERNELTUPLE(LSTMTuple, LSTMC1H1); + +DECLARE_KERNELTUPLE(GRUTuple, GRUH1); +DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart1); +DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart2); + +#undef DECLARE_KERNELTUPLE + template -struct VBroadcastTuples { +struct VBroadcastTuple { + static constexpr KernelType kernel_type = kVBroadcast; typedef T data_type; typedef int64_t attr_type; typedef void (*func_type)(const T*, T*, int64_t, int64_t); @@ -151,7 +190,8 @@ typedef struct seq_pool_attr_s { } seq_pool_attr_t; template -struct SeqPoolTuples { +struct SeqPoolTuple { + static constexpr KernelType kernel_type = kSeqPool; typedef T data_type; typedef seq_pool_attr_t attr_type; typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*); @@ -176,7 +216,8 @@ typedef struct emb_seq_pool_attr_s { } emb_seq_pool_attr_t; template -struct EmbSeqPoolTuples { +struct EmbSeqPoolTuple { + static constexpr KernelType kernel_type = kEmbSeqPool; typedef T data_type; typedef emb_seq_pool_attr_t attr_type; typedef void (*func_type)(const T*, const int64_t*, T*, @@ -198,7 +239,8 @@ typedef struct sgd_attr_s { } sgd_attr_t; template -struct SgdTuples { +struct SgdTuple { + static constexpr KernelType kernel_type = kSgd; typedef T data_type; typedef sgd_attr_t attr_type; typedef void (*func_type)(const T*, const T*, const T*, const int64_t*, T*, @@ -214,21 +256,24 @@ typedef struct matmul_attr_s { } matmul_attr_t; template -struct MatMulTuples { +struct MatMulTuple { + static constexpr KernelType kernel_type = kMatMul; typedef T data_type; typedef matmul_attr_t attr_type; typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*); }; template -struct CRFDecodingTuples { +struct CRFDecodingTuple { + static constexpr KernelType kernel_type = kCRFDecoding; typedef T data_type; typedef int attr_type; typedef void (*func_type)(const int, const T*, const T*, T*, int*, int); }; template -struct LayerNormTuples { +struct LayerNormTuple { + static constexpr KernelType kernel_type = kLayerNorm; typedef T data_type; typedef int attr_type; typedef void (*func_type)(T*, T*, T*, T*, const T*, const T*, int, @@ -236,7 +281,8 @@ struct LayerNormTuples { }; template -struct SoftmaxTuples { +struct SoftmaxTuple { + static constexpr KernelType kernel_type = kSoftmax; typedef T data_type; typedef int attr_type; typedef void (*func_type)(const T*, T*, int, int); @@ -244,7 +290,8 @@ struct SoftmaxTuples { // nChw16c = nChw16c .* NC template -struct NCHW16CMulNCTuples { +struct NCHW16CMulNCTuple { + static constexpr KernelType kernel_type = kNCHW16CMulNC; typedef T data_type; typedef int attr_type; typedef void (*func_type)(const T*, const T*, T*, int, int); @@ -258,12 +305,12 @@ class Kernel { DISABLE_COPY_AND_ASSIGN(Kernel); }; -template +template class KernelMore : public Kernel { public: - using T = typename KernelTuples::data_type; - using Func = typename KernelTuples::func_type; - using Attr = typename KernelTuples::attr_type; + using T = typename KernelTuple::data_type; + using Func = typename KernelTuple::func_type; + using Attr = typename KernelTuple::attr_type; virtual Func GetFunc() const { return func; } virtual bool UseMe(const Attr& attr) const = 0; virtual const char* ImplType() const = 0; @@ -272,11 +319,11 @@ class KernelMore : public Kernel { Func func{nullptr}; }; -template -class ReferKernel : public KernelMore { +template +class ReferKernel : public KernelMore { public: // Refer code can always be used - bool UseMe(const typename KernelTuples::attr_type& attr) const override { + bool UseMe(const typename KernelTuple::attr_type& attr) const override { return true; } const char* ImplType() const override { return "Refer"; } diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h index 24179d90dd..f4187bd3ba 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h +++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h @@ -26,11 +26,10 @@ namespace intrinsic { void CRFDecoding(const int seq_len, const float* x, const float* w, float* alpha, int* track, int tag_num); -class CRFDecodingKernel : public KernelMore> { +class CRFDecodingKernel : public KernelMore> { public: CRFDecodingKernel() { this->func = CRFDecoding; } - bool UseMe( - const typename CRFDecodingTuples::attr_type&) const override; + bool UseMe(const typename CRFDecodingTuple::attr_type&) const override; const char* ImplType() const override { return "Intrinsic"; } }; diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h index 89da2940f4..dfa4c2f072 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h +++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h @@ -27,10 +27,10 @@ void LayerNorm(float* x, float* out, float* mean, float* var, const float* scale, const float* bias, int height, const float epsilon, int right); -class LayerNormKernel : public KernelMore> { +class LayerNormKernel : public KernelMore> { public: LayerNormKernel() { this->func = LayerNorm; } - bool UseMe(const typename LayerNormTuples::attr_type&) const override; + bool UseMe(const typename LayerNormTuple::attr_type&) const override; const char* ImplType() const override { return "Intrinsic"; } }; diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index 0036d1c238..9ee1032e95 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -23,6 +23,8 @@ namespace jit { namespace more { namespace mix { +using CPUPlace = platform::CPUPlace; + void VSigmoid(const T* x, T* y, int n) { const float min = SIGMOID_THRESHOLD_MIN; const float max = SIGMOID_THRESHOLD_MAX; @@ -30,7 +32,7 @@ void VSigmoid(const T* x, T* y, int n) { y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); y[i] = static_cast(0) - y[i]; } - auto compute = Get, platform::CPUPlace>(n); + auto compute = KernelFuncs, CPUPlace>::Cache().At(n); compute(y, y, n); for (int i = 0; i < n; ++i) { y[i] = static_cast(1) / (static_cast(1) + y[i]); @@ -39,9 +41,9 @@ void VSigmoid(const T* x, T* y, int n) { void VTanh(const T* x, T* y, int n) { const T a = 2, b = -1; - auto compute_scal = Get, platform::CPUPlace>(n); - auto compute_addbias = Get, platform::CPUPlace>(n); - auto compute_sigmoid = Get, platform::CPUPlace>(n); + auto compute_scal = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_addbias = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_sigmoid = KernelFuncs, CPUPlace>::Cache().At(n); compute_scal(&a, x, y, n); compute_sigmoid(y, y, n); compute_scal(&a, y, y, n); @@ -49,16 +51,12 @@ void VTanh(const T* x, T* y, int n) { } void Softmax(const T* x, T* y, int n, int bs) { - auto compute_hmax = - KernelFuncs, platform::CPUPlace>::Cache().At(n); - auto compute_hsum = - KernelFuncs, platform::CPUPlace>::Cache().At(n); - auto compute_vscal = - KernelFuncs, platform::CPUPlace>::Cache().At(n); + auto compute_hmax = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_hsum = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_vscal = KernelFuncs, CPUPlace>::Cache().At(n); auto compute_vaddbias = - KernelFuncs, platform::CPUPlace>::Cache().At(n); - auto compute_vexp = - KernelFuncs, platform::CPUPlace>::Cache().At(n); + KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_vexp = KernelFuncs, CPUPlace>::Cache().At(n); for (int i = 0; i < bs; ++i) { T scalar; @@ -76,13 +74,13 @@ void Softmax(const T* x, T* y, int n, int bs) { void (*getActFunc(KernelType type, int d))(const T*, T*, int) { // NOLINT if (type == kVSigmoid) { - return Get, platform::CPUPlace>(d); + return KernelFuncs, CPUPlace>::Cache().At(d); } else if (type == kVRelu) { - return Get, platform::CPUPlace>(d); + return KernelFuncs, CPUPlace>::Cache().At(d); } else if (type == kVTanh) { - return Get, platform::CPUPlace>(d); + return KernelFuncs, CPUPlace>::Cache().At(d); } else if (type == kVIdentity) { - return Get, platform::CPUPlace>(d); + return KernelFuncs, CPUPlace>::Cache().At(d); } PADDLE_THROW("Not support type: %s", type); return nullptr; @@ -98,9 +96,9 @@ void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) { const int d = attr->d; const int d2 = d * 2; const int d3 = d * 3; - auto vmul_d = Get, platform::CPUPlace>(d); - auto vadd_d = Get, platform::CPUPlace>(d); - auto vadd_d2 = Get, platform::CPUPlace>(d2); + auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); + auto vadd_d = KernelFuncs, CPUPlace>::Cache().At(d); + auto vadd_d2 = KernelFuncs, CPUPlace>::Cache().At(d2); auto act_gate_d = getActFunc(attr->act_gate, d); auto act_gate_d2 = getActFunc(attr->act_gate, d2); auto act_gate_d3 = getActFunc(attr->act_gate, d3); @@ -140,8 +138,8 @@ void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { int d = attr->d; int d2 = d * 2; int d3 = d * 3; - auto vmul_d = Get, platform::CPUPlace>(d); - auto vadd_d = Get, platform::CPUPlace>(d); + auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); + auto vadd_d = KernelFuncs, CPUPlace>::Cache().At(d); auto act_gate_d = getActFunc(attr->act_gate, d); auto act_cand_d = getActFunc(attr->act_cand, d); auto act_cell_d = getActFunc(attr->act_cell, d); @@ -169,7 +167,7 @@ void GRUH1(gru_t* step, const gru_attr_t* attr) { int d2 = d * 2; auto act_gate = getActFunc(attr->act_gate, d); auto act_cand = getActFunc(attr->act_cand, d); - auto vmul_d = Get, platform::CPUPlace>(d); + auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); act_gate(gates, gates, d); act_cand(gates + d2, gates + d2, d); vmul_d(gates, gates + d2, ht, d); @@ -182,7 +180,7 @@ void GRUHtPart1(gru_t* step, const gru_attr_t* attr) { T* ht = reinterpret_cast(step->ht); const T* ht_1 = reinterpret_cast(step->ht_1); auto act_gate = getActFunc(attr->act_gate, attr->d); - auto vmul_d = Get, platform::CPUPlace>(attr->d); + auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(attr->d); act_gate(gates + attr->d, gates + attr->d, attr->d); vmul_d(ht_1, gates + attr->d, ht, attr->d); } @@ -230,16 +228,16 @@ bool GRUHtPart2Kernel::UseMe(const gru_attr_t& attr) const { return true; } namespace mix = paddle::operators::jit::more::mix; -#define REGISTER_MORE_KERNEL(key, func) \ - REGISTER_JITKERNEL_MORE(key, mix, mix::func##Kernel) - -REGISTER_MORE_KERNEL(kVSigmoid, VSigmoid); -REGISTER_MORE_KERNEL(kVTanh, VTanh); -REGISTER_MORE_KERNEL(kSoftmax, Softmax); -REGISTER_MORE_KERNEL(kLSTMCtHt, LSTMCtHt); -REGISTER_MORE_KERNEL(kLSTMC1H1, LSTMC1H1); -REGISTER_MORE_KERNEL(kGRUH1, GRUH1); -REGISTER_MORE_KERNEL(kGRUHtPart1, GRUHtPart1); -REGISTER_MORE_KERNEL(kGRUHtPart2, GRUHtPart2); +#define REGISTER_MORE_KERNEL(func) \ + REGISTER_JITKERNEL_MORE(k##func, mix, mix::func##Kernel) + +REGISTER_MORE_KERNEL(VSigmoid); +REGISTER_MORE_KERNEL(VTanh); +REGISTER_MORE_KERNEL(Softmax); +REGISTER_MORE_KERNEL(LSTMCtHt); +REGISTER_MORE_KERNEL(LSTMC1H1); +REGISTER_MORE_KERNEL(GRUH1); +REGISTER_MORE_KERNEL(GRUHtPart1); +REGISTER_MORE_KERNEL(GRUHtPart2); #undef REGISTER_MORE_KERNEL diff --git a/paddle/fluid/operators/jit/more/mix/mix.h b/paddle/fluid/operators/jit/more/mix/mix.h index d64af19219..17eb96462f 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.h +++ b/paddle/fluid/operators/jit/more/mix/mix.h @@ -34,27 +34,27 @@ void GRUH1(gru_t* step, const gru_attr_t* attr); void GRUHtPart1(gru_t* step, const gru_attr_t* attr); void GRUHtPart2(gru_t* step, const gru_attr_t* attr); -#define DECLARE_MORE_KERNEL(name, tuples) \ - class name##Kernel : public KernelMore> { \ - public: \ - name##Kernel() { this->func = name; } \ - bool UseMe(const typename tuples::attr_type&) const override; \ - const char* ImplType() const override { return "Mixed"; } \ +#define DECLARE_MORE_KERNEL(name) \ + class name##Kernel : public KernelMore> { \ + public: \ + name##Kernel() { this->func = name; } \ + bool UseMe(const typename name##Tuple::attr_type&) const override; \ + const char* ImplType() const override { return "Mixed"; } \ } // XYN -DECLARE_MORE_KERNEL(VSigmoid, XYNTuples); -DECLARE_MORE_KERNEL(VTanh, XYNTuples); +DECLARE_MORE_KERNEL(VSigmoid); +DECLARE_MORE_KERNEL(VTanh); // XRN -DECLARE_MORE_KERNEL(Softmax, SoftmaxTuples); +DECLARE_MORE_KERNEL(Softmax); -DECLARE_MORE_KERNEL(LSTMCtHt, LSTMTuples); -DECLARE_MORE_KERNEL(LSTMC1H1, LSTMTuples); +DECLARE_MORE_KERNEL(LSTMCtHt); +DECLARE_MORE_KERNEL(LSTMC1H1); -DECLARE_MORE_KERNEL(GRUH1, GRUTuples); -DECLARE_MORE_KERNEL(GRUHtPart1, GRUTuples); -DECLARE_MORE_KERNEL(GRUHtPart2, GRUTuples); +DECLARE_MORE_KERNEL(GRUH1); +DECLARE_MORE_KERNEL(GRUHtPart1); +DECLARE_MORE_KERNEL(GRUHtPart2); #undef DECLARE_MORE_KERNEL diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index 4f51353bce..084ea571ce 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -250,23 +250,23 @@ AWALYS_USE_ME_WITH_DOUBLE(Softmax); namespace mkl = paddle::operators::jit::more::mkl; -#define REGISTER_MKL_KERNEL(key, func) \ - REGISTER_JITKERNEL_MORE(key, mkl, mkl::func##Kernel, \ +#define REGISTER_MKL_KERNEL(func) \ + REGISTER_JITKERNEL_MORE(k##func, mkl, mkl::func##Kernel, \ mkl::func##Kernel) -REGISTER_MKL_KERNEL(kMatMul, MatMul); -REGISTER_MKL_KERNEL(kVMul, VMul); -REGISTER_MKL_KERNEL(kVAdd, VAdd); -REGISTER_MKL_KERNEL(kVScal, VScal); -REGISTER_MKL_KERNEL(kVExp, VExp); -REGISTER_MKL_KERNEL(kVSquare, VSquare); -REGISTER_MKL_KERNEL(kVCopy, VCopy); -REGISTER_MKL_KERNEL(kVBroadcast, VBroadcast); -REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid); -REGISTER_MKL_KERNEL(kVTanh, VTanh); -REGISTER_MKL_KERNEL(kSeqPool, SeqPool); -REGISTER_MKL_KERNEL(kEmbSeqPool, EmbSeqPool); -REGISTER_MKL_KERNEL(kSoftmax, Softmax); -REGISTER_MKL_KERNEL(kSgd, Sgd); +REGISTER_MKL_KERNEL(MatMul); +REGISTER_MKL_KERNEL(VMul); +REGISTER_MKL_KERNEL(VAdd); +REGISTER_MKL_KERNEL(VScal); +REGISTER_MKL_KERNEL(VExp); +REGISTER_MKL_KERNEL(VSquare); +REGISTER_MKL_KERNEL(VCopy); +REGISTER_MKL_KERNEL(VBroadcast); +REGISTER_MKL_KERNEL(VSigmoid); +REGISTER_MKL_KERNEL(VTanh); +REGISTER_MKL_KERNEL(SeqPool); +REGISTER_MKL_KERNEL(EmbSeqPool); +REGISTER_MKL_KERNEL(Softmax); +REGISTER_MKL_KERNEL(Sgd); #undef REGISTER_MKL_KERNEL diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index db2d6faed4..8c1d8b57e0 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -175,41 +175,38 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, } } -#define DECLARE_MKL_KERNEL(name, tuples) \ - template \ - class name##Kernel : public KernelMore> { \ - public: \ - name##Kernel() { this->func = name; } \ - bool UseMe(const typename tuples::attr_type&) const override; \ - const char* ImplType() const override { return "MKL"; } \ +#define DECLARE_MKL_KERNEL(name) \ + template \ + class name##Kernel : public KernelMore> { \ + public: \ + name##Kernel() { this->func = name; } \ + bool UseMe(const typename name##Tuple::attr_type&) const override; \ + const char* ImplType() const override { return "MKL"; } \ } // ABCMNK -DECLARE_MKL_KERNEL(MatMul, MatMulTuples); +DECLARE_MKL_KERNEL(MatMul); // XYZN -DECLARE_MKL_KERNEL(VMul, XYZNTuples); -DECLARE_MKL_KERNEL(VAdd, XYZNTuples); +DECLARE_MKL_KERNEL(VMul); +DECLARE_MKL_KERNEL(VAdd); // AXYN -DECLARE_MKL_KERNEL(VScal, AXYNTuples); +DECLARE_MKL_KERNEL(VScal); // XYN -DECLARE_MKL_KERNEL(VExp, XYNTuples); -DECLARE_MKL_KERNEL(VSigmoid, XYNTuples); -DECLARE_MKL_KERNEL(VTanh, XYNTuples); -DECLARE_MKL_KERNEL(VSquare, XYNTuples); -DECLARE_MKL_KERNEL(VCopy, XYNTuples); - -DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples); - -DECLARE_MKL_KERNEL(EmbSeqPool, EmbSeqPoolTuples); - -DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples); - -DECLARE_MKL_KERNEL(Sgd, SgdTuples); - -DECLARE_MKL_KERNEL(VBroadcast, VBroadcastTuples); +DECLARE_MKL_KERNEL(VExp); +DECLARE_MKL_KERNEL(VSigmoid); +DECLARE_MKL_KERNEL(VTanh); +DECLARE_MKL_KERNEL(VSquare); +DECLARE_MKL_KERNEL(VCopy); + +// others +DECLARE_MKL_KERNEL(SeqPool); +DECLARE_MKL_KERNEL(EmbSeqPool); +DECLARE_MKL_KERNEL(Softmax); +DECLARE_MKL_KERNEL(Sgd); +DECLARE_MKL_KERNEL(VBroadcast); #undef DECLARE_MKL_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index c279d1b2ca..0d1c477090 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -17,51 +17,43 @@ namespace refer = paddle::operators::jit::refer; -#define REGISTER_REFER_KERNEL(key, func) \ - REGISTER_JITKERNEL_REFER(key, refer::func##Kernel, \ +#define REGISTER_REFER_KERNEL(func) \ + REGISTER_JITKERNEL_REFER(k##func, refer::func##Kernel, \ refer::func##Kernel) -REGISTER_REFER_KERNEL(kVMul, VMul); -REGISTER_REFER_KERNEL(kVAdd, VAdd); -REGISTER_REFER_KERNEL(kVAddRelu, VAddRelu); -REGISTER_REFER_KERNEL(kVSub, VSub); - -REGISTER_REFER_KERNEL(kVScal, VScal); -REGISTER_REFER_KERNEL(kVAddBias, VAddBias); - -REGISTER_REFER_KERNEL(kVRelu, VRelu); -REGISTER_REFER_KERNEL(kVCopy, VCopy); -REGISTER_REFER_KERNEL(kVIdentity, VIdentity); -REGISTER_REFER_KERNEL(kVSquare, VSquare); -REGISTER_REFER_KERNEL(kVExp, VExp); -REGISTER_REFER_KERNEL(kVSigmoid, VSigmoid); -REGISTER_REFER_KERNEL(kVTanh, VTanh); - -REGISTER_REFER_KERNEL(kLSTMCtHt, LSTMCtHt); -REGISTER_REFER_KERNEL(kLSTMC1H1, LSTMC1H1); - -REGISTER_REFER_KERNEL(kGRUH1, GRUH1); -REGISTER_REFER_KERNEL(kGRUHtPart1, GRUHtPart1); -REGISTER_REFER_KERNEL(kGRUHtPart2, GRUHtPart2); - -REGISTER_REFER_KERNEL(kCRFDecoding, CRFDecoding); -REGISTER_REFER_KERNEL(kLayerNorm, LayerNorm); - -REGISTER_REFER_KERNEL(kNCHW16CMulNC, NCHW16CMulNC); - -REGISTER_REFER_KERNEL(kSeqPool, SeqPool); - -REGISTER_REFER_KERNEL(kMatMul, MatMul); - -REGISTER_REFER_KERNEL(kHMax, HMax); -REGISTER_REFER_KERNEL(kHSum, HSum); - -REGISTER_REFER_KERNEL(kSoftmax, Softmax); - -REGISTER_REFER_KERNEL(kEmbSeqPool, EmbSeqPool); - -REGISTER_REFER_KERNEL(kSgd, Sgd); - -REGISTER_REFER_KERNEL(kVBroadcast, VBroadcast); +REGISTER_REFER_KERNEL(VMul); +REGISTER_REFER_KERNEL(VAdd); +REGISTER_REFER_KERNEL(VAddRelu); +REGISTER_REFER_KERNEL(VSub); + +REGISTER_REFER_KERNEL(VScal); +REGISTER_REFER_KERNEL(VAddBias); + +REGISTER_REFER_KERNEL(VRelu); +REGISTER_REFER_KERNEL(VCopy); +REGISTER_REFER_KERNEL(VIdentity); +REGISTER_REFER_KERNEL(VSquare); +REGISTER_REFER_KERNEL(VExp); +REGISTER_REFER_KERNEL(VSigmoid); +REGISTER_REFER_KERNEL(VTanh); + +REGISTER_REFER_KERNEL(LSTMCtHt); +REGISTER_REFER_KERNEL(LSTMC1H1); + +REGISTER_REFER_KERNEL(GRUH1); +REGISTER_REFER_KERNEL(GRUHtPart1); +REGISTER_REFER_KERNEL(GRUHtPart2); + +REGISTER_REFER_KERNEL(CRFDecoding); +REGISTER_REFER_KERNEL(LayerNorm); +REGISTER_REFER_KERNEL(NCHW16CMulNC); +REGISTER_REFER_KERNEL(SeqPool); +REGISTER_REFER_KERNEL(MatMul); +REGISTER_REFER_KERNEL(HMax); +REGISTER_REFER_KERNEL(HSum); +REGISTER_REFER_KERNEL(Softmax); +REGISTER_REFER_KERNEL(EmbSeqPool); +REGISTER_REFER_KERNEL(Sgd); +REGISTER_REFER_KERNEL(VBroadcast); #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index b3b2097828..cac705a484 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -490,60 +490,54 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, } } -#define DECLARE_REFER_KERNEL(name, tuples) \ - template \ - class name##Kernel : public ReferKernel> { \ - public: \ - name##Kernel() { this->func = name; } \ +#define DECLARE_REFER_KERNEL(name) \ + template \ + class name##Kernel : public ReferKernel> { \ + public: \ + name##Kernel() { this->func = name; } \ } // const T* x, const T* y, T* z, int n -DECLARE_REFER_KERNEL(VMul, XYZNTuples); -DECLARE_REFER_KERNEL(VAdd, XYZNTuples); -DECLARE_REFER_KERNEL(VAddRelu, XYZNTuples); -DECLARE_REFER_KERNEL(VSub, XYZNTuples); +DECLARE_REFER_KERNEL(VMul); +DECLARE_REFER_KERNEL(VAdd); +DECLARE_REFER_KERNEL(VAddRelu); +DECLARE_REFER_KERNEL(VSub); // const T* a, const T* x, T* y, int n -DECLARE_REFER_KERNEL(VScal, AXYNTuples); -DECLARE_REFER_KERNEL(VAddBias, AXYNTuples); +DECLARE_REFER_KERNEL(VScal); +DECLARE_REFER_KERNEL(VAddBias); // const T* x, T* y, int n -DECLARE_REFER_KERNEL(VRelu, XYNTuples); -DECLARE_REFER_KERNEL(VIdentity, XYNTuples); -DECLARE_REFER_KERNEL(VExp, XYNTuples); -DECLARE_REFER_KERNEL(VSigmoid, XYNTuples); -DECLARE_REFER_KERNEL(VTanh, XYNTuples); -DECLARE_REFER_KERNEL(VSquare, XYNTuples); -DECLARE_REFER_KERNEL(VCopy, XYNTuples); +DECLARE_REFER_KERNEL(VRelu); +DECLARE_REFER_KERNEL(VIdentity); +DECLARE_REFER_KERNEL(VExp); +DECLARE_REFER_KERNEL(VSigmoid); +DECLARE_REFER_KERNEL(VTanh); +DECLARE_REFER_KERNEL(VSquare); +DECLARE_REFER_KERNEL(VCopy); // lstm_t*, const lstm_attr_t* -DECLARE_REFER_KERNEL(LSTMCtHt, LSTMTuples); -DECLARE_REFER_KERNEL(LSTMC1H1, LSTMTuples); +DECLARE_REFER_KERNEL(LSTMCtHt); +DECLARE_REFER_KERNEL(LSTMC1H1); // gru_t*, const gru_attr_t* -DECLARE_REFER_KERNEL(GRUH1, GRUTuples); -DECLARE_REFER_KERNEL(GRUHtPart1, GRUTuples); -DECLARE_REFER_KERNEL(GRUHtPart2, GRUTuples); - -DECLARE_REFER_KERNEL(CRFDecoding, CRFDecodingTuples); -DECLARE_REFER_KERNEL(LayerNorm, LayerNormTuples); - -DECLARE_REFER_KERNEL(NCHW16CMulNC, NCHW16CMulNCTuples); - -DECLARE_REFER_KERNEL(SeqPool, SeqPoolTuples); - -DECLARE_REFER_KERNEL(MatMul, MatMulTuples); - -DECLARE_REFER_KERNEL(HMax, XRNTuples); -DECLARE_REFER_KERNEL(HSum, XRNTuples); - -DECLARE_REFER_KERNEL(Softmax, SoftmaxTuples); - -DECLARE_REFER_KERNEL(EmbSeqPool, EmbSeqPoolTuples); - -DECLARE_REFER_KERNEL(Sgd, SgdTuples); - -DECLARE_REFER_KERNEL(VBroadcast, VBroadcastTuples); +DECLARE_REFER_KERNEL(GRUH1); +DECLARE_REFER_KERNEL(GRUHtPart1); +DECLARE_REFER_KERNEL(GRUHtPart2); + +DECLARE_REFER_KERNEL(HMax); +DECLARE_REFER_KERNEL(HSum); + +// others +DECLARE_REFER_KERNEL(CRFDecoding); +DECLARE_REFER_KERNEL(LayerNorm); +DECLARE_REFER_KERNEL(NCHW16CMulNC); +DECLARE_REFER_KERNEL(SeqPool); +DECLARE_REFER_KERNEL(MatMul); +DECLARE_REFER_KERNEL(Softmax); +DECLARE_REFER_KERNEL(EmbSeqPool); +DECLARE_REFER_KERNEL(Sgd); +DECLARE_REFER_KERNEL(VBroadcast); #undef DECLARE_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 18f8c09f14..a574bf2079 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -64,413 +64,43 @@ std::vector TestSizes() { namespace jit = paddle::operators::jit; using CPUPlace = paddle::platform::CPUPlace; -template -struct TestFuncWithRefer { - void operator()(const typename KernelTuples::func_type tgt, Args... args) { - LOG(FATAL) << "Should specify this function."; - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - std::vector> { - void operator()(const typename jit::XYZNTuples::func_type tgt, - const std::vector& x, const std::vector& y, - const std::vector& zref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(zref.size(), x.size()); - EXPECT_EQ(zref.size(), y.size()); - const T* x_data = x.data(); - const T* y_data = y.data(); - const T* zref_data = zref.data(); - const int d = zref.size(); - - std::vector ztgt(d); - T* ztgt_data = ztgt.data(); - // test normal - tgt(x_data, y_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ztgt.begin()); - tgt(ztgt_data, y_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - // test inplace y - std::copy(y.begin(), y.end(), ztgt.begin()); - tgt(x_data, ztgt_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - } -}; - -template -struct TestFuncWithRefer, T, std::vector, - std::vector> { - void operator()(const typename jit::AXYNTuples::func_type tgt, const T a, - const std::vector& x, const std::vector& yref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - const int d = yref.size(); - std::vector ytgt(d); - T* ytgt_data = ytgt.data(); - // test normal - tgt(&a, x_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(&a, ytgt_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - int, int> { - void operator()(const typename jit::SoftmaxTuples::func_type tgt, - const std::vector& x, const std::vector& yref, int n, - int bs) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - EXPECT_EQ(x.size(), static_cast(n * bs)); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - std::vector ytgt(n * bs); - T* ytgt_data = ytgt.data(); - // test normal - tgt(x_data, ytgt_data, n, bs); - ExpectEQ(ytgt_data, yref_data, n * bs); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(ytgt_data, ytgt_data, n, bs); - ExpectEQ(ytgt_data, yref_data, n * bs); - } -}; - -template -struct TestFuncWithRefer, std::vector, T> { - void operator()(const typename jit::XRNTuples::func_type tgt, - const std::vector& x, const T ref_res) { - EXPECT_TRUE(tgt != nullptr); - T tgt_res; - tgt(x.data(), &tgt_res, x.size()); - ExpectEQ(&tgt_res, &ref_res, 1); - } -}; - -template -struct TestFuncWithRefer, std::vector, - std::vector, int64_t, - typename jit::VBroadcastTuples::attr_type> { - void operator()(const typename jit::VBroadcastTuples::func_type tgt, - const std::vector& x, const std::vector& yref, - int64_t h, - const typename jit::VBroadcastTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(x.size(), static_cast(attr)); - EXPECT_EQ(yref.size(), x.size() * h); - std::vector y(yref.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - T* y_data = y.data(); - tgt(x_data, y_data, h, attr); - ExpectEQ(y_data, yref_data, yref.size()); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector> { - void operator()(const typename jit::XYNTuples::func_type tgt, - const std::vector& x, const std::vector& yref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - const int d = yref.size(); - std::vector ytgt(d); - T* ytgt_data = ytgt.data(); - // test normal - tgt(x_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(ytgt_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - std::vector, std::vector, std::vector, - typename jit::LSTMTuples::attr_type> { - void operator()(const typename jit::LSTMTuples::func_type tgt, - const std::vector& xsrc, const std::vector& wp, - const std::vector& ct_1, const std::vector& ct_ref, - const std::vector& ht_ref, - const typename jit::LSTMTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(ct_ref.size(), ht_ref.size()); - EXPECT_EQ(ct_1.size(), ht_ref.size()); - EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); - EXPECT_EQ(wp.size(), 3 * ht_ref.size()); - - // x could be changed after compute, so copy to save src - int d = ht_ref.size(); - std::vector x(xsrc.size()), ct(ct_ref.size()), ht(ht_ref.size()); - std::vector checked(2 * d); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - - const T* ct_1_data = ct_1.data(); - const T* wp_data = wp.data(); - const T* ct_ref_data = ct_ref.data(); - const T* ht_ref_data = ht_ref.data(); - T* x_data = x.data(); - T* ct_data = ct.data(); - T* ht_data = ht.data(); - T* checked_data = checked.data(); - - jit::lstm_t step; - step.gates = x_data; - step.ct_1 = ct_1_data; - step.ct = ct_data; - step.ht = ht_data; - if (attr.use_peephole) { - step.wp = wp_data; - step.checked = checked_data; - } - - tgt(&step, &attr); - ExpectEQ(ct_data, ct_ref_data, d); - ExpectEQ(ht_data, ht_ref_data, d); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - std::vector, - typename jit::GRUTuples::attr_type> { - void operator()(const typename jit::GRUTuples::func_type tgt, - const std::vector& xsrc, const std::vector& ht_1, - const std::vector& ht_ref, - const typename jit::GRUTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(ht_1.size(), ht_ref.size()); - EXPECT_EQ(xsrc.size(), 3 * ht_ref.size()); - - // x could be changed after compute, so copy to save src - int d = ht_ref.size(); - std::vector x(xsrc.size()), ht(ht_ref.size()); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - const T* ht_1_data = ht_1.data(); - const T* ht_ref_data = ht_ref.data(); - T* x_data = x.data(); - T* ht_data = ht.data(); - jit::gru_t step; - step.gates = x_data; - step.ht_1 = ht_1_data; - step.ht = ht_data; - tgt(&step, &attr); - ExpectEQ(ht_data, ht_ref_data, d); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - typename jit::SeqPoolTuples::attr_type> { - void operator()(const typename jit::SeqPoolTuples::func_type tgt, - const std::vector& x, const std::vector& yref, - const typename jit::SeqPoolTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(x.size() % yref.size(), static_cast(0)); - int w = yref.size(); - std::vector y(w); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - T* y_data = y.data(); - tgt(x_data, y_data, &attr); - ExpectEQ(y_data, yref_data, w); - } -}; - -template -struct TestFuncWithRefer, std::vector, - std::vector, std::vector, - typename jit::EmbSeqPoolTuples::attr_type> { - void operator()(const typename jit::EmbSeqPoolTuples::func_type tgt, - const std::vector& table, const std::vector& idx, - const std::vector& oref, - const typename jit::EmbSeqPoolTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(table.size(), - static_cast(attr.table_height * attr.table_width)); - EXPECT_EQ(idx.size(), - static_cast(attr.index_height * attr.index_width)); - EXPECT_EQ(oref.size(), - static_cast(attr.table_width * attr.index_width)); - const T* table_data = table.data(); - const int64_t* idx_data = idx.data(); - const T* oref_data = oref.data(); - int o_w = oref.size(); - std::vector out(o_w); - T* o_data = out.data(); - tgt(table_data, idx_data, o_data, &attr); - ExpectEQ(o_data, oref_data, o_w); - } -}; - -template -struct TestFuncWithRefer, T, std::vector, std::vector, - std::vector, std::vector, - typename jit::SgdTuples::attr_type> { - void operator()(const typename jit::SgdTuples::func_type tgt, const T lr, - const std::vector& param, const std::vector& grad, - const std::vector& rows, const std::vector& oref, - const typename jit::SgdTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(param.size(), - static_cast(attr.param_height * attr.param_width)); - EXPECT_EQ(grad.size(), - static_cast(attr.grad_height * attr.grad_width)); - EXPECT_EQ(rows.size(), static_cast(attr.selected_rows_size)); - EXPECT_EQ(param.size(), oref.size()); - const T* param_data = param.data(); - const T* grad_data = grad.data(); - const int64_t* rows_data = rows.data(); - const T* oref_data = oref.data(); - - std::vector out(oref.size()); - T* o_data = out.data(); - tgt(&lr, param_data, grad_data, rows_data, o_data, &attr); - // only the selected rows should be equal - for (size_t i = 0; i < rows.size(); ++i) { - ExpectEQ(o_data + rows[i] * attr.grad_width, - oref_data + rows[i] * attr.grad_width, attr.grad_width); - } - - // inplace - std::copy(param.begin(), param.end(), out.begin()); - tgt(&lr, o_data, grad_data, rows_data, o_data, &attr); - for (size_t i = 0; i < rows.size(); ++i) { - ExpectEQ(o_data + rows[i] * attr.grad_width, - oref_data + rows[i] * attr.grad_width, attr.grad_width); - } - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - std::vector, - typename jit::MatMulTuples::attr_type> { - void operator()(const typename jit::MatMulTuples::func_type tgt, - const std::vector& a, const std::vector& b, - const std::vector& cref, - const typename jit::MatMulTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(a.size(), static_cast(attr.m * attr.k)); - EXPECT_EQ(b.size(), static_cast(attr.k * attr.n)); - EXPECT_EQ(cref.size(), static_cast(attr.m * attr.n)); - std::vector c(cref.size()); - const T* a_data = a.data(); - const T* b_data = b.data(); - const T* cref_data = cref.data(); - T* c_data = c.data(); - tgt(a_data, b_data, c_data, &attr); - ExpectEQ(c_data, cref_data, attr.m * attr.n); - } -}; - -template -struct TestFuncWithRefer, std::vector, - std::vector, std::vector, std::vector, - std::vector, std::vector, int, float, int> { - void operator()(const typename jit::LayerNormTuples::func_type tgt, - std::vector& x, std::vector& outref, // NOLINT - std::vector& mean, std::vector& var, // NOLINT - const std::vector& scale, const std::vector& bias, - int left, const float epsilon, int right) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(x.size(), static_cast(left * right)); - EXPECT_EQ(outref.size(), static_cast(left * right)); - EXPECT_EQ(mean.size(), static_cast(left)); - EXPECT_EQ(var.size(), static_cast(left)); - EXPECT_EQ(scale.size(), static_cast(right)); - EXPECT_EQ(bias.size(), static_cast(right)); - std::vector outtgt(outref.size()); - const T* scale_data = scale.data(); - const T* bias_data = bias.data(); - T* x_data = x.data(); - T* mean_data = mean.data(); - T* var_data = var.data(); - T* outref_data = outref.data(); - T* outtgt_data = outtgt.data(); - - tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data, left, - epsilon, right); - ExpectEQ(outtgt_data, outref_data, left * right); - } -}; - -template -struct TestFuncWithRefer, int, std::vector, - std::vector, std::vector, std::vector, - int> { - void operator()(const typename jit::CRFDecodingTuples::func_type tgt, - const int seq_len, const std::vector& x, - const std::vector& w, std::vector& alpharef, // NOLINT - std::vector& trackref, int tag_num) { // NOLINT - constexpr int state_trans_base_idx = 2; - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(x.size(), static_cast(seq_len * tag_num)); - EXPECT_EQ(w.size(), - static_cast((tag_num + state_trans_base_idx) * tag_num)); - EXPECT_EQ(alpharef.size(), static_cast(seq_len * tag_num)); - EXPECT_EQ(trackref.size(), static_cast(seq_len * tag_num)); - std::vector alphatgt(alpharef.size()); - std::vector tracktgt(trackref.size()); - - memcpy(trackref.data(), tracktgt.data(), tag_num * sizeof(int)); - tgt(seq_len, (const T*)x.data(), (const T*)w.data(), alphatgt.data(), - tracktgt.data(), tag_num); - ExpectEQ(alpharef.data(), alphatgt.data(), seq_len * tag_num); - ExpectEQ(trackref.data(), tracktgt.data(), seq_len * tag_num); - } -}; - -template -void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { - TestFuncWithRefer test; +void TestAllImpls(const typename KernelTuple::attr_type& attr, + const Tester& verifier, const Args&... args) { // test jitcode - auto jitcode = jit::GetJitCode(attr); + auto jitcode = jit::GetJitCode(attr); if (jitcode) { VLOG(10) << "Test Jitcode Kernel "; - test(jitcode, args...); + verifier(jitcode, args...); } // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); + jit::KernelKey kkey(KernelTuple::kernel_type, PlaceType()); auto& pool = jit::KernelPool().Instance().AllKernels(); auto iter = pool.find(kkey); if (iter != pool.end()) { auto& impls = iter->second; for (auto& impl : impls) { - auto i = dynamic_cast*>(impl.get()); + auto i = dynamic_cast*>(impl.get()); if (i && i->UseMe(attr)) { auto more = i->GetFunc(); VLOG(10) << "Test More Kernel : " << i->ImplType(); - test(more, args...); + verifier(more, args...); } } } // test result from Get function - // VLOG(10) << "Test Get function "; - auto tgt = jit::KernelFuncs::Cache().At(attr); - test(tgt, args...); + VLOG(10) << "Test final get function "; + auto tgt = jit::KernelFuncs::Cache().At(attr); + verifier(tgt, args...); } -template -void TestKernelXYZNTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelXYZN() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetRefer(); EXPECT_TRUE(ref != nullptr); std::vector x(d), y(d), zref(d); @@ -494,16 +124,42 @@ void TestKernelXYZNTuples() { ExpectEQ(xinp_data, zref_data, d); ExpectEQ(yinp_data, zref_data, d); - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector>(d, x, y, zref); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& y, + const std::vector& zref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(zref.size(), x.size()); + EXPECT_EQ(zref.size(), y.size()); + const T* x_data = x.data(); + const T* y_data = y.data(); + const T* zref_data = zref.data(); + const int d = zref.size(); + + std::vector ztgt(d); + T* ztgt_data = ztgt.data(); + // test normal + tgt(x_data, y_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ztgt.begin()); + tgt(ztgt_data, y_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + // test inplace y + std::copy(y.begin(), y.end(), ztgt.begin()); + tgt(x_data, ztgt_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + }; + + TestAllImpls(d, verifier, x, y, zref); } } -template -void TestKernelAXYNTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelAXYN() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetRefer(); EXPECT_TRUE(ref != nullptr); const T a = static_cast(3); @@ -520,34 +176,33 @@ void TestKernelAXYNTuples() { ref(&a, xinp_data, xinp_data, d); ExpectEQ(xinp_data, yref_data, d); - TestAllImpls, PlaceType, T, std::vector, - std::vector>(d, a, x, yref); - } -} - -template -void TestKernelXRNTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - auto last_acc = FLAGS_acc; - FLAGS_acc = 1e-4; - for (int d : TestSizes()) { - auto ref = jit::GetRefer>(); - EXPECT_TRUE(ref != nullptr); - std::vector x(d); - RandomVec(d, x.data()); - T ref_res; - ref(x.data(), &ref_res, d); - TestAllImpls, PlaceType, std::vector, T>(d, x, - ref_res); + auto verifier = [](const typename KernelTuple::func_type tgt, const T a, + const std::vector& x, const std::vector& yref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + const int d = yref.size(); + std::vector ytgt(d); + T* ytgt_data = ytgt.data(); + // test normal + tgt(&a, x_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(&a, ytgt_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + }; + TestAllImpls(d, verifier, a, x, yref); } - FLAGS_acc = last_acc; } -template -void TestKernelXYNTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelXYN() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetRefer(); EXPECT_TRUE(ref != nullptr); std::vector x(d), yref(d); @@ -562,15 +217,57 @@ void TestKernelXYNTuples() { ref(x_data, yref_data, d); ref(xinp_data, xinp_data, d); ExpectEQ(xinp_data, yref_data, d); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& yref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + const int d = yref.size(); + std::vector ytgt(d); + T* ytgt_data = ytgt.data(); + // test normal + tgt(x_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(ytgt_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + }; + TestAllImpls(d, verifier, x, yref); + } +} + +template +void TestKernelXRN() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + auto last_acc = FLAGS_acc; + FLAGS_acc = 1e-4; + for (int d : TestSizes()) { + auto ref = jit::GetRefer(); + EXPECT_TRUE(ref != nullptr); + std::vector x(d); + RandomVec(d, x.data()); + T ref_res; + ref(x.data(), &ref_res, d); - TestAllImpls, PlaceType, std::vector, - std::vector>(d, x, yref); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const T ref_res) { + EXPECT_TRUE(tgt != nullptr); + T tgt_res; + tgt(x.data(), &tgt_res, x.size()); + ExpectEQ(&tgt_res, &ref_res, 1); + }; + TestAllImpls(d, verifier, x, ref_res); } + FLAGS_acc = last_acc; } -template -void TestKernelLSTMTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelLSTM() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; auto test_sizes = TestSizes(); test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); @@ -582,7 +279,7 @@ void TestKernelLSTMTuples() { const jit::lstm_attr_t attr( d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand), jit::to_kerneltype(act_cell), use_peephole); - auto ref = jit::GetRefer>(); + auto ref = jit::GetRefer(); EXPECT_TRUE(ref != nullptr); std::vector xsrc(4 * d), wp(3 * d), ct_1(d); std::vector ct_ref(d), ht_ref(d), checked(2 * d); @@ -609,10 +306,51 @@ void TestKernelLSTMTuples() { } ref(&step, &attr); VLOG(10) << attr; - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector, std::vector, - std::vector>(attr, xsrc, wp, ct_1, ct_ref, ht_ref, - attr); + + auto verifier = []( + const typename KernelTuple::func_type tgt, + const std::vector& xsrc, const std::vector& wp, + const std::vector& ct_1, const std::vector& ct_ref, + const std::vector& ht_ref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(ct_ref.size(), ht_ref.size()); + EXPECT_EQ(ct_1.size(), ht_ref.size()); + EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); + EXPECT_EQ(wp.size(), 3 * ht_ref.size()); + + // x could be changed after compute, so copy to save src + int d = ht_ref.size(); + std::vector x(xsrc.size()), ct(ct_ref.size()), + ht(ht_ref.size()); + std::vector checked(2 * d); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + + const T* ct_1_data = ct_1.data(); + const T* wp_data = wp.data(); + const T* ct_ref_data = ct_ref.data(); + const T* ht_ref_data = ht_ref.data(); + T* x_data = x.data(); + T* ct_data = ct.data(); + T* ht_data = ht.data(); + T* checked_data = checked.data(); + + jit::lstm_t step; + step.gates = x_data; + step.ct_1 = ct_1_data; + step.ct = ct_data; + step.ht = ht_data; + if (attr.use_peephole) { + step.wp = wp_data; + step.checked = checked_data; + } + + tgt(&step, &attr); + ExpectEQ(ct_data, ct_ref_data, d); + ExpectEQ(ht_data, ht_ref_data, d); + }; + TestAllImpls(attr, verifier, xsrc, wp, ct_1, + ct_ref, ht_ref, attr); } } } @@ -620,9 +358,10 @@ void TestKernelLSTMTuples() { } } -template -void TestKernelGRUTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelGRU() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; auto test_sizes = TestSizes(); test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); @@ -631,7 +370,7 @@ void TestKernelGRUTuples() { for (auto& act_cand : all_acts) { const jit::gru_attr_t attr(d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand)); - auto ref = jit::GetRefer>(); + auto ref = jit::GetRefer(); EXPECT_TRUE(ref != nullptr); std::vector xsrc(3 * d), ht_1(d), ht_ref(d); RandomVec(3 * d, xsrc.data()); @@ -648,17 +387,216 @@ void TestKernelGRUTuples() { step.ht = ht_ref_data; ref(&step, &attr); VLOG(10) << attr; - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector>(attr, xsrc, ht_1, ht_ref, - attr); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& xsrc, + const std::vector& ht_1, + const std::vector& ht_ref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(ht_1.size(), ht_ref.size()); + EXPECT_EQ(xsrc.size(), 3 * ht_ref.size()); + + // x could be changed after compute, so copy to save src + int d = ht_ref.size(); + std::vector x(xsrc.size()), ht(ht_ref.size()); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + const T* ht_1_data = ht_1.data(); + const T* ht_ref_data = ht_ref.data(); + T* x_data = x.data(); + T* ht_data = ht.data(); + jit::gru_t step; + step.gates = x_data; + step.ht_1 = ht_1_data; + step.ht = ht_data; + tgt(&step, &attr); + ExpectEQ(ht_data, ht_ref_data, d); + }; + TestAllImpls(attr, verifier, xsrc, ht_1, ht_ref, + attr); } } } } -template -void TestKernelSeqPoolTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelNCHW16CMulNC() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + const int n = 3, c = 16 * 4, h = 10, w = 10; + auto ref = jit::GetRefer(); + EXPECT_TRUE(ref != nullptr); + int sz = n * c * h * w; + std::vector x(sz), y(n * c), zref(sz); + std::vector ztgt(sz), zjit(sz); + RandomVec(sz, x.data()); + RandomVec(n * c, y.data()); + + const T* x_data = x.data(); + const T* y_data = y.data(); + T* zref_data = zref.data(); + T* ztgt_data = ztgt.data(); + T* zjit_data = zjit.data(); + constexpr int simd_width = ZMM_FLOAT_BLOCK; + int C = c / simd_width; + auto tgt = jit::KernelFuncs::Cache().At(0); + auto jitcode = jit::GetJitCode(0); + EXPECT_TRUE(tgt != nullptr); + + if (std::is_same::value && + paddle::platform::MayIUse(paddle::platform::avx512f)) { + EXPECT_TRUE(jitcode != nullptr); + } + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < C; ci++) { + auto ptr_x = + x_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + auto ptr_y = y_data + ni * C * simd_width + ci * simd_width; + auto ptr_zref = + zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + auto ptr_ztgt = + ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + + ref(ptr_x, ptr_y, ptr_zref, h, w); + tgt(ptr_x, ptr_y, ptr_ztgt, h, w); + + if (jitcode) { + auto ptr_zjit = + zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + jitcode(ptr_x, ptr_y, ptr_zjit, h, w); + } + } + } + ExpectEQ(ztgt_data, zref_data, sz); + if (jitcode) { + ExpectEQ(zjit_data, zref_data, sz); + } +} + +template +void TestKernelLayerNorm() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + const T epsilon = 9.99999975e-06; + for (int n : {1, 2, 10}) { + for (int x_dim_0 : {1, 9, 17, 50}) { + int left = n * x_dim_0; + for (int x_dim_1 : TestSizes()) { + int right = x_dim_1; + auto ref = jit::GetRefer(); + EXPECT_TRUE(ref != nullptr); + int sz = left * right; + std::vector x(sz), mean(left), var(left), scale(right), bias(right), + outref(sz); + RandomVec(sz, x.data()); + RandomVec(left, mean.data()); + RandomVec(left, var.data()); + RandomVec(right, scale.data()); + RandomVec(right, bias.data()); + + const T* scale_data = scale.data(); + const T* bias_data = bias.data(); + T* x_data = x.data(); + T* mean_data = mean.data(); + T* var_data = var.data(); + T* outref_data = outref.data(); + + ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data, + left, epsilon, right); + + auto verifier = []( + const typename KernelTuple::func_type tgt, const std::vector& x_, + const std::vector& outref_, const std::vector& mean_, + const std::vector& var_, const std::vector& scale, + const std::vector& bias, const int& left, const float& epsilon, + const typename KernelTuple::attr_type& right) { + EXPECT_TRUE(tgt != nullptr); + std::vector outtgt(outref_.size()); + std::vector x(x_.size()); + std::vector mean(mean_.size()); + std::vector var(var_.size()); + std::vector outref(outref_.size()); + std::copy(x_.begin(), x_.end(), x.begin()); + std::copy(mean_.begin(), mean_.end(), mean.begin()); + std::copy(var_.begin(), var_.end(), var.begin()); + std::copy(outref_.begin(), outref_.end(), outref.begin()); + + EXPECT_EQ(x.size(), static_cast(left * right)); + EXPECT_EQ(outref.size(), static_cast(left * right)); + EXPECT_EQ(mean.size(), static_cast(left)); + EXPECT_EQ(var.size(), static_cast(left)); + EXPECT_EQ(scale.size(), static_cast(right)); + EXPECT_EQ(bias.size(), static_cast(right)); + + const T* scale_data = scale.data(); + const T* bias_data = bias.data(); + T* x_data = x.data(); + T* mean_data = mean.data(); + T* var_data = var.data(); + T* outref_data = outref.data(); + T* outtgt_data = outtgt.data(); + tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data, + left, epsilon, right); + ExpectEQ(outtgt_data, outref_data, left * right); + }; + TestAllImpls(right, verifier, x, outref, mean, + var, scale, bias, left, epsilon, + right); + } + } + } +} + +template +void TestKernelCRFDecoding() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + constexpr int state_trans_base_idx = 2; + auto test_sizes = TestSizes(); + test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000)); + for (int seq_len : {1, 11, 17, 50}) { + for (int tag_num : test_sizes) { + auto ref = jit::GetRefer(); + EXPECT_TRUE(ref != nullptr); + int x_sz = seq_len * tag_num; + int w_sz = (tag_num + state_trans_base_idx) * tag_num; + std::vector x(x_sz), w(w_sz), alpharef(x_sz); + std::vector trackref(x_sz); + RandomVec(x_sz, x.data()); + RandomVec(w_sz, w.data()); + + ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(), + trackref.data(), tag_num); + + auto verifier = []( + const typename KernelTuple::func_type tgt, const int& seq_len, + const std::vector& x, const std::vector& w, + const std::vector& alpharef, const std::vector& trackref, + const typename KernelTuple::attr_type& tag_num) { + constexpr int state_trans_base_idx = 2; + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(x.size(), static_cast(seq_len * tag_num)); + EXPECT_EQ(w.size(), static_cast( + (tag_num + state_trans_base_idx) * tag_num)); + EXPECT_EQ(alpharef.size(), static_cast(seq_len * tag_num)); + EXPECT_EQ(trackref.size(), static_cast(seq_len * tag_num)); + std::vector alphatgt(alpharef.size()); + std::vector tracktgt(trackref.size()); + memcpy(tracktgt.data(), trackref.data(), tag_num * sizeof(int)); + tgt(seq_len, (const T*)x.data(), (const T*)w.data(), alphatgt.data(), + tracktgt.data(), tag_num); + ExpectEQ(alpharef.data(), alphatgt.data(), seq_len * tag_num); + ExpectEQ(trackref.data(), tracktgt.data(), seq_len * tag_num); + }; + TestAllImpls(tag_num, verifier, seq_len, x, w, + alpharef, trackref, tag_num); + } + } +} + +template +void TestKernelSeqPool() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); std::vector pool_types = { jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; auto test_sizes = TestSizes(); @@ -668,7 +606,7 @@ void TestKernelSeqPoolTuples() { jit::seq_pool_attr_t attr(w, type); for (int h : test_sizes) { attr.h = h; - auto ref = jit::GetRefer>(); + auto ref = jit::GetRefer(); EXPECT_TRUE(ref != nullptr); std::vector x(h * w), yref(w); RandomVec(h * w, x.data()); @@ -676,16 +614,86 @@ void TestKernelSeqPoolTuples() { T* yref_data = yref.data(); ref(x_data, yref_data, &attr); VLOG(10) << attr; - TestAllImpls, PlaceType, std::vector, - std::vector>(attr, x, yref, attr); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& yref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(x.size() % yref.size(), static_cast(0)); + int w = yref.size(); + std::vector y(w); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + T* y_data = y.data(); + tgt(x_data, y_data, &attr); + ExpectEQ(y_data, yref_data, w); + }; + TestAllImpls(attr, verifier, x, yref, attr); + } + } + } +} + +template +void TestKernelEmbSeqPool() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + int64_t tbl_h = 1e4; + std::vector pool_types = { + jit::SeqPoolType::kSum}; // only support sum yet + auto test_sizes = TestSizes(); + test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); + for (int tbl_w : test_sizes) { + std::vector table(tbl_h * tbl_w); + RandomVec(tbl_h * tbl_w, table.data()); + const T* table_data = table.data(); + for (auto type : pool_types) { + for (int idx_w : {1, 2, 10, 16}) { + for (int idx_h : {1, 2, 9, 13, 16}) { + auto ref = jit::GetRefer(); + EXPECT_TRUE(ref != nullptr); + std::vector idx(idx_h * idx_w); + RandomVec(idx_h * idx_w, idx.data(), 0, tbl_h - 1); + int64_t out_w = tbl_w * idx_w; + std::vector oref(out_w); + const int64_t* idx_data = idx.data(); + T* o_data = oref.data(); + jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w, + type); + ref(table_data, idx_data, o_data, &attr); + + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& table, + const std::vector& idx, + const std::vector& oref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(table.size(), static_cast(attr.table_height * + attr.table_width)); + EXPECT_EQ(idx.size(), static_cast(attr.index_height * + attr.index_width)); + EXPECT_EQ(oref.size(), + static_cast(attr.table_width * attr.index_width)); + const T* table_data = table.data(); + const int64_t* idx_data = idx.data(); + const T* oref_data = oref.data(); + int o_w = oref.size(); + std::vector out(o_w); + T* o_data = out.data(); + tgt(table_data, idx_data, o_data, &attr); + ExpectEQ(o_data, oref_data, o_w); + }; + TestAllImpls(attr, verifier, table, idx, oref, + attr); + } } } } } -template -void TestKernelMatMulTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelMatMul() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); auto last_acc = FLAGS_acc; // export MKL_CBWR=AVX would make MKL force to use AVX // export KMP_DETERMINISTIC_REDUCTION=yes would make the result deterministic @@ -693,7 +701,7 @@ void TestKernelMatMulTuples() { for (int m : {1, 2, 3, 4}) { for (int n : {1, 2, 3, 4}) { for (int k : TestSizes()) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetRefer(); EXPECT_TRUE(ref != nullptr); std::vector a(m * k), b(k * n), c(m * n); RandomVec(m * k, a.data()); @@ -703,20 +711,36 @@ void TestKernelMatMulTuples() { T* c_data = c.data(); const jit::matmul_attr_t attr{m, n, k}; ref(a_data, b_data, c_data, &attr); - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector>(attr, a, b, c, attr); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& a, const std::vector& b, + const std::vector& cref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(a.size(), static_cast(attr.m * attr.k)); + EXPECT_EQ(b.size(), static_cast(attr.k * attr.n)); + EXPECT_EQ(cref.size(), static_cast(attr.m * attr.n)); + std::vector c(cref.size()); + const T* a_data = a.data(); + const T* b_data = b.data(); + const T* cref_data = cref.data(); + T* c_data = c.data(); + tgt(a_data, b_data, c_data, &attr); + ExpectEQ(c_data, cref_data, attr.m * attr.n); + }; + TestAllImpls(attr, verifier, a, b, c, attr); } } } FLAGS_acc = last_acc; } -template -void TestKernelSoftmaxTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelSoftmax() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int bs : {1, 2, 10}) { for (int n : TestSizes()) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetRefer(); EXPECT_TRUE(ref != nullptr); std::vector x(bs * n), y(bs * n); RandomVec(bs * n, x.data()); @@ -730,51 +754,33 @@ void TestKernelSoftmaxTuples() { ref(xinp_data, xinp_data, n, bs); ExpectEQ(xinp_data, y_data, n * bs); - TestAllImpls, PlaceType, std::vector, - std::vector>(n, x, y, n, bs); - } - } -} - -template -void TestKernelEmbSeqPoolTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - int64_t tbl_h = 1e4; - std::vector pool_types = { - jit::SeqPoolType::kSum}; // only support sum yet - auto test_sizes = TestSizes(); - test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); - for (int tbl_w : test_sizes) { - std::vector table(tbl_h * tbl_w); - RandomVec(tbl_h * tbl_w, table.data()); - const T* table_data = table.data(); - for (auto type : pool_types) { - for (int idx_w : {1, 2, 10, 16}) { - for (int idx_h : {1, 2, 9, 13, 16}) { - auto ref = jit::GetRefer>(); - EXPECT_TRUE(ref != nullptr); - std::vector idx(idx_h * idx_w); - RandomVec(idx_h * idx_w, idx.data(), 0, tbl_h - 1); - int64_t out_w = tbl_w * idx_w; - std::vector oref(out_w); - const int64_t* idx_data = idx.data(); - T* o_data = oref.data(); - jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w, - type); - ref(table_data, idx_data, o_data, &attr); - - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector>(attr, table, idx, - oref, attr); - } - } + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& yref, + int n, int bs) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + EXPECT_EQ(x.size(), static_cast(n * bs)); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + std::vector ytgt(n * bs); + T* ytgt_data = ytgt.data(); + // test normal + tgt(x_data, ytgt_data, n, bs); + ExpectEQ(ytgt_data, yref_data, n * bs); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(ytgt_data, ytgt_data, n, bs); + ExpectEQ(ytgt_data, yref_data, n * bs); + }; + TestAllImpls(n, verifier, x, y, n, bs); } } } -template -void TestKernelSgdTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelSgd() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); const T lr = 0.1; auto UnDuplicatedRandomVec = [](int n, const int64_t lower, const int64_t upper) -> std::vector { @@ -802,7 +808,7 @@ void TestKernelSgdTuples() { RandomVec(rows_size * grad_w, grad.data()); const int64_t* rows_data = rows.data(); const T* grad_data = grad.data(); - auto ref = jit::GetRefer>(); + auto ref = jit::GetRefer(); EXPECT_TRUE(ref != nullptr); jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size); ref(&lr, param_data, grad_data, rows_data, out_data, &attr); @@ -818,199 +824,150 @@ void TestKernelSgdTuples() { grad_w); } - TestAllImpls, PlaceType, T, std::vector, - std::vector, std::vector, std::vector>( - attr, lr, param, grad, rows, param_out, attr); - } - } - } -} - -template -void TestKernelNCHW16CMulNCTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - const int n = 3, c = 16 * 4, h = 10, w = 10; - auto ref = jit::GetRefer>(); - EXPECT_TRUE(ref != nullptr); - int sz = n * c * h * w; - std::vector x(sz), y(n * c), zref(sz); - std::vector ztgt(sz), zjit(sz); - RandomVec(sz, x.data()); - RandomVec(n * c, y.data()); - - const T* x_data = x.data(); - const T* y_data = y.data(); - T* zref_data = zref.data(); - T* ztgt_data = ztgt.data(); - T* zjit_data = zjit.data(); - constexpr int simd_width = ZMM_FLOAT_BLOCK; - int C = c / simd_width; - auto tgt = - jit::KernelFuncs, PlaceType>::Cache().At( - 0); - auto jitcode = jit::GetJitCode, PlaceType>(0); - EXPECT_TRUE(tgt != nullptr); - - if (std::is_same::value && - paddle::platform::MayIUse(paddle::platform::avx512f)) { - EXPECT_TRUE(jitcode != nullptr); - } - for (int ni = 0; ni < n; ni++) { - for (int ci = 0; ci < C; ci++) { - auto ptr_x = - x_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - auto ptr_y = y_data + ni * C * simd_width + ci * simd_width; - auto ptr_zref = - zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - auto ptr_ztgt = - ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - - ref(ptr_x, ptr_y, ptr_zref, h, w); - tgt(ptr_x, ptr_y, ptr_ztgt, h, w); - - if (jitcode) { - auto ptr_zjit = - zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - jitcode(ptr_x, ptr_y, ptr_zjit, h, w); - } - } - } - ExpectEQ(ztgt_data, zref_data, sz); - if (jitcode) { - ExpectEQ(zjit_data, zref_data, sz); - } -} - -template -void TestKernelLayerNormTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - const T epsilon = 9.99999975e-06; - for (int n : {1, 2, 10}) { - for (int x_dim_0 : {1, 9, 17, 50}) { - int left = n * x_dim_0; - for (int x_dim_1 : TestSizes()) { - int right = x_dim_1; - auto ref = jit::GetRefer>(); - EXPECT_TRUE(ref != nullptr); - int sz = left * right; - std::vector x(sz), mean(left), var(left), scale(right), bias(right), - outref(sz); - RandomVec(sz, x.data()); - RandomVec(left, mean.data()); - RandomVec(left, var.data()); - RandomVec(right, scale.data()); - RandomVec(right, bias.data()); - - const T* scale_data = scale.data(); - const T* bias_data = bias.data(); - T* x_data = x.data(); - T* mean_data = mean.data(); - T* var_data = var.data(); - T* outref_data = outref.data(); - - ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data, - left, epsilon, right); + auto verifier = []( + const typename KernelTuple::func_type tgt, const T lr, + const std::vector& param, const std::vector& grad, + const std::vector& rows, const std::vector& oref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(param.size(), + static_cast(attr.param_height * attr.param_width)); + EXPECT_EQ(grad.size(), + static_cast(attr.grad_height * attr.grad_width)); + EXPECT_EQ(rows.size(), static_cast(attr.selected_rows_size)); + EXPECT_EQ(param.size(), oref.size()); + const T* param_data = param.data(); + const T* grad_data = grad.data(); + const int64_t* rows_data = rows.data(); + const T* oref_data = oref.data(); + + std::vector out(oref.size()); + T* o_data = out.data(); + tgt(&lr, param_data, grad_data, rows_data, o_data, &attr); + // only the selected rows should be equal + for (size_t i = 0; i < rows.size(); ++i) { + ExpectEQ(o_data + rows[i] * attr.grad_width, + oref_data + rows[i] * attr.grad_width, attr.grad_width); + } - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector, std::vector, - std::vector, std::vector, int, float>( - right, x, outref, mean, var, scale, bias, left, epsilon, right); + // inplace + std::copy(param.begin(), param.end(), out.begin()); + tgt(&lr, o_data, grad_data, rows_data, o_data, &attr); + for (size_t i = 0; i < rows.size(); ++i) { + ExpectEQ(o_data + rows[i] * attr.grad_width, + oref_data + rows[i] * attr.grad_width, attr.grad_width); + } + }; + TestAllImpls(attr, verifier, lr, param, grad, + rows, param_out, attr); } } } } -template -void TestKernelCRFDecodingTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - constexpr int state_trans_base_idx = 2; - auto test_sizes = TestSizes(); - test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000)); - for (int seq_len : {1, 11, 17, 50}) { - for (int tag_num : test_sizes) { - auto ref = jit::GetRefer>(); - EXPECT_TRUE(ref != nullptr); - int x_sz = seq_len * tag_num; - int w_sz = (tag_num + state_trans_base_idx) * tag_num; - std::vector x(x_sz), w(w_sz), alpharef(x_sz); - std::vector trackref(x_sz); - RandomVec(x_sz, x.data()); - RandomVec(w_sz, w.data()); - - ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(), - trackref.data(), tag_num); - - TestAllImpls, PlaceType, int, - std::vector, std::vector, std::vector, - std::vector, int>(tag_num, seq_len, x, w, alpharef, - trackref, tag_num); - } - } -} - -template -void TestKernelVBroadcastTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelVBroadcast() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int w : TestSizes()) { std::vector x(w); RandomVec(w, x.data()); const T* x_data = x.data(); for (int64_t h : {1, 2, 6}) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetRefer(); EXPECT_TRUE(ref != nullptr); std::vector y(w * h); T* y_data = y.data(); ref(x_data, y_data, h, w); - TestAllImpls, PlaceType, std::vector, - std::vector, int64_t>(static_cast(w), x, y, h, - static_cast(w)); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& yref, + const int64_t& h, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(x.size(), static_cast(attr)); + EXPECT_EQ(yref.size(), x.size() * h); + std::vector y(yref.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + T* y_data = y.data(); + tgt(x_data, y_data, h, attr); + ExpectEQ(y_data, yref_data, yref.size()); + }; + TestAllImpls(static_cast(w), verifier, x, + y, h, static_cast(w)); } } } -#define TEST_CPU_KERNEL(test_tuple, kernel_type) \ - TEST(JITKernel, kernel_type) { \ - TestKernel##test_tuple(); \ - TestKernel##test_tuple(); \ +#define TestKernelVMul TestKernelXYZN +#define TestKernelVAdd TestKernelXYZN +#define TestKernelVAddRelu TestKernelXYZN +#define TestKernelVSub TestKernelXYZN + +#define TestKernelVScal TestKernelAXYN +#define TestKernelVAddBias TestKernelAXYN + +#define TestKernelVRelu TestKernelXYN +#define TestKernelVIdentity TestKernelXYN +#define TestKernelVSquare TestKernelXYN +#define TestKernelVExp TestKernelXYN +#define TestKernelVSigmoid TestKernelXYN +#define TestKernelVTanh TestKernelXYN +#define TestKernelVCopy TestKernelXYN + +#define TestKernelHMax TestKernelXRN +#define TestKernelHSum TestKernelXRN + +#define TestKernelLSTMCtHt TestKernelLSTM +#define TestKernelLSTMC1H1 TestKernelLSTM + +#define TestKernelGRUH1 TestKernelGRU +#define TestKernelGRUHtPart1 TestKernelGRU +#define TestKernelGRUHtPart2 TestKernelGRU + +#define TEST_CPU_KERNEL(kernel_type) \ + TEST(JITKernel, kernel_type) { \ + TestKernel##kernel_type, CPUPlace>(); \ + TestKernel##kernel_type, CPUPlace>(); \ } -TEST_CPU_KERNEL(XYZNTuples, kVMul); -TEST_CPU_KERNEL(XYZNTuples, kVAdd); -TEST_CPU_KERNEL(XYZNTuples, kVAddRelu); -TEST_CPU_KERNEL(XYZNTuples, kVSub); +TEST_CPU_KERNEL(VMul); +TEST_CPU_KERNEL(VAdd); +TEST_CPU_KERNEL(VAddRelu); +TEST_CPU_KERNEL(VSub); -TEST_CPU_KERNEL(AXYNTuples, kVScal); -TEST_CPU_KERNEL(AXYNTuples, kVAddBias); +TEST_CPU_KERNEL(VScal); +TEST_CPU_KERNEL(VAddBias); -TEST_CPU_KERNEL(XRNTuples, kHMax); -TEST_CPU_KERNEL(XRNTuples, kHSum); +TEST_CPU_KERNEL(VRelu); +TEST_CPU_KERNEL(VIdentity); +TEST_CPU_KERNEL(VSquare); +TEST_CPU_KERNEL(VExp); +TEST_CPU_KERNEL(VSigmoid); +TEST_CPU_KERNEL(VTanh); +TEST_CPU_KERNEL(VCopy); -TEST_CPU_KERNEL(XYNTuples, kVRelu); -TEST_CPU_KERNEL(XYNTuples, kVIdentity); -TEST_CPU_KERNEL(XYNTuples, kVSquare); -TEST_CPU_KERNEL(XYNTuples, kVExp); -TEST_CPU_KERNEL(XYNTuples, kVSigmoid); -TEST_CPU_KERNEL(XYNTuples, kVTanh); -TEST_CPU_KERNEL(XYNTuples, kVCopy); +TEST_CPU_KERNEL(HMax); +TEST_CPU_KERNEL(HSum); -TEST_CPU_KERNEL(LSTMTuples, kLSTMCtHt); -TEST_CPU_KERNEL(LSTMTuples, kLSTMC1H1); +TEST_CPU_KERNEL(LSTMCtHt); +TEST_CPU_KERNEL(LSTMC1H1); -TEST_CPU_KERNEL(GRUTuples, kGRUH1); -TEST_CPU_KERNEL(GRUTuples, kGRUHtPart1); -TEST_CPU_KERNEL(GRUTuples, kGRUHtPart2); +TEST_CPU_KERNEL(GRUH1); +TEST_CPU_KERNEL(GRUHtPart1); +TEST_CPU_KERNEL(GRUHtPart2); -TEST_CPU_KERNEL(NCHW16CMulNCTuples, kNCHW16CMulNC); +TEST_CPU_KERNEL(NCHW16CMulNC); +TEST_CPU_KERNEL(LayerNorm); +TEST_CPU_KERNEL(CRFDecoding); -TEST_CPU_KERNEL(SeqPoolTuples, kSeqPool); -TEST_CPU_KERNEL(MatMulTuples, kMatMul); -TEST_CPU_KERNEL(SoftmaxTuples, kSoftmax); -TEST_CPU_KERNEL(EmbSeqPoolTuples, kEmbSeqPool); -TEST_CPU_KERNEL(SgdTuples, kSgd); -TEST_CPU_KERNEL(LayerNormTuples, kLayerNorm); -TEST_CPU_KERNEL(CRFDecodingTuples, kCRFDecoding); -TEST_CPU_KERNEL(VBroadcastTuples, kVBroadcast); +TEST_CPU_KERNEL(SeqPool); +TEST_CPU_KERNEL(EmbSeqPool); +TEST_CPU_KERNEL(MatMul); +TEST_CPU_KERNEL(Softmax); +TEST_CPU_KERNEL(Sgd); +TEST_CPU_KERNEL(VBroadcast); TEST(JITKernel_key, lstm) { jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); @@ -1045,16 +1002,9 @@ TEST(JITKernel_key, gru) { } TEST(JITKernel, kernel_func) { - auto f1 = - jit::KernelFuncs, CPUPlace>::Cache() - .At(3); - auto f2 = jit::KernelFuncs, - CPUPlace>::Cache()[3]; + auto f1 = jit::KernelFuncs, CPUPlace>::Cache().At(3); + auto f2 = jit::KernelFuncs, CPUPlace>::Cache()[3]; + EXPECT_TRUE(f1 != nullptr); EXPECT_TRUE(f1 == f2); - - f1 = jit::KernelFuncs, CPUPlace>::Cache() - .At(3); - f2 = jit::KernelFuncs, CPUPlace>::Cache() - .At(4); - EXPECT_TRUE(f1 != f2); + // TODO(TJ): check not equal } diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h index f0c3064d41..8627c83b43 100644 --- a/paddle/fluid/operators/layer_norm_op.h +++ b/paddle/fluid/operators/layer_norm_op.h @@ -229,9 +229,9 @@ class LayerNormKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(scale->numel(), right); PADDLE_ENFORCE_EQ(bias->numel(), right); - auto ker = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(right); + auto ker = + jit::KernelFuncs, platform::CPUPlace>::Cache() + .At(right); ker(x.data(), out.data(), mean->data(), var->data(), scale->data(), bias->data(), static_cast(left), static_cast(epsilon), right); diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h index 0ad57c51be..66ce57594a 100644 --- a/paddle/fluid/operators/math/fc_compute.h +++ b/paddle/fluid/operators/math/fc_compute.h @@ -30,17 +30,16 @@ inline void FCCompute(const BlasT& blas, const int M, return; } if (relu) { - auto compute = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(N); + auto compute = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + N); for (int i = 0; i < M; i++) { T* dst = Y + i * N; compute(B, dst, dst, N); } } else { - auto compute = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(N); + auto compute = + jit::KernelFuncs, platform::CPUPlace>::Cache().At(N); #ifdef PADDLE_WITH_MKLML #pragma omp parallel for #endif diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index db103e5fab..7af44f2b2c 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -255,9 +255,9 @@ class SequencePoolFunctor { jit::seq_pool_attr_t attr( static_cast(input.numel() / input.dims()[0]), jit::SeqPoolType::kSum); - auto seqpool = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(attr); + auto seqpool = + jit::KernelFuncs, platform::CPUPlace>::Cache() + .At(attr); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { attr.h = static_cast(lod[i + 1] - lod[i]); seqpool(src, dst, &attr); diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index a1cb3f9728..d77b6712c5 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -82,8 +82,7 @@ class SoftmaxFunctor> { const int kClassDim = 1; // 2D data. Batch x C auto compute_softmax = - jit::KernelFuncs, - platform::CPUPlace>::Cache() + jit::KernelFuncs, platform::CPUPlace>::Cache() .At(in_dims[kClassDim]); compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]); } diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index 0425a3d194..5dd5f67e00 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -47,9 +47,9 @@ class SGDOpKernel : public framework::OpKernel { int64_t rows_idx = 0; T *out_data = param_out->mutable_data(ctx.GetPlace()); - auto sgd = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(attr); + auto sgd = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr); } else if (grad_var->IsType()) { // TODO(qijun): In Sparse SGD operator, in-place update is enforced. @@ -82,9 +82,9 @@ class SGDOpKernel : public framework::OpKernel { attr.selected_rows_size = grad_rows.size(); PADDLE_ENFORCE_EQ(attr.grad_width, attr.param_width); - auto sgd = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(attr); + auto sgd = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); sgd(lr, param_data, grad_data, rows_data, out_data, &attr); } else { PADDLE_THROW("Unsupported Variable Type of Grad"); From 6bb84b74b2b67dce9b4e0b397088da0cb7c3c960 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Fri, 8 Mar 2019 16:24:43 +0800 Subject: [PATCH 029/230] Change the download and compress command of cmake. test=develop --- paddle/fluid/inference/tests/test.cmake | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index 6c5fe043ff..5ceb630976 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -30,13 +30,14 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME) ${EXTERNAL_PROJECT_NAME} ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${INSTALL_DIR} - URL ${URL}/${FILENAME} + DOWNLOAD_COMMAND wget -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} && + ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME} DOWNLOAD_DIR ${INSTALL_DIR} DOWNLOAD_NO_PROGRESS 1 CONFIGURE_COMMAND "" BUILD_COMMAND "" UPDATE_COMMAND "" - INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ${UNPACK_DIR} ${INSTALL_DIR} + INSTALL_COMMAND "" ) endfunction() From aeee4cbe7149a08bf19932ed8a657d931153ca4c Mon Sep 17 00:00:00 2001 From: luotao1 Date: Fri, 8 Mar 2019 18:48:50 +0800 Subject: [PATCH 030/230] add compare between zerocopy and analysis --- .../tests/api/analyzer_pyramid_dnn_tester.cc | 8 ++- .../fluid/inference/tests/api/tester_helper.h | 60 +++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc index df834e75df..5ba553aad6 100644 --- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc @@ -134,7 +134,7 @@ TEST(Analyzer_Pyramid_DNN, profile) { TestPrediction(reinterpret_cast(&cfg), input_slots_all, &outputs, FLAGS_num_threads); - if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data && !FLAGS_zero_copy) { PADDLE_ENFORCE_EQ(outputs.size(), 1UL); size_t size = GetSize(outputs[0]); PADDLE_ENFORCE_GT(size, 0); @@ -167,6 +167,12 @@ TEST(Analyzer_Pyramid_DNN, compare) { SetInput(&input_slots_all); CompareNativeAndAnalysis( reinterpret_cast(&cfg), input_slots_all); + + // Compare AnalysisConfig and AnalysisConfig + ZeroCopy + std::vector outputs_name; + outputs_name.emplace_back("cos_sim_2.tmp_0"); + CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), + input_slots_all, outputs_name); } // Compare Deterministic result diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 3becb4bf68..9a843e8d02 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -69,6 +69,7 @@ void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { LOG(INFO) << analysis_config->ToNativeConfig(); } +// Compare result between two PaddleTensor void CompareResult(const std::vector &outputs, const std::vector &ref_outputs) { EXPECT_GT(outputs.size(), 0UL); @@ -102,6 +103,41 @@ void CompareResult(const std::vector &outputs, } } +// Compare result between a PaddleTensor and a ZeroCopyTensor +void CompareResult(const std::vector &outputs, + const std::vector &ref_outputs) { + EXPECT_GT(outputs.size(), 0UL); + EXPECT_EQ(outputs.size(), ref_outputs.size()); + for (size_t i = 0; i < outputs.size(); i++) { + auto &out = outputs[i]; + auto &ref_out = ref_outputs[i]; + size_t size = VecReduceToInt(out.shape); + EXPECT_GT(size, 0UL); + int ref_size = 0; // this is the number of elements not memory size + PaddlePlace place; + switch (out.dtype) { + case PaddleDType::INT64: { + int64_t *pdata = static_cast(out.data.data()); + int64_t *pdata_ref = ref_out.data(&place, &ref_size); + EXPECT_EQ(size, ref_size); + for (size_t j = 0; j < size; ++j) { + EXPECT_EQ(pdata_ref[j], pdata[j]); + } + break; + } + case PaddleDType::FLOAT32: { + float *pdata = static_cast(out.data.data()); + float *pdata_ref = ref_out.data(&place, &ref_size); + EXPECT_EQ(size, ref_size); + for (size_t j = 0; j < size; ++j) { + CHECK_LE(std::abs(pdata_ref[j] - pdata[j]), FLAGS_accuracy); + } + break; + } + } + } +} + std::unique_ptr CreateTestPredictor( const PaddlePredictor::Config *config, bool use_analysis = true) { const auto *analysis_config = @@ -377,6 +413,30 @@ void CompareNativeAndAnalysis( CompareResult(analysis_outputs, native_outputs); } +void CompareAnalysisAndZeroCopy( + PaddlePredictor::Config *config, + const std::vector> &inputs, + const std::vector &outputs_name) { + int batch_size = FLAGS_batch_size; + // analysis + std::vector analysis_outputs; + auto predictor = CreateTestPredictor(config, true); + predictor->Run(inputs[0], &analysis_outputs, batch_size); + // analysis + zero_copy + std::vector zerocopy_outputs; + reinterpret_cast(config)->SwitchUseFeedFetchOps(false); + predictor = CreateTestPredictor(config, true); + ConvertPaddleTensorToZeroCopyTensor(predictor.get(), inputs[0]); + predictor->ZeroCopyRun(); + for (size_t i = 0; i < outputs_name.size(); i++) { + ZeroCopyTensor zerocopy_output = + *predictor->GetOutputTensor(outputs_name[i]).get(); + zerocopy_outputs.emplace_back(zerocopy_output); + } + // compare + CompareResult(analysis_outputs, zerocopy_outputs); +} + template std::string LoDTensorSummary(const framework::LoDTensor &tensor) { std::stringstream ss; From 837ad7f86fe5de124b2d3a6c4933747fa30d92ea Mon Sep 17 00:00:00 2001 From: tink2123 Date: Sun, 10 Mar 2019 09:42:39 +0000 Subject: [PATCH 031/230] Add the inverse trigonometric function test=develop --- paddle/fluid/API.spec | 3 + paddle/fluid/operators/activation_op.cc | 27 +++++ paddle/fluid/operators/activation_op.h | 103 +++++++++++++++++- python/paddle/fluid/layers/ops.py | 3 + .../tests/unittests/test_activation_op.py | 54 +++++++++ 5 files changed, 187 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index a4e683da0b..00bd23b1fe 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -292,6 +292,7 @@ paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords= paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3')) paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b')) paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9d586a0b5bd05f67ee78048f9d503b6')) +paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7ca26a8235099486bdf243754439c6b6')) paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7')) paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13')) paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '072a8541e0f632366bba10f67cb0db27')) @@ -299,7 +300,9 @@ paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad')) paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973')) paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '485f2686bcc2fe37a4bd893769c8a3e2')) +paddle.fluid.layers.acos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c721122352acfc1853bffadf2d59103b')) paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '01f1766aa76eff1df30147505b59f7c4')) +paddle.fluid.layers.asin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0619b891e80f419b28016cde3d106c68')) paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b47f5da13913d3e56bdb1e612a73f3f2')) paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cc6ac2f14f03c52aaa83a59bf83b8d26')) paddle.fluid.layers.square (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '48dfb45d773dbc30126c3a7f777de5ee')) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 2feb8e4c47..cc948f2697 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -269,6 +269,27 @@ $$out = \\frac{x}{1 + \|x\|}$$ )DOC"; +UNUSED constexpr char AcosDoc[] = R"DOC( +Arccosine Activation Operator. + +$${out}_{i} = \cos^{-1}({input}_{i})$$ + +)DOC"; + +UNUSED constexpr char AsinDoc[] = R"DOC( +Arcsine Activation Operator. + +$out = \sin^{-1}({input}_{i})$ + +)DOC"; + +UNUSED constexpr char AtanDoc[] = R"DOC( +Arctanh Activation Operator. + +$out = \tanh^{-1}({input}_{i})$ + +)DOC"; + class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -494,13 +515,16 @@ REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc); REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc); REGISTER_ACTIVATION_OP_MAKER(Gelu, GeluDoc); REGISTER_ACTIVATION_OP_MAKER(Tanh, TanhDoc); +REGISTER_ACTIVATION_OP_MAKER(Atan, AtanDoc); REGISTER_ACTIVATION_OP_MAKER(TanhShrink, TanhShrinkDoc); REGISTER_ACTIVATION_OP_MAKER(Sqrt, SqrtDoc); REGISTER_ACTIVATION_OP_MAKER(Abs, AbsDoc); REGISTER_ACTIVATION_OP_MAKER(Ceil, CeilDoc); REGISTER_ACTIVATION_OP_MAKER(Floor, FloorDoc); REGISTER_ACTIVATION_OP_MAKER(Cos, CosDoc); +REGISTER_ACTIVATION_OP_MAKER(Acos, AcosDoc); REGISTER_ACTIVATION_OP_MAKER(Sin, SinDoc); +REGISTER_ACTIVATION_OP_MAKER(Asin, AsinDoc); REGISTER_ACTIVATION_OP_MAKER(Round, RoundDoc); REGISTER_ACTIVATION_OP_MAKER(Reciprocal, ReciprocalDoc); REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc); @@ -543,7 +567,10 @@ namespace ops = paddle::operators; __macro(SoftShrink, softshrink); \ __macro(Abs, abs); \ __macro(Cos, cos); \ + __macro(Acos, acos); \ __macro(Sin, sin); \ + __macro(Asin, asin); \ + __macro(Atan, atan); \ __macro(Round, round); \ __macro(Log, log); \ __macro(Square, square); \ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 1f5ae7fb5c..ff7e623f6f 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -39,9 +39,8 @@ namespace operators { Please refer to the layer_helper.py and get the details. */ static std::unordered_set InplaceOpSet = { - "sigmoid", "exp", "relu", "tanh", "sqrt", "ceil", - "floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid", -}; + "sigmoid", "exp", "relu", "tanh", "sqrt", "ceil", + "floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid"}; static bool IsInplace(const std::string& op) { bool inplace = InplaceOpSet.count(op); @@ -553,6 +552,101 @@ struct SinFunctor : public BaseActivationFunctor { } }; +template +struct Acos { + HOSTDEVICE T operator()(const T& val) const { return acos(val); } +}; + +template <> +struct Acos { + HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { + return platform::float16(acos(static_cast(val))); + } +}; + +// Acos(x) = acos(x) +template +struct AcosFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Acos()); + } +}; + +// acos'(x) = -1/sqrt(1-x^2) +template +struct AcosGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + -dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); + } +}; + +template +struct Asin { + HOSTDEVICE T operator()(const T& val) const { return asin(val); } +}; + +template <> +struct Asin { + HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { + return platform::float16(asin(static_cast(val))); + } +}; + +// Asin(x) = asin(x) +template +struct AsinFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Asin()); + } +}; + +// asin'(x) = 1/sqrt(1-x^2) +template +struct AsinGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); + } +}; + +template +struct Atan { + HOSTDEVICE T operator()(const T& val) const { return atan(val); } +}; + +template <> +struct Atan { + HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { + return platform::float16(atan(static_cast(val))); + } +}; + +// Atan(x) = atan(x) +template +struct AtanFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Atan()); + } +}; + +// atan'(x) = 1 / (1 + x^2) +template +struct AtanGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(1) / (static_cast(1) + x.square()); + } +}; + // round(x) = [x] template struct RoundFunctor : public BaseActivationFunctor { @@ -1001,13 +1095,16 @@ struct SwishGradFunctor : public BaseActivationFunctor { __macro(relu, ReluFunctor, ReluGradFunctor); \ __macro(gelu, GeluFunctor, GeluGradFunctor); \ __macro(tanh, TanhFunctor, TanhGradFunctor); \ + __macro(atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(sqrt, SqrtFunctor, SqrtGradFunctor); \ __macro(abs, AbsFunctor, AbsGradFunctor); \ __macro(ceil, CeilFunctor, ZeroGradFunctor); \ __macro(floor, FloorFunctor, ZeroGradFunctor); \ __macro(cos, CosFunctor, CosGradFunctor); \ + __macro(acos, AcosFunctor, AcosGradFunctor); \ __macro(sin, SinFunctor, SinGradFunctor); \ + __macro(asin, AsinFunctor, AsinGradFunctor); \ __macro(round, RoundFunctor, ZeroGradFunctor); \ __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(log, LogFunctor, LogGradFunctor); \ diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 4381727a09..f018bb8af8 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -23,6 +23,7 @@ __activations_noattr__ = [ 'logsigmoid', 'exp', 'tanh', + 'atan', 'tanh_shrink', 'softshrink', 'sqrt', @@ -30,6 +31,8 @@ __activations_noattr__ = [ 'ceil', 'floor', 'cos', + 'acos', + 'asin', 'sin', 'round', 'reciprocal', diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index d5a8385409..d587715d60 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -100,6 +100,23 @@ class TestTanh(TestActivation): self.check_grad(['X'], 'Out', max_relative_error=0.007) +class TestAtan(TestActivation): + def setUp(self): + self.op_type = "atan" + self.init_dtype() + + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.arctan(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad(['X'], 'Out', max_relative_error=0.007) + + class TestTanhShrink(TestActivation): def setUp(self): self.op_type = "tanh_shrink" @@ -248,6 +265,23 @@ class TestCos(TestActivation): self.check_grad(['X'], 'Out', max_relative_error=0.007) +class TestAcos(TestActivation): + def setUp(self): + self.op_type = "acos" + self.init_dtype() + + x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype) + out = np.arccos(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad(['X'], 'Out', max_relative_error=0.007) + + class TestSin(TestActivation): def setUp(self): self.op_type = "sin" @@ -265,6 +299,23 @@ class TestSin(TestActivation): self.check_grad(['X'], 'Out', max_relative_error=0.007) +class TestAsin(TestActivation): + def setUp(self): + self.op_type = "asin" + self.init_dtype() + + x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype) + out = np.arcsin(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad(['X'], 'Out', max_relative_error=0.007) + + class TestRound(TestActivation): def setUp(self): self.op_type = "round" @@ -665,7 +716,10 @@ create_test_act_fp16_class(TestAbs) create_test_act_fp16_class(TestCeil, grad_check=False) create_test_act_fp16_class(TestFloor, grad_check=False) create_test_act_fp16_class(TestCos, grad_atol=0.85) +create_test_act_fp16_class(TestAcos, grad_atol=0.85) create_test_act_fp16_class(TestSin) +create_test_act_fp16_class(TestAsin) +create_test_act_fp16_class(TestAtan) create_test_act_fp16_class(TestRound, grad_check=False) create_test_act_fp16_class(TestRelu) create_test_act_fp16_class(TestGelu) From 45bdd84dac51b6f3fb4315b144f958e6e15c8389 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Sun, 10 Mar 2019 16:26:59 +0000 Subject: [PATCH 032/230] enhance the jitkernel helper and add unit tests test=develop --- paddle/fluid/operators/jit/benchmark.cc | 28 +-- paddle/fluid/operators/jit/gen/act.cc | 14 +- paddle/fluid/operators/jit/gen/blas.cc | 4 +- paddle/fluid/operators/jit/gen/embseqpool.cc | 2 +- paddle/fluid/operators/jit/gen/gru.cc | 2 +- paddle/fluid/operators/jit/gen/hopv.cc | 2 +- paddle/fluid/operators/jit/gen/jitcode.h | 2 +- paddle/fluid/operators/jit/gen/lstm.cc | 2 +- paddle/fluid/operators/jit/gen/matmul.cc | 2 +- paddle/fluid/operators/jit/gen/seqpool.cc | 2 +- paddle/fluid/operators/jit/gen/sgd.cc | 2 +- paddle/fluid/operators/jit/gen/vbroadcast.cc | 2 +- paddle/fluid/operators/jit/gen_base.cc | 2 +- paddle/fluid/operators/jit/gen_base.h | 7 +- paddle/fluid/operators/jit/helper.h | 116 +++++++--- paddle/fluid/operators/jit/kernel_base.h | 7 +- paddle/fluid/operators/jit/kernel_key.cc | 3 + .../jit/more/intrinsic/crf_decoding.cc | 2 +- .../jit/more/intrinsic/crf_decoding.h | 3 +- .../jit/more/intrinsic/layer_norm.cc | 2 +- .../operators/jit/more/intrinsic/layer_norm.h | 3 +- paddle/fluid/operators/jit/more/mix/mix.cc | 16 +- paddle/fluid/operators/jit/more/mix/mix.h | 12 +- paddle/fluid/operators/jit/more/mkl/mkl.cc | 47 ++-- paddle/fluid/operators/jit/more/mkl/mkl.h | 14 +- paddle/fluid/operators/jit/registry.h | 4 +- paddle/fluid/operators/jit/test.cc | 208 ++++++++++++++---- 27 files changed, 328 insertions(+), 182 deletions(-) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 773cf38eb9..fbb04a166e 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -111,33 +111,11 @@ template void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) { BenchFunc benchmark; std::vector> infos; - // test refer - auto refer = jit::GetRefer(); - if (!refer) { - LOG(FATAL) << "Refer can not be empty!"; + auto funcs = jit::GetAllCandidateFuncsWithTypes(attr); + for (auto f : funcs) { + infos.push_back(std::make_pair(f.first, benchmark(f.second, args...))); } - infos.push_back(std::make_pair("Refer", benchmark(refer, args...))); - // test jitcode - auto jitcode = jit::GetJitCode(attr); - if (jitcode) { - infos.push_back(std::make_pair("JitCode", benchmark(jitcode, args...))); - } - // test all impls in more - jit::KernelKey kkey(KernelTuple::kernel_type, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast*>(impl.get()); - if (i && i->UseMe(attr)) { - auto more = i->GetFunc(); - infos.push_back( - std::make_pair(i->ImplType(), benchmark(more, args...))); - } - } - } // Test result from Get function auto tgt = jit::KernelFuncs::Cache().At(attr); if (!tgt) { diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc index e7a7375879..5cac219f95 100644 --- a/paddle/fluid/operators/jit/gen/act.cc +++ b/paddle/fluid/operators/jit/gen/act.cc @@ -81,7 +81,7 @@ void VActJitCode::genCode() { #define DECLARE_ACT_CREATOR(name) \ class name##Creator : public JitCodeCreator { \ public: \ - bool UseMe(const int& attr) const override; \ + bool CanBeUsed(const int& attr) const override; \ size_t CodeSize(const int& d) const override; \ std::unique_ptr CreateJitCode(const int& attr) const override { \ return make_unique(attr, CodeSize(attr)); \ @@ -96,27 +96,27 @@ DECLARE_ACT_CREATOR(VSigmoid); DECLARE_ACT_CREATOR(VTanh); // TODO(TJ): tuning use me -bool VReluCreator::UseMe(const int& d) const { +bool VReluCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx); } -bool VSquareCreator::UseMe(const int& d) const { +bool VSquareCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx); } -bool VIdentityCreator::UseMe(const int& d) const { +bool VIdentityCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx); } -bool VExpCreator::UseMe(const int& d) const { +bool VExpCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx) && d < 32; } -bool VSigmoidCreator::UseMe(const int& d) const { +bool VSigmoidCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx); } -bool VTanhCreator::UseMe(const int& d) const { +bool VTanhCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx); } diff --git a/paddle/fluid/operators/jit/gen/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc index 5da24c359e..e764a7983d 100644 --- a/paddle/fluid/operators/jit/gen/blas.cc +++ b/paddle/fluid/operators/jit/gen/blas.cc @@ -142,7 +142,7 @@ void NCHW16CMulNCJitCode::genCode() { class NCHW16CMulNCCreator : public JitCodeCreator { public: - bool UseMe(const int& attr) const override { + bool CanBeUsed(const int& attr) const override { return platform::MayIUse(platform::avx512f); } size_t CodeSize(const int& d) const override { return 256 * 1024; } @@ -154,7 +154,7 @@ class NCHW16CMulNCCreator : public JitCodeCreator { #define DECLARE_BLAS_CREATOR(name) \ class name##Creator : public JitCodeCreator { \ public: \ - bool UseMe(const int& attr) const override { \ + bool CanBeUsed(const int& attr) const override { \ return platform::MayIUse(platform::avx) && attr <= 1024; \ } \ size_t CodeSize(const int& d) const override { \ diff --git a/paddle/fluid/operators/jit/gen/embseqpool.cc b/paddle/fluid/operators/jit/gen/embseqpool.cc index 23837a3fb9..6e8ecc07e7 100644 --- a/paddle/fluid/operators/jit/gen/embseqpool.cc +++ b/paddle/fluid/operators/jit/gen/embseqpool.cc @@ -121,7 +121,7 @@ void EmbSeqPoolJitCode::genCode() { class EmbSeqPoolCreator : public JitCodeCreator { public: - bool UseMe(const emb_seq_pool_attr_t& attr) const override { + bool CanBeUsed(const emb_seq_pool_attr_t& attr) const override { return platform::MayIUse(platform::avx) && attr.table_width % YMM_FLOAT_BLOCK == 0; } diff --git a/paddle/fluid/operators/jit/gen/gru.cc b/paddle/fluid/operators/jit/gen/gru.cc index 13f7a14111..4bc9247f6f 100644 --- a/paddle/fluid/operators/jit/gen/gru.cc +++ b/paddle/fluid/operators/jit/gen/gru.cc @@ -86,7 +86,7 @@ void GRUJitCode::genCode() { class name##Creator : public JitCodeCreator { \ public: \ /* TODO(TJ): enable more */ \ - bool UseMe(const gru_attr_t& attr) const override { \ + bool CanBeUsed(const gru_attr_t& attr) const override { \ return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \ } \ size_t CodeSize(const gru_attr_t& attr) const override { \ diff --git a/paddle/fluid/operators/jit/gen/hopv.cc b/paddle/fluid/operators/jit/gen/hopv.cc index e788401719..3383f17df8 100644 --- a/paddle/fluid/operators/jit/gen/hopv.cc +++ b/paddle/fluid/operators/jit/gen/hopv.cc @@ -76,7 +76,7 @@ void HOPVJitCode::genCode() { #define DECLARE_HOP_CREATOR(name) \ class name##Creator : public JitCodeCreator { \ public: \ - bool UseMe(const int& attr) const override { \ + bool CanBeUsed(const int& attr) const override { \ return platform::MayIUse(platform::avx); \ } \ size_t CodeSize(const int& d) const override { \ diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h index 39847d1b65..228db7cc72 100644 --- a/paddle/fluid/operators/jit/gen/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h @@ -73,7 +73,7 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator { virtual void genCode() = 0; size_t getSize() const override { return CodeGenerator::getSize(); } - const unsigned char* getCodeInternal() override { + const unsigned char* getCodeInternal() const override { const Xbyak::uint8* code = CodeGenerator::getCode(); return code; } diff --git a/paddle/fluid/operators/jit/gen/lstm.cc b/paddle/fluid/operators/jit/gen/lstm.cc index 08bafb5a81..5e7789aede 100644 --- a/paddle/fluid/operators/jit/gen/lstm.cc +++ b/paddle/fluid/operators/jit/gen/lstm.cc @@ -114,7 +114,7 @@ void LSTMJitCode::genCode() { class name##Creator : public JitCodeCreator { \ public: \ /* TODO(TJ): enable more */ \ - bool UseMe(const lstm_attr_t& attr) const override { \ + bool CanBeUsed(const lstm_attr_t& attr) const override { \ return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \ } \ size_t CodeSize(const lstm_attr_t& attr) const override { \ diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc index ae3858eab2..ca50f26ce5 100644 --- a/paddle/fluid/operators/jit/gen/matmul.cc +++ b/paddle/fluid/operators/jit/gen/matmul.cc @@ -98,7 +98,7 @@ void MatMulJitCode::genCode() { class MatMulCreator : public JitCodeCreator { public: - bool UseMe(const matmul_attr_t& attr) const override { + bool CanBeUsed(const matmul_attr_t& attr) const override { return attr.m == 1 && platform::MayIUse(platform::avx512f) && attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512; } diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc index 530d24ee1f..ceca104cc9 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.cc +++ b/paddle/fluid/operators/jit/gen/seqpool.cc @@ -57,7 +57,7 @@ void SeqPoolJitCode::genCode() { class SeqPoolCreator : public JitCodeCreator { public: - bool UseMe(const seq_pool_attr_t& attr) const override { + bool CanBeUsed(const seq_pool_attr_t& attr) const override { return platform::MayIUse(platform::avx); } size_t CodeSize(const seq_pool_attr_t& attr) const override { diff --git a/paddle/fluid/operators/jit/gen/sgd.cc b/paddle/fluid/operators/jit/gen/sgd.cc index a745a27f95..a40da9b993 100644 --- a/paddle/fluid/operators/jit/gen/sgd.cc +++ b/paddle/fluid/operators/jit/gen/sgd.cc @@ -104,7 +104,7 @@ void SgdJitCode::genCode() { class SgdCreator : public JitCodeCreator { public: - bool UseMe(const sgd_attr_t& attr) const override { + bool CanBeUsed(const sgd_attr_t& attr) const override { return platform::MayIUse(platform::avx) && attr.grad_width % YMM_FLOAT_BLOCK == 0; } diff --git a/paddle/fluid/operators/jit/gen/vbroadcast.cc b/paddle/fluid/operators/jit/gen/vbroadcast.cc index 3f9fbdbd82..66a8d75fd4 100644 --- a/paddle/fluid/operators/jit/gen/vbroadcast.cc +++ b/paddle/fluid/operators/jit/gen/vbroadcast.cc @@ -69,7 +69,7 @@ void VBroadcastJitCode::genCode() { class VBroadcastCreator : public JitCodeCreator { public: - bool UseMe(const int64_t& w) const override { + bool CanBeUsed(const int64_t& w) const override { return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0; } size_t CodeSize(const int64_t& w) const override { diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc index f3603875ad..4c49eff49e 100644 --- a/paddle/fluid/operators/jit/gen_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -31,7 +31,7 @@ namespace paddle { namespace operators { namespace jit { -// refer do not need useme, it would be the last one. +// refer do not need CanBeUsed, it would be the last one. void GenBase::dumpCode(const unsigned char* code) const { if (code) { static int counter = 0; diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index a7c7a35a7e..033c603c07 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -31,9 +31,10 @@ class GenBase : public Kernel { virtual ~GenBase() = default; virtual std::string name() const = 0; virtual size_t getSize() const = 0; - virtual const unsigned char* getCodeInternal() = 0; + virtual const unsigned char* getCodeInternal() const = 0; + const char* ImplType() const override { return "JitCode"; } template - Func getCode() { + Func getCode() const { const unsigned char* code = this->getCodeInternal(); if (FLAGS_dump_jitcode) { this->dumpCode(code); @@ -65,7 +66,7 @@ class JitCodeCreator : public GenCreator { virtual ~JitCodeCreator() = default; // condition when this jit code can be used. - virtual bool UseMe(const Attr& attr) const = 0; + virtual bool CanBeUsed(const Attr& attr) const = 0; // estimate this code size virtual size_t CodeSize(const Attr& attr) const = 0; diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 85f4072dd3..d98eada81c 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -14,9 +14,6 @@ #pragma once -extern "C" { -#include -} #include #include #include @@ -36,31 +33,30 @@ template inline typename std::enable_if< std::is_same::value && std::is_same::value, - typename KernelTuple::func_type>::type + const Kernel*>::type GetJitCode(const typename KernelTuple::attr_type& attr) { - using Func = typename KernelTuple::func_type; using Attr = typename KernelTuple::attr_type; size_t key = JitCodeKey(attr); - auto& codes = JitCodePool().Instance(); + auto& codes = JitCodePool::Instance(); if (codes.Has(key)) { - return codes.AllKernels().at(key)->template getCode(); + return codes.AllKernels().at(key).get(); } // creator is not related with attr, so can use KernelKey as key KernelKey kkey(KernelTuple::kernel_type, PlaceType()); // pool: (KernelKey(type, place), vector) - auto& creator_map = JitCodeCreatorPool().Instance().AllCreators(); + auto& creator_map = JitCodeCreatorPool::Instance().AllCreators(); auto iter = creator_map.find(kkey); if (iter != creator_map.end()) { auto& creators = iter->second; for (auto& cur : creators) { auto i = dynamic_cast*>(cur.get()); - if (i && i->UseMe(attr)) { + if (i && i->CanBeUsed(attr)) { auto p = i->CreateJitCode(attr); if (p) { - auto f = p->template getCode(); + auto res = p.get(); codes.Insert(key, std::move(p)); - return f; + return res; } } } @@ -72,7 +68,7 @@ template inline typename std::enable_if< !std::is_same::value || !std::is_same::value, - typename KernelTuple::func_type>::type + const Kernel*>::type GetJitCode(const typename KernelTuple::attr_type& attr) { return nullptr; } @@ -80,8 +76,8 @@ GetJitCode(const typename KernelTuple::attr_type& attr) { // Refer code do not related with attr, which is just for cast // Refer is always on CPUPlace template -inline typename KernelTuple::func_type GetRefer() { - auto& ref_pool = ReferKernelPool().Instance().AllKernels(); +inline const Kernel* GetReferKernel() { + auto& ref_pool = ReferKernelPool::Instance().AllKernels(); KernelKey kkey(KernelTuple::kernel_type, platform::CPUPlace()); auto ref_iter = ref_pool.find(kkey); PADDLE_ENFORCE(ref_iter != ref_pool.end(), @@ -90,36 +86,93 @@ inline typename KernelTuple::func_type GetRefer() { for (auto& impl : ref_impls) { auto i = dynamic_cast*>(impl.get()); if (i) { - return i->GetFunc(); + return i; } } return nullptr; } -template -typename KernelTuple::func_type Get( +template +inline typename KernelTuple::func_type GetReferFunc() { + auto ker = GetReferKernel(); + auto p = dynamic_cast*>(ker); + PADDLE_ENFORCE(p, "The Refer kernel should exsit"); + return p->GetFunc(); +} + +// Return all Kernels that can be used +template +std::vector GetAllCandidateKernels( const typename KernelTuple::attr_type& attr) { - auto jitfunc = GetJitCode(attr); - if (jitfunc) { - return jitfunc; + // the search order shoudl be jitcode > more > refer + std::vector res; + auto jitker = GetJitCode(attr); + if (jitker) { + res.emplace_back(jitker); } - // pool: (KernelKey(type, place), vector) + // more kernelpool: (KernelKey(type, place), vector) KernelKey kkey(KernelTuple::kernel_type, PlaceType()); - auto& pool = KernelPool().Instance().AllKernels(); + auto& pool = KernelPool::Instance().AllKernels(); auto iter = pool.find(kkey); if (iter != pool.end()) { auto& impls = iter->second; for (auto& impl : impls) { auto i = dynamic_cast*>(impl.get()); - if (i && i->UseMe(attr)) { - return i->GetFunc(); + if (i && i->CanBeUsed(attr)) { + res.emplace_back(i); } } } // The last implementation should be reference function on CPUPlace. - return GetRefer(); + auto ref = GetReferKernel(); + PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty."); + res.emplace_back(ref); + return res; +} + +template +std::vector> +GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) { + using Func = typename KernelTuple::func_type; + auto kers = GetAllCandidateKernels(attr); + std::vector> res; + for (auto k : kers) { + std::string name = k->ImplType(); + if (name == "JitCode") { + auto i = dynamic_cast(k); + PADDLE_ENFORCE(i, "jitcode kernel cast can not fail."); + res.emplace_back(std::make_pair(name, i->template getCode())); + } else { + auto i = dynamic_cast*>(k); + PADDLE_ENFORCE(i, "kernel cast can not fail."); + res.emplace_back(std::make_pair(name, i->GetFunc())); + } + } + return res; +} + +template +std::vector GetAllCandidateFuncs( + const typename KernelTuple::attr_type& attr) { + auto funcs = GetAllCandidateFuncsWithTypes(attr); + std::vector res; + for (auto& i : funcs) { + res.emplace_back(i.second); + } + return res; +} + +template +typename KernelTuple::func_type GetDefaultBestFunc( + const typename KernelTuple::attr_type& attr) { + auto funcs = GetAllCandidateFuncs(attr); + PADDLE_ENFORCE_GE(funcs.size(), 1UL); + // Here could do some runtime benchmark of this attr and return the best one. + // But yet just get the first one as the default best one, + // which is searched in order and tuned by offline. + return funcs[0]; } template @@ -134,17 +187,13 @@ class KernelFuncs { // the exposed interface to use typename KernelTuple::func_type At( const typename KernelTuple::attr_type& attr) { - // XXH64: 13.8 GB/s - // TODO(TJ): change me, maybe not all attr change need one key, should be - // attrkey - int64_t key = XXH64(&attr, sizeof(typename KernelTuple::attr_type), 0); + // Maybe here is not good enough, not all kernels should have jitcode + int64_t key = JitCodeKey(attr); if (Has(key)) { return funcs_.at(key); } - // If do not have this attr in cache, - // then could run some runtime benchmark of this attr and save the best one. - // Here just get the offline benchmarked best one. - auto func = Get(attr); + // If do not have this attr in cache then get the default best + auto func = GetDefaultBestFunc(attr); Insert(key, func); return func; } @@ -156,7 +205,6 @@ class KernelFuncs { protected: bool Has(int64_t key) const { return funcs_.find(key) != funcs_.end(); } - void Insert(int64_t key, typename KernelTuple::func_type func) { funcs_.emplace(key, func); } diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index e8dbcced4f..bd34d7dfc7 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -302,6 +302,7 @@ class Kernel { public: Kernel() = default; virtual ~Kernel() = default; + virtual const char* ImplType() const = 0; DISABLE_COPY_AND_ASSIGN(Kernel); }; @@ -312,8 +313,8 @@ class KernelMore : public Kernel { using Func = typename KernelTuple::func_type; using Attr = typename KernelTuple::attr_type; virtual Func GetFunc() const { return func; } - virtual bool UseMe(const Attr& attr) const = 0; - virtual const char* ImplType() const = 0; + // specify this kernel can be used, means it should not fail if use it. + virtual bool CanBeUsed(const Attr& attr) const = 0; protected: Func func{nullptr}; @@ -323,7 +324,7 @@ template class ReferKernel : public KernelMore { public: // Refer code can always be used - bool UseMe(const typename KernelTuple::attr_type& attr) const override { + bool CanBeUsed(const typename KernelTuple::attr_type& attr) const override { return true; } const char* ImplType() const override { return "Refer"; } diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc index 1c2fddcae7..6987c893de 100644 --- a/paddle/fluid/operators/jit/kernel_key.cc +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/kernel_key.h" +#include #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -49,6 +50,8 @@ static inline int act_type_convert(KernelType type) { template <> size_t JitCodeKey(const lstm_attr_t& attr) { + // XXH64: 13.8 GB/s + size_t key = attr.d; int gate_key = act_type_convert(attr.act_gate) << 1; int cand_key = act_type_convert(attr.act_cand) << (1 + act_type_shift); diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc index 16c91f8246..1254d00189 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc +++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc @@ -161,7 +161,7 @@ void CRFDecoding(const int seq_len, const float* x, const float* w, } } -bool CRFDecodingKernel::UseMe(const int& d) const { +bool CRFDecodingKernel::CanBeUsed(const int& d) const { #ifdef __AVX512F__ constexpr int block = ZMM_FLOAT_BLOCK; #else diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h index f4187bd3ba..49b1a1fea4 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h +++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h @@ -29,7 +29,8 @@ void CRFDecoding(const int seq_len, const float* x, const float* w, class CRFDecodingKernel : public KernelMore> { public: CRFDecodingKernel() { this->func = CRFDecoding; } - bool UseMe(const typename CRFDecodingTuple::attr_type&) const override; + bool CanBeUsed( + const typename CRFDecodingTuple::attr_type&) const override; const char* ImplType() const override { return "Intrinsic"; } }; diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc index e9b6e401c6..a4e3246f10 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc +++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc @@ -153,7 +153,7 @@ void LayerNorm(float* x, float* out, float* mean, float* var, } } -bool LayerNormKernel::UseMe(const int& d) const { +bool LayerNormKernel::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx) && d >= YMM_FLOAT_BLOCK; } diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h index dfa4c2f072..7b9f676050 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h +++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h @@ -30,7 +30,8 @@ void LayerNorm(float* x, float* out, float* mean, float* var, class LayerNormKernel : public KernelMore> { public: LayerNormKernel() { this->func = LayerNorm; } - bool UseMe(const typename LayerNormTuple::attr_type&) const override; + bool CanBeUsed( + const typename LayerNormTuple::attr_type&) const override; const char* ImplType() const override { return "Intrinsic"; } }; diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index 9ee1032e95..6e709a16d2 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -204,21 +204,21 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { } // TODO(TJ): tuning me -bool VSigmoidKernel::UseMe(const int& d) const { return true; } +bool VSigmoidKernel::CanBeUsed(const int& d) const { return true; } -bool VTanhKernel::UseMe(const int& d) const { return true; } +bool VTanhKernel::CanBeUsed(const int& d) const { return true; } -bool SoftmaxKernel::UseMe(const int& d) const { return true; } +bool SoftmaxKernel::CanBeUsed(const int& d) const { return true; } -bool LSTMCtHtKernel::UseMe(const lstm_attr_t& attr) const { return true; } +bool LSTMCtHtKernel::CanBeUsed(const lstm_attr_t& attr) const { return true; } -bool LSTMC1H1Kernel::UseMe(const lstm_attr_t& attr) const { return true; } +bool LSTMC1H1Kernel::CanBeUsed(const lstm_attr_t& attr) const { return true; } -bool GRUH1Kernel::UseMe(const gru_attr_t& attr) const { return true; } +bool GRUH1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } -bool GRUHtPart1Kernel::UseMe(const gru_attr_t& attr) const { return true; } +bool GRUHtPart1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } -bool GRUHtPart2Kernel::UseMe(const gru_attr_t& attr) const { return true; } +bool GRUHtPart2Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } } // namespace mix } // namespace more diff --git a/paddle/fluid/operators/jit/more/mix/mix.h b/paddle/fluid/operators/jit/more/mix/mix.h index 17eb96462f..994d485909 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.h +++ b/paddle/fluid/operators/jit/more/mix/mix.h @@ -34,12 +34,12 @@ void GRUH1(gru_t* step, const gru_attr_t* attr); void GRUHtPart1(gru_t* step, const gru_attr_t* attr); void GRUHtPart2(gru_t* step, const gru_attr_t* attr); -#define DECLARE_MORE_KERNEL(name) \ - class name##Kernel : public KernelMore> { \ - public: \ - name##Kernel() { this->func = name; } \ - bool UseMe(const typename name##Tuple::attr_type&) const override; \ - const char* ImplType() const override { return "Mixed"; } \ +#define DECLARE_MORE_KERNEL(name) \ + class name##Kernel : public KernelMore> { \ + public: \ + name##Kernel() { this->func = name; } \ + bool CanBeUsed(const typename name##Tuple::attr_type&) const override; \ + const char* ImplType() const override { return "Mixed"; } \ } // XYN diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index 084ea571ce..4f600b3814 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -130,105 +130,106 @@ void ASum(const double* x, double* res, int n) { // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 template <> -bool VMulKernel::UseMe(const int& d) const { +bool VMulKernel::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx512f) && d > 512; } template <> -bool VAddKernel::UseMe(const int& d) const { +bool VAddKernel::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx) && d > 512; } template <> -bool VScalKernel::UseMe(const int& d) const { +bool VScalKernel::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx512f) && d > 512; } template <> -bool VExpKernel::UseMe(const int& d) const { +bool VExpKernel::CanBeUsed(const int& d) const { return d > 7; } template <> -bool VSquareKernel::UseMe(const int& d) const { +bool VSquareKernel::CanBeUsed(const int& d) const { return d > 7; } template <> -bool VCopyKernel::UseMe(const int& d) const { +bool VCopyKernel::CanBeUsed(const int& d) const { return d > 15; } template <> -bool VBroadcastKernel::UseMe(const int64_t& d) const { +bool VBroadcastKernel::CanBeUsed(const int64_t& d) const { return d > 127; } template <> -bool VBroadcastKernel::UseMe(const int64_t& attr) const { +bool VBroadcastKernel::CanBeUsed(const int64_t& attr) const { return true; } template <> -bool VSigmoidKernel::UseMe(const int& d) const { +bool VSigmoidKernel::CanBeUsed(const int& d) const { return d > 7; } template <> -bool VTanhKernel::UseMe(const int& d) const { +bool VTanhKernel::CanBeUsed(const int& d) const { return d > 7; } template <> -bool SeqPoolKernel::UseMe(const seq_pool_attr_t& attr) const { +bool SeqPoolKernel::CanBeUsed(const seq_pool_attr_t& attr) const { return true; } template <> -bool SeqPoolKernel::UseMe(const seq_pool_attr_t& attr) const { +bool SeqPoolKernel::CanBeUsed(const seq_pool_attr_t& attr) const { return true; } template <> -bool EmbSeqPoolKernel::UseMe(const emb_seq_pool_attr_t& attr) const { +bool EmbSeqPoolKernel::CanBeUsed(const emb_seq_pool_attr_t& attr) const { return true; } template <> -bool EmbSeqPoolKernel::UseMe(const emb_seq_pool_attr_t& attr) const { +bool EmbSeqPoolKernel::CanBeUsed( + const emb_seq_pool_attr_t& attr) const { return true; } template <> -bool SgdKernel::UseMe(const sgd_attr_t& attr) const { +bool SgdKernel::CanBeUsed(const sgd_attr_t& attr) const { return true; } template <> -bool SgdKernel::UseMe(const sgd_attr_t& attr) const { +bool SgdKernel::CanBeUsed(const sgd_attr_t& attr) const { return true; } template <> -bool MatMulKernel::UseMe(const matmul_attr_t& attr) const { +bool MatMulKernel::CanBeUsed(const matmul_attr_t& attr) const { return platform::MayIUse(platform::avx); } template <> -bool MatMulKernel::UseMe(const matmul_attr_t& attr) const { +bool MatMulKernel::CanBeUsed(const matmul_attr_t& attr) const { return true; } template <> -bool SoftmaxKernel::UseMe(const int& d) const { +bool SoftmaxKernel::CanBeUsed(const int& d) const { // tuned on avx2 return platform::MayIUse(platform::avx) && d < 60; } -#define AWALYS_USE_ME_WITH_DOUBLE(func) \ - template <> \ - bool func##Kernel::UseMe(const int& d) const { \ - return true; \ +#define AWALYS_USE_ME_WITH_DOUBLE(func) \ + template <> \ + bool func##Kernel::CanBeUsed(const int& d) const { \ + return true; \ } AWALYS_USE_ME_WITH_DOUBLE(VMul); diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index 8c1d8b57e0..f51dca654c 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -175,13 +175,13 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, } } -#define DECLARE_MKL_KERNEL(name) \ - template \ - class name##Kernel : public KernelMore> { \ - public: \ - name##Kernel() { this->func = name; } \ - bool UseMe(const typename name##Tuple::attr_type&) const override; \ - const char* ImplType() const override { return "MKL"; } \ +#define DECLARE_MKL_KERNEL(name) \ + template \ + class name##Kernel : public KernelMore> { \ + public: \ + name##Kernel() { this->func = name; } \ + bool CanBeUsed(const typename name##Tuple::attr_type&) const override; \ + const char* ImplType() const override { return "MKL"; } \ } // ABCMNK diff --git a/paddle/fluid/operators/jit/registry.h b/paddle/fluid/operators/jit/registry.h index cb32c48720..c8da92c0c5 100644 --- a/paddle/fluid/operators/jit/registry.h +++ b/paddle/fluid/operators/jit/registry.h @@ -49,8 +49,8 @@ struct JitKernelRegistrarFunctor { void operator()(KernelType kt) const { KernelKey kkey(kt, PlaceType()); - Pool().Instance().Insert(kkey, - std::move(make_unique())); + Pool::Instance().Insert(kkey, + std::move(make_unique())); constexpr auto size = std::tuple_size>::value; JitKernelRegistrarFunctor diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index a574bf2079..898133a03b 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include #include #include #include @@ -68,31 +69,11 @@ template void TestAllImpls(const typename KernelTuple::attr_type& attr, const Tester& verifier, const Args&... args) { - // test jitcode - auto jitcode = jit::GetJitCode(attr); - if (jitcode) { - VLOG(10) << "Test Jitcode Kernel "; - verifier(jitcode, args...); + auto funcs = jit::GetAllCandidateFuncsWithTypes(attr); + for (auto f : funcs) { + VLOG(10) << "Test Kernel " << f.first; + verifier(f.second, args...); } - // test all impls in more - jit::KernelKey kkey(KernelTuple::kernel_type, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast*>(impl.get()); - if (i && i->UseMe(attr)) { - auto more = i->GetFunc(); - VLOG(10) << "Test More Kernel : " << i->ImplType(); - verifier(more, args...); - } - } - } - // test result from Get function - VLOG(10) << "Test final get function "; - auto tgt = jit::KernelFuncs::Cache().At(attr); - verifier(tgt, args...); } template @@ -100,7 +81,7 @@ void TestKernelXYZN() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(d), y(d), zref(d); @@ -159,7 +140,7 @@ void TestKernelAXYN() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); const T a = static_cast(3); @@ -202,7 +183,7 @@ void TestKernelXYN() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(d), yref(d); @@ -245,7 +226,7 @@ void TestKernelXRN() { auto last_acc = FLAGS_acc; FLAGS_acc = 1e-4; for (int d : TestSizes()) { - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(d); RandomVec(d, x.data()); @@ -279,7 +260,7 @@ void TestKernelLSTM() { const jit::lstm_attr_t attr( d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand), jit::to_kerneltype(act_cell), use_peephole); - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector xsrc(4 * d), wp(3 * d), ct_1(d); std::vector ct_ref(d), ht_ref(d), checked(2 * d); @@ -370,7 +351,7 @@ void TestKernelGRU() { for (auto& act_cand : all_acts) { const jit::gru_attr_t attr(d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand)); - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector xsrc(3 * d), ht_1(d), ht_ref(d); RandomVec(3 * d, xsrc.data()); @@ -423,7 +404,7 @@ void TestKernelNCHW16CMulNC() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); const int n = 3, c = 16 * 4, h = 10, w = 10; - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); int sz = n * c * h * w; std::vector x(sz), y(n * c), zref(sz); @@ -439,7 +420,9 @@ void TestKernelNCHW16CMulNC() { constexpr int simd_width = ZMM_FLOAT_BLOCK; int C = c / simd_width; auto tgt = jit::KernelFuncs::Cache().At(0); - auto jitcode = jit::GetJitCode(0); + auto funcs = jit::GetAllCandidateFuncs(0); + EXPECT_GT(funcs.size(), 0UL); + auto jitcode = funcs[0]; EXPECT_TRUE(tgt != nullptr); if (std::is_same::value && @@ -482,7 +465,7 @@ void TestKernelLayerNorm() { int left = n * x_dim_0; for (int x_dim_1 : TestSizes()) { int right = x_dim_1; - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); int sz = left * right; std::vector x(sz), mean(left), var(left), scale(right), bias(right), @@ -555,7 +538,7 @@ void TestKernelCRFDecoding() { test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000)); for (int seq_len : {1, 11, 17, 50}) { for (int tag_num : test_sizes) { - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); int x_sz = seq_len * tag_num; int w_sz = (tag_num + state_trans_base_idx) * tag_num; @@ -606,7 +589,7 @@ void TestKernelSeqPool() { jit::seq_pool_attr_t attr(w, type); for (int h : test_sizes) { attr.h = h; - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(h * w), yref(w); RandomVec(h * w, x.data()); @@ -649,7 +632,7 @@ void TestKernelEmbSeqPool() { for (auto type : pool_types) { for (int idx_w : {1, 2, 10, 16}) { for (int idx_h : {1, 2, 9, 13, 16}) { - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector idx(idx_h * idx_w); RandomVec(idx_h * idx_w, idx.data(), 0, tbl_h - 1); @@ -701,7 +684,7 @@ void TestKernelMatMul() { for (int m : {1, 2, 3, 4}) { for (int n : {1, 2, 3, 4}) { for (int k : TestSizes()) { - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector a(m * k), b(k * n), c(m * n); RandomVec(m * k, a.data()); @@ -740,7 +723,7 @@ void TestKernelSoftmax() { VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int bs : {1, 2, 10}) { for (int n : TestSizes()) { - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(bs * n), y(bs * n); RandomVec(bs * n, x.data()); @@ -808,7 +791,7 @@ void TestKernelSgd() { RandomVec(rows_size * grad_w, grad.data()); const int64_t* rows_data = rows.data(); const T* grad_data = grad.data(); - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size); ref(&lr, param_data, grad_data, rows_data, out_data, &attr); @@ -874,7 +857,7 @@ void TestKernelVBroadcast() { RandomVec(w, x.data()); const T* x_data = x.data(); for (int64_t h : {1, 2, 6}) { - auto ref = jit::GetRefer(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector y(w * h); T* y_data = y.data(); @@ -900,6 +883,135 @@ void TestKernelVBroadcast() { } } +// test pool +TEST(JITKernel_pool, jitcreator) { + const auto& jitcreators = jit::JitCodeCreatorPool::Instance().AllCreators(); + EXPECT_EQ(jitcreators.size(), 25UL); +} + +TEST(JITKernel_pool, jitpool) { + // jitpool is related with attr + const auto& kers = jit::JitCodePool().Instance().AllKernels(); + EXPECT_EQ(kers.size(), 0UL); + jit::GetAllCandidateKernels, CPUPlace>(3); + // after call GetAllCandidateKernels, it will create jitcode Automatically + EXPECT_EQ(kers.size(), 1UL); +} + +TEST(JITKernel_pool, more) { + const auto& kers = jit::KernelPool::Instance().AllKernels(); + EXPECT_EQ(kers.size(), 21UL); +} + +TEST(JITKernel_pool, refer) { + const auto& kers = jit::ReferKernelPool::Instance().AllKernels(); + EXPECT_EQ(kers.size(), 29UL); +} + +// test helper +TEST(JITKernel_helper, GetAllCandidateKernels) { + auto fp_kers = + jit::GetAllCandidateKernels, CPUPlace>(10); +#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) + EXPECT_GE(fp_kers.size(), 1UL); // refer +#else + EXPECT_GE(fp_kers.size(), 3UL); // jitcode, mkl, refer +#endif + + auto db_kers = + jit::GetAllCandidateKernels, CPUPlace>(10); +#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) + EXPECT_GE(db_kers.size(), 1UL); // refer +#else + EXPECT_GE(db_kers.size(), 2UL); // mkl, refer +#endif +} + +TEST(JITKernel_helper, GetAllCandidateFuncsWithTypes) { + auto fp_kers = + jit::GetAllCandidateFuncsWithTypes, CPUPlace>(10); + EXPECT_GE(fp_kers.size(), 3UL); // jitcode, mkl, refer + + auto db_kers = + jit::GetAllCandidateFuncsWithTypes, CPUPlace>(10); + EXPECT_GE(db_kers.size(), 2UL); // mkl, refer +} + +TEST(JITKernel_helper, GetAllCandidateFuncs) { + auto funcs = jit::GetAllCandidateFuncs, CPUPlace>(10); + auto kers = jit::GetAllCandidateKernels, CPUPlace>(10); + EXPECT_EQ(funcs.size(), kers.size()); + + std::vector x(10), tgt(10); + RandomVec(10, x.data()); + auto best = jit::GetDefaultBestFunc, CPUPlace>(10); + best(x.data(), tgt.data(), 10); + for (auto f : funcs) { + std::vector y(10); + f(x.data(), y.data(), 10); + ExpectEQ(y.data(), tgt.data(), 10); + } +} + +TEST(JITKernel_helper, attr) { + std::ostringstream out; + + // KernelTypes + out << jit::to_string(jit::kNone) << jit::to_string(jit::kCRFDecoding) + << jit::to_string(jit::kEmbSeqPool) << jit::to_string(jit::kGRUH1) + << jit::to_string(jit::kGRUHtPart1) << jit::to_string(jit::kGRUHtPart2) + << jit::to_string(jit::kHSum) << jit::to_string(jit::kHMax) + << jit::to_string(jit::kLSTMCtHt) << jit::to_string(jit::kLSTMC1H1) + << jit::to_string(jit::kLayerNorm) << jit::to_string(jit::kMatMul) + << jit::to_string(jit::kNCHW16CMulNC) << jit::to_string(jit::kSeqPool) + << jit::to_string(jit::kSoftmax) << jit::to_string(jit::kVAdd) + << jit::to_string(jit::kVAddBias) << jit::to_string(jit::kVAddRelu) + << jit::to_string(jit::kVBroadcast) << jit::to_string(jit::kVCopy) + << jit::to_string(jit::kVExp) << jit::to_string(jit::kVIdentity) + << jit::to_string(jit::kVMul) << jit::to_string(jit::kVRelu) + << jit::to_string(jit::kVScal) << jit::to_string(jit::kSgd) + << jit::to_string(jit::kVSigmoid) << jit::to_string(jit::kVSquare) + << jit::to_string(jit::kVSub) << jit::to_string(jit::kVTanh); + EXPECT_EQ(out.str().size(), 234); + + // SeqPoolTypes + out.str(""); + out << jit::to_string(jit::kSum) << jit::to_string(jit::kAvg) + << jit::to_string(jit::kSqrt); + EXPECT_EQ(out.str().size(), 13); + + EXPECT_EQ(jit::to_kerneltype("relu"), jit::kVRelu); + EXPECT_EQ(jit::to_kerneltype("Identity"), jit::kVIdentity); + EXPECT_EQ(jit::to_kerneltype("VEXP"), jit::kVExp); + EXPECT_EQ(jit::to_kerneltype("SigmoiD"), jit::kVSigmoid); + EXPECT_EQ(jit::to_kerneltype("VTanh"), jit::kVTanh); + + out.str(""); + out << jit::lstm_attr_t(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); + EXPECT_EQ(out.str().size(), 89); + + out.str(""); + out << jit::gru_attr_t(8, jit::kVIdentity, jit::kVSigmoid); + EXPECT_EQ(out.str().size(), 52); + + out.str(""); + out << jit::seq_pool_attr_t(8, jit::SeqPoolType::kSum); + EXPECT_EQ(out.str().size(), 44); + + out.str(""); + out << jit::emb_seq_pool_attr_t(1, 2, 3, 4, 5, jit::SeqPoolType::kAvg); + EXPECT_EQ(out.str().size(), 93); + + out.str(""); + out << jit::sgd_attr_t(1, 2, 3, 4, 5); + EXPECT_EQ(out.str().size(), 81); + + out.str(""); + out << jit::matmul_attr_t(1, 2, 3); + EXPECT_EQ(out.str().size(), 14); +} + +// test kernerls #define TestKernelVMul TestKernelXYZN #define TestKernelVAdd TestKernelXYZN #define TestKernelVAddRelu TestKernelXYZN @@ -969,6 +1081,14 @@ TEST_CPU_KERNEL(Softmax); TEST_CPU_KERNEL(Sgd); TEST_CPU_KERNEL(VBroadcast); +TEST(JITKernel, kernel_func) { + auto f1 = jit::KernelFuncs, CPUPlace>::Cache().At(3); + auto f2 = jit::KernelFuncs, CPUPlace>::Cache()[3]; + EXPECT_TRUE(f1 != nullptr); + EXPECT_TRUE(f1 == f2); + // TODO(TJ): check not equal +} + TEST(JITKernel_key, lstm) { jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); jit::lstm_attr_t attr2(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); @@ -1000,11 +1120,3 @@ TEST(JITKernel_key, gru) { EXPECT_TRUE(key2 == key3); EXPECT_TRUE(key3 != key4); } - -TEST(JITKernel, kernel_func) { - auto f1 = jit::KernelFuncs, CPUPlace>::Cache().At(3); - auto f2 = jit::KernelFuncs, CPUPlace>::Cache()[3]; - EXPECT_TRUE(f1 != nullptr); - EXPECT_TRUE(f1 == f2); - // TODO(TJ): check not equal -} From 5579fae1d2717443825e7489ed8b6da1b436bd95 Mon Sep 17 00:00:00 2001 From: Tink_Y <31891223+tink2123@users.noreply.github.com> Date: Mon, 11 Mar 2019 01:01:21 +0800 Subject: [PATCH 033/230] Update activation_op.cc test=develop --- paddle/fluid/operators/activation_op.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index cc948f2697..27e1463570 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include #include "paddle/fluid/operators/activation_op.h" #include #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h" From e4e0d034594f34b6327ecdcd0f0fcbba956f18a3 Mon Sep 17 00:00:00 2001 From: tink2123 Date: Mon, 11 Mar 2019 02:12:40 +0000 Subject: [PATCH 034/230] fix format test=develop --- paddle/fluid/operators/activation_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 27e1463570..747b9ca278 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include #include "paddle/fluid/operators/activation_op.h" +#include #include +#include #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h" #include "paddle/fluid/platform/port.h" #ifdef PADDLE_WITH_CUDA From 732fa00eaf905e531d907493132ba948bf159639 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 8 Mar 2019 13:05:11 +0000 Subject: [PATCH 035/230] disable gc in recurrent_op currently test=develop --- paddle/fluid/framework/executor.cc | 38 ++++++++++++++++---------- paddle/fluid/framework/executor.h | 17 +++++++++--- paddle/fluid/operators/recurrent_op.cc | 8 ++++-- paddle/fluid/pybind/pybind.cc | 6 ++-- python/paddle/fluid/executor.py | 2 +- 5 files changed, 47 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 7eef9ec504..f3869ceb6d 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -80,11 +80,11 @@ static std::unordered_map GetNonPersistableReferenceCounts( ExecutorPrepareContext::ExecutorPrepareContext( const framework::ProgramDesc& prog, size_t block_id, - const std::vector& skip_ref_cnt_vars) - : prog_(prog), block_id_(block_id) { - if (GetEagerDeletionThreshold() >= 0) { - global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id), - skip_ref_cnt_vars); + const std::vector& keep_vars, bool force_disable_gc) + : prog_(prog), block_id_(block_id), force_disable_gc_(force_disable_gc) { + if (GetEagerDeletionThreshold() >= 0 && !force_disable_gc_) { + global_ref_cnts_ = + GetNonPersistableReferenceCounts(prog.Block(block_id), keep_vars); } } @@ -189,13 +189,15 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, } void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, - bool create_local_scope, bool create_vars) { + bool create_local_scope, bool create_vars, + const std::vector& skip_ref_cnt_vars, + bool force_disable_gc) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); #ifdef PADDLE_WITH_NGRAPH if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc); #endif - auto ctx = Prepare(pdesc, block_id); + auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars); } @@ -362,9 +364,9 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, std::unique_ptr Executor::Prepare( const ProgramDesc& program, int block_id, - const std::vector& skip_ref_cnt_vars) { - std::unique_ptr ctx( - new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars)); + const std::vector& skip_ref_cnt_vars, bool force_disable_gc) { + std::unique_ptr ctx(new ExecutorPrepareContext( + program, block_id, skip_ref_cnt_vars, force_disable_gc)); PADDLE_ENFORCE_LT(static_cast(block_id), program.Size()); auto& block = program.Block(block_id); for (auto& op_desc : block.AllOps()) { @@ -375,7 +377,8 @@ std::unique_ptr Executor::Prepare( std::vector> Executor::Prepare( const ProgramDesc& program, const std::vector& block_ids, - const std::vector>& skip_ref_cnt_vars) { + const std::vector>& skip_ref_cnt_vars, + bool force_disable_gc) { PADDLE_ENFORCE( skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(), "skip_ref_cnt_vars should be either empty or equals to block number %d", @@ -385,9 +388,11 @@ std::vector> Executor::Prepare( for (auto& bid : block_ids) { ExecutorPrepareContext* ctx; if (skip_ref_cnt_vars.empty()) { - ctx = new ExecutorPrepareContext(program, bid); + ctx = new ExecutorPrepareContext(program, bid, std::vector(), + force_disable_gc); } else { - ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]); + ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx], + force_disable_gc); } PADDLE_ENFORCE_LT(static_cast(bid), program.Size()); auto& block = program.Block(bid); @@ -414,7 +419,9 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr gc; - if (max_memory_size >= 0) { + // FIXME(zjl): recurrent_op is rather complex, we would + // disable gc forcely in recurrent_op + if (!ctx->force_disable_gc_ && max_memory_size >= 0) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { @@ -432,7 +439,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, #ifdef PADDLE_WITH_CUDA } #endif - if (gc && keep_kids) { + // If gc is enabled and block size > 1 + if (gc && ctx->prog_.Size() > 1) { operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(ctx->block_id_, ctx->ops_); } diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 5a040ac641..65cb9e51ab 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -15,7 +15,9 @@ limitations under the License. */ #pragma once #include +#include #include +#include #include #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/op_info.h" @@ -30,7 +32,8 @@ namespace framework { struct ExecutorPrepareContext { ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id, const std::vector& skip_ref_cnt_vars = - std::vector()); + std::vector(), + bool force_disable_gc = false); ~ExecutorPrepareContext(); @@ -38,6 +41,7 @@ struct ExecutorPrepareContext { const framework::ProgramDesc& prog_; size_t block_id_; + bool force_disable_gc_; std::vector> ops_; std::unordered_map global_ref_cnts_; @@ -66,7 +70,10 @@ class Executor { * Scope */ void Run(const ProgramDesc& prog, Scope* scope, int block_id, - bool create_local_scope = true, bool create_vars = true); + bool create_local_scope = true, bool create_vars = true, + const std::vector& skip_ref_cnt_vars = + std::vector(), + bool force_disable_gc = false); // This API is very slow. void Run(const ProgramDesc& program, Scope* scope, @@ -79,12 +86,14 @@ class Executor { static std::unique_ptr Prepare( const ProgramDesc& program, int block_id, const std::vector& skip_ref_cnt_vars = - std::vector()); + std::vector(), + bool force_disable_gc = false); static std::vector> Prepare( const ProgramDesc& program, const std::vector& block_ids, const std::vector>& skip_ref_cnt_vars = - std::vector>()); + std::vector>(), + bool force_disable_gc = false); void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id); diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index a1e02a3fd0..eb39b3119c 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -270,7 +270,9 @@ class RecurrentOp : public RecurrentBase { // Every inputs are linked now, execute! executor.Run(*program, &cur_scope, block->ID(), - false /*create_local_scope*/); + false /*create_local_scope*/, true /*create_vars*/, + std::vector() /*skip_ref_cnt_vars*/, + true /*force_disable_gc*/); // get device context from pool platform::DeviceContextPool &pool = @@ -385,7 +387,9 @@ class RecurrentGradOp : public RecurrentBase { VLOG(5) << "Recurrent memory linking finished "; // Run step block with cur_scope executor.Run(*program, &cur_scope, block->ID(), - false /*create_local_scope*/); + false /*create_local_scope*/, true /*create_vars*/, + std::vector() /*skip_ref_cnt_vars*/, + true /*force_disable_gc*/); VLOG(5) << "executor.Run finished "; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index cf59ff6d3b..439d9aa83f 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -876,9 +876,11 @@ All parameter, weight, gradient are variables in Paddle. .def(py::init()) .def("close", &Executor::Close) .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope, - int block_id, bool create_local_scope, bool create_vars) { + int block_id, bool create_local_scope, bool create_vars, + const std::vector &fetch_vars) { pybind11::gil_scoped_release release; - self.Run(prog, scope, block_id, create_local_scope, create_vars); + self.Run(prog, scope, block_id, create_local_scope, create_vars, + fetch_vars); }); m.def("init_gflags", framework::InitGflags); diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index dfa50e721c..cc3c0dd689 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -590,7 +590,7 @@ class Executor(object): fetch_var_name=fetch_var_name) self._feed_data(program, feed, feed_var_name, scope) - exe.run(program.desc, scope, 0, True, True) + exe.run(program.desc, scope, 0, True, True, fetch_var_name) outs = self._fetch_data(fetch_list, fetch_var_name, scope) if return_numpy: outs = as_numpy(outs) From cfc59b13e9a3bb9af39edb1acfc34c7e69791166 Mon Sep 17 00:00:00 2001 From: tink2123 Date: Mon, 11 Mar 2019 03:12:16 +0000 Subject: [PATCH 036/230] modified api.spec test=develop --- paddle/fluid/API.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 00bd23b1fe..e6d638031f 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -301,8 +301,8 @@ paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=Non paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973')) paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '485f2686bcc2fe37a4bd893769c8a3e2')) paddle.fluid.layers.acos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c721122352acfc1853bffadf2d59103b')) -paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '01f1766aa76eff1df30147505b59f7c4')) paddle.fluid.layers.asin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0619b891e80f419b28016cde3d106c68')) +paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '01f1766aa76eff1df30147505b59f7c4')) paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b47f5da13913d3e56bdb1e612a73f3f2')) paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cc6ac2f14f03c52aaa83a59bf83b8d26')) paddle.fluid.layers.square (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '48dfb45d773dbc30126c3a7f777de5ee')) From 9e2c7e69fb1ba277a0558b18d61676110db7049e Mon Sep 17 00:00:00 2001 From: luotao1 Date: Mon, 11 Mar 2019 11:51:08 +0800 Subject: [PATCH 037/230] simplify the zero_copy tests test=develop --- .../tests/api/analyzer_pyramid_dnn_tester.cc | 9 +- .../tests/api/analyzer_rnn1_tester.cc | 139 ++--------------- .../tests/api/analyzer_seq_pool1_tester.cc | 143 ++---------------- .../fluid/inference/tests/api/tester_helper.h | 1 + 4 files changed, 35 insertions(+), 257 deletions(-) diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc index 5ba553aad6..5157bd280d 100644 --- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc @@ -167,8 +167,15 @@ TEST(Analyzer_Pyramid_DNN, compare) { SetInput(&input_slots_all); CompareNativeAndAnalysis( reinterpret_cast(&cfg), input_slots_all); +} + +// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy +TEST(Analyzer_Pyramid_DNN, compare_zero_copy) { + AnalysisConfig cfg; + SetConfig(&cfg); - // Compare AnalysisConfig and AnalysisConfig + ZeroCopy + std::vector> input_slots_all; + SetInput(&input_slots_all); std::vector outputs_name; outputs_name.emplace_back("cos_sim_2.tmp_0"); CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 36282b3efe..dcf4b38ce8 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -207,6 +207,9 @@ void SetConfig(AnalysisConfig *cfg) { cfg->DisableGpu(); cfg->SwitchSpecifyInputNames(); cfg->SwitchIrOptim(); + if (FLAGS_zero_copy) { + cfg->SwitchUseFeedFetchOps(false); + } } void SetInput(std::vector> *inputs) { @@ -285,133 +288,17 @@ TEST(Analyzer_rnn1, multi_thread) { input_slots_all, &outputs, 2 /* multi_thread */); } -// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing -// on the complex RNN1 model. -TEST(Analyzer_rnn1, ZeroCopy) { - AnalysisConfig config; - SetConfig(&config); - config.SwitchUseFeedFetchOps(false); - - PaddlePlace place; - - auto predictor = CreatePaddlePredictor(config); - - config.SwitchUseFeedFetchOps(true); - auto native_predictor = - CreatePaddlePredictor(config.ToNativeConfig()); - - config.SwitchUseFeedFetchOps( - true); // the analysis predictor needs feed/fetch. - auto analysis_predictor = CreatePaddlePredictor(config); - -#define NEW_TENSOR(name__) \ - auto name__##_tensor = predictor->GetInputTensor(#name__); - NEW_TENSOR(data_lod_attention); - NEW_TENSOR(cell_init); - NEW_TENSOR(data); - NEW_TENSOR(week); - NEW_TENSOR(minute); - NEW_TENSOR(hidden_init); - - // Prepare data for AnalysisPredictor - DataRecord data(FLAGS_infer_data, FLAGS_batch_size); - PrepareZeroCopyInputs(data_lod_attention_tensor.get(), cell_init_tensor.get(), - data_tensor.get(), hidden_init_tensor.get(), - week_tensor.get(), minute_tensor.get(), &data, - FLAGS_batch_size); - - // Prepare data for NativePredictor - std::vector> native_inputs; - SetInput(&native_inputs); - std::vector native_outputs; - std::vector analysis_outputs; - - auto output_tensor = predictor->GetOutputTensor("final_output.tmp_1"); - // Run analysis predictor - - int num_ops; - auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); - ASSERT_TRUE(fuse_statis.count("fc_fuse")); - ASSERT_EQ(fuse_statis.at("fc_fuse"), 1); - ASSERT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM - ASSERT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1); - ASSERT_EQ(num_ops, - 13); // After graph optimization, only 13 operators exists. - - Timer timer; - double total_time{0}; - for (int i = 0; i < FLAGS_repeat; i++) { - timer.tic(); - predictor->ZeroCopyRun(); - total_time += timer.toc(); - } - LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor); - - ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs)); - LOG(INFO) << "native output " << DescribeTensor(native_outputs.front()); - - int output_size{0}; // this is the number of elements not memory size - auto *zero_copy_data = output_tensor->data(&place, &output_size); - auto *native_data = static_cast(native_outputs.front().data.data()); - for (int i = 0; i < output_size; i++) { - EXPECT_NEAR(zero_copy_data[i], native_data[i], 1e-3); - } -} - -TEST(Analyzer_rnn1, ZeroCopyMultiThread) { - AnalysisConfig config; - SetConfig(&config); - config.SwitchUseFeedFetchOps(false); - -#define NEW_TENSOR(name__) \ - auto name__##_tensor = predictor->GetInputTensor(#name__); - - std::vector> predictors; - predictors.emplace_back(CreatePaddlePredictor(config)); - for (int tid = 1; tid < FLAGS_num_threads; tid++) { - predictors.emplace_back(predictors.front()->Clone()); - } - double total_time_of_threads{0}; - std::vector threads; - - for (int tid = 0; tid < FLAGS_num_threads; tid++) { - threads.emplace_back([&, tid] { - auto &predictor = predictors[tid]; - NEW_TENSOR(data_lod_attention); - NEW_TENSOR(cell_init); - NEW_TENSOR(data); - NEW_TENSOR(week); - NEW_TENSOR(minute); - NEW_TENSOR(hidden_init); - - // Prepare data for AnalysisPredictor - DataRecord data(FLAGS_infer_data, FLAGS_batch_size); - Timer timer; - double total_time{0}; - - for (int i = 0; i < FLAGS_repeat; i++) { - PrepareZeroCopyInputs(data_lod_attention_tensor.get(), - cell_init_tensor.get(), data_tensor.get(), - hidden_init_tensor.get(), week_tensor.get(), - minute_tensor.get(), &data, FLAGS_batch_size); - - timer.tic(); - predictor->ZeroCopyRun(); - total_time += timer.toc(); - } - - total_time_of_threads += total_time; - - LOG(INFO) << "thread time: " << total_time / FLAGS_repeat; - }); - } - - for (auto &t : threads) { - t.join(); - } +// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy +TEST(Analyzer_rnn1, compare_zero_copy) { + AnalysisConfig cfg; + SetConfig(&cfg); - LOG(INFO) << "average time: " - << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat; + std::vector> input_slots_all; + SetInput(&input_slots_all); + std::vector outputs_name; + outputs_name.emplace_back("final_output.tmp_1"); + CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), + input_slots_all, outputs_name); } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index cca2ab1ee1..19fa5528da 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -144,6 +144,9 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) { cfg->SwitchSpecifyInputNames(); cfg->SwitchIrDebug(); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); + if (FLAGS_zero_copy) { + cfg->SwitchUseFeedFetchOps(false); + } if (use_mkldnn) { cfg->EnableMKLDNN(); } @@ -184,10 +187,10 @@ TEST(Analyzer_seq_pool1, compare_determine) { input_slots_all); } -void analysis_fuse_statis(bool use_zerocopy) { +// Check the fuse status +TEST(Analyzer_seq_pool1, fuse_statis) { AnalysisConfig cfg; SetConfig(&cfg); - cfg.SwitchUseFeedFetchOps(!use_zerocopy); int num_ops; auto predictor = CreatePaddlePredictor(cfg); auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); @@ -203,137 +206,17 @@ void analysis_fuse_statis(bool use_zerocopy) { EXPECT_EQ(num_ops, 171); } -// Check the fuse status -TEST(Analyzer_seq_pool1, fuse_statis) { analysis_fuse_statis(false); } - -void PrepareZeroCopyInputs( - const std::unique_ptr &predictor, - std::vector> *inputs) { - DataRecord data(FLAGS_infer_data, FLAGS_batch_size); - // only feed one batch - const auto &one_batch = data.NextBatch(); - inputs->clear(); - for (size_t i = 0; i < one_batch.size(); ++i) { - auto &slot = one_batch[i]; - auto tensor = predictor->GetInputTensor(slot.name + "_embed"); - tensor->Reshape(slot.shape); - tensor->SetLoD({slot.lod}); - ZeroCopyTensorAssignData(tensor.get(), slot.data); - inputs->emplace_back(std::move(tensor)); - } -} - -// return the output values -std::vector zerocopy_profile(int repeat_times) { - AnalysisConfig config; - SetConfig(&config); - config.SwitchUseFeedFetchOps(false); - auto predictor = CreatePaddlePredictor(config); - std::vector> inputs; - PrepareZeroCopyInputs(predictor, &inputs); - auto output_tensor = predictor->GetOutputTensor(out_var_name); - Timer timer; - LOG(INFO) << "Warm up run..."; - timer.tic(); - predictor->ZeroCopyRun(); - PrintTime(FLAGS_batch_size, 1, 1, 0, timer.toc(), 1); - if (FLAGS_profile) { - paddle::platform::ResetProfiler(); - } - LOG(INFO) << "Run " << repeat_times << " times..."; - timer.tic(); - for (int i = 0; i < repeat_times; i++) { - predictor->ZeroCopyRun(); - } - PrintTime(FLAGS_batch_size, repeat_times, 1, 0, timer.toc() / repeat_times, - 1); - - LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor); - PaddlePlace place; - int output_size{0}; - auto *pdata = output_tensor->data(&place, &output_size); - std::vector res(output_size); - for (int i = 0; i < output_size; ++i) { - res[i] = pdata[i]; - } - return res; -} - -TEST(Analyzer_seq_pool1, zerocopy_profile) { zerocopy_profile(FLAGS_repeat); } - -TEST(Analyzer_seq_pool1, zerocopy_profile_threads) { - AnalysisConfig config; - SetConfig(&config); - config.SwitchUseFeedFetchOps(false); - - std::vector> predictors; - predictors.emplace_back(CreatePaddlePredictor(config)); - for (int tid = 1; tid < FLAGS_num_threads; tid++) { - predictors.emplace_back(predictors.front()->Clone()); - } - double total_time_of_threads{0}; - std::vector threads; - - for (int tid = 0; tid < FLAGS_num_threads; tid++) { - threads.emplace_back([&, tid] { - auto &predictor = predictors[tid]; - std::vector> inputs; - PrepareZeroCopyInputs(predictor, &inputs); - auto output_tensor = predictor->GetOutputTensor(out_var_name); - Timer timer; - double total_time{0}; - - LOG(INFO) << "Warm up run..."; - timer.tic(); - predictor->ZeroCopyRun(); - PrintTime(FLAGS_batch_size, 1, FLAGS_num_threads, tid, timer.toc(), 1); - if (FLAGS_profile) { - paddle::platform::ResetProfiler(); - } - int repeat_times = FLAGS_repeat; - LOG(INFO) << "Run " << repeat_times << " times..."; - timer.tic(); - - for (int i = 0; i < repeat_times; i++) { - predictor->ZeroCopyRun(); - } - total_time += timer.toc(); - total_time_of_threads += total_time; - - LOG(INFO) << "thread time: " << total_time / repeat_times; - }); - } - - for (auto &t : threads) { - t.join(); - } - - LOG(INFO) << "average time: " - << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat; -} - -TEST(Analyzer_seq_pool1, zerocopy_fuse_statis) { analysis_fuse_statis(true); } +// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy +TEST(Analyzer_seq_pool1, compare_zero_copy) { + AnalysisConfig cfg; + SetConfig(&cfg); -TEST(Analyzer_seq_pool1, zerocopy_compare_native) { - AnalysisConfig config; - SetConfig(&config); - config.SwitchUseFeedFetchOps(true); - auto predictor = CreatePaddlePredictor(config.ToNativeConfig()); - std::vector native_outputs; std::vector> input_slots_all; SetInput(&input_slots_all); - ASSERT_TRUE(predictor->Run(input_slots_all[0], &native_outputs)); - EXPECT_EQ(native_outputs.size(), 1UL); - - auto zerocopy_output = zerocopy_profile(1); - EXPECT_EQ(zerocopy_output.size() * sizeof(float), - native_outputs.front().data.length()); - auto *native_data = static_cast(native_outputs.front().data.data()); - for (size_t i = 0; i < zerocopy_output.size(); ++i) { - EXPECT_LT( - std::fabs((zerocopy_output[i] - native_data[i]) / zerocopy_output[i]), - 1e-3); - } + std::vector outputs_name; + outputs_name.emplace_back(out_var_name); + CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), + input_slots_all, outputs_name); } } // namespace analysis diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 9a843e8d02..c32e6e3857 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -432,6 +432,7 @@ void CompareAnalysisAndZeroCopy( ZeroCopyTensor zerocopy_output = *predictor->GetOutputTensor(outputs_name[i]).get(); zerocopy_outputs.emplace_back(zerocopy_output); + LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(zerocopy_output); } // compare CompareResult(analysis_outputs, zerocopy_outputs); From d7407c90aa3ee847fda052fdca9f10b788249875 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 8 Mar 2019 13:01:30 +0000 Subject: [PATCH 038/230] refine cross_entropy mem test=develop --- paddle/fluid/operators/cross_entropy2_op.cc | 218 ++++++++++++++++++++ paddle/fluid/operators/cross_entropy2_op.cu | 29 +++ paddle/fluid/operators/cross_entropy2_op.h | 188 +++++++++++++++++ python/paddle/fluid/layers/nn.py | 16 ++ 4 files changed, 451 insertions(+) create mode 100644 paddle/fluid/operators/cross_entropy2_op.cc create mode 100644 paddle/fluid/operators/cross_entropy2_op.cu create mode 100644 paddle/fluid/operators/cross_entropy2_op.h diff --git a/paddle/fluid/operators/cross_entropy2_op.cc b/paddle/fluid/operators/cross_entropy2_op.cc new file mode 100644 index 0000000000..03b217a974 --- /dev/null +++ b/paddle/fluid/operators/cross_entropy2_op.cc @@ -0,0 +1,218 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cross_entropy2_op.h" +#include +#include +#include + +namespace paddle { +namespace operators { + +class CrossEntropyOp2 : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + + PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("XShape"), + "Output(XShape) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto label_dims = ctx->GetInputDim("Label"); + int rank = x_dims.size(); + PADDLE_ENFORCE_EQ(rank, label_dims.size(), + "Input(X) and Input(Label) shall have the same rank."); + bool check = true; + if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 || + framework::product(label_dims) <= 0)) { + check = false; + } + if (check) { + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(label_dims, 0, rank - 1), + "Input(X) and Input(Label) shall have the same shape " + "except the last dimension."); + } + + PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1UL, + "Last dimension of Input(Label) should be 1."); + auto y_dims = x_dims; + y_dims[rank - 1] = 1; + ctx->SetOutputDim("Y", y_dims); + ctx->ShareLoD("X", /*->*/ "Y"); + + auto x_dims_vec = framework::vectorize(x_dims); + x_dims_vec.push_back(0); + ctx->SetOutputDim("XShape", framework::make_ddim(x_dims_vec)); + ctx->ShareLoD("X", /*->*/ "XShape"); + } + + protected: + // Explicitly set that the data type of computation kernel of cross_entropy + // is determined by its input "X". + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class CrossEntropyGradientOp2 : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("XShape"), + "Input(XShape) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null."); + + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) shoudl be not null."); + + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); + + auto x_shapes = ctx->GetInputDim("XShape"); + framework::DDim x_dims(x_shapes.Get(), x_shapes.size() - 1); + auto label_dims = ctx->GetInputDim("Label"); + auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); + int rank = x_dims.size(); + PADDLE_ENFORCE_EQ(dy_dims.size(), rank, + "Input(Y@Grad) and Input(X) should have the same rank."); + PADDLE_ENFORCE_EQ(label_dims.size(), rank, + "Input(Label) and Input(X) should have the same rank."); + + bool check = true; + if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 || + framework::product(label_dims) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(label_dims, 0, rank - 1), + "The Input(X) and Input(Label) should have the same " + "shape except the last dimension."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(dy_dims, 0, rank - 1), + "The Input(X) and Input(Y@Grad) should have the same " + "shape except the last dimension."); + } + PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1, + "The last dimension of Input(Y@Grad) should be 1."); + PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1, + "Last dimension of Input(Label) should be 1."); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("XShape", framework::GradVarName("X")); + } + + protected: + // Explicitly set that the data type of computation kernel of cross_entropy + // is determined by its input "X". + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Y"))->type(), + ctx.device_context()); + } +}; + +class CrossEntropyOpMaker2 : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, default Tensor), a tensor whose last dimension " + "size is equal to the number of classes. This input is a " + "probability computed by the previous operator, which is almost " + "always the result of a softmax operator."); + AddInput( + "Label", + "(Tensor), the tensor which represents the ground truth. It has the " + "same shape with 'X' except the last dimension. One hot Tensor."); + AddOutput("Y", + "(Tensor, default Tensor), a tensor whose shape is same " + "with 'X' except that the last dimension size is 1. It " + "represents the cross entropy loss."); + AddOutput("XShape", "Temporaily variable to save shape and LoD of X."); + AddAttr("ignore_index", + "(int, default -100), Specifies a target value that is" + "ignored and does not contribute to the input gradient." + "Only valid if soft_label is set to False") + .SetDefault(-100); + AddComment(R"DOC( +CrossEntropy Operator. + +The input 'X' and 'Label' will first be logically flattened to 2-D matrixs. +The matrix's second dimension(row length) is as same as the original last +dimension, and the first dimension(column length) is the product of all other +original dimensions. Then the softmax computation will take palce on each raw +of flattened matrixs. + +Only support hard label. + +Both the input X and Label can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD information with input X. + +)DOC"); + } +}; + +class CrossEntropyOpInferVarType2 + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{{"X", /*->*/ "Y"}}; + } +}; + +class CrossEntropyGradOpMaker2 : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("cross_entropy_grad2"); + op->SetInput("Label", Input("Label")); + op->SetInput("Y", Output("Y")); + op->SetInput("XShape", Output("XShape")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPUCtx = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR(cross_entropy2, ops::CrossEntropyOp2, + ops::CrossEntropyOpMaker2, ops::CrossEntropyOpInferVarType2, + ops::CrossEntropyGradOpMaker2); +REGISTER_OPERATOR(cross_entropy_grad2, ops::CrossEntropyGradientOp2); +REGISTER_OP_CPU_KERNEL(cross_entropy2, + ops::CrossEntropyOpKernel2, + ops::CrossEntropyOpKernel2); +REGISTER_OP_CPU_KERNEL(cross_entropy_grad2, + ops::CrossEntropyGradientOpKernel2, + ops::CrossEntropyGradientOpKernel2); diff --git a/paddle/fluid/operators/cross_entropy2_op.cu b/paddle/fluid/operators/cross_entropy2_op.cu new file mode 100644 index 0000000000..1868c1b866 --- /dev/null +++ b/paddle/fluid/operators/cross_entropy2_op.cu @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cross_entropy2_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace plat = paddle::platform; +namespace ops = paddle::operators; +using CUDACtx = paddle::platform::CUDADeviceContext; +REGISTER_OP_CUDA_KERNEL(cross_entropy2, + ops::CrossEntropyOpKernel2, + ops::CrossEntropyOpKernel2, + ops::CrossEntropyOpKernel2); + +REGISTER_OP_CUDA_KERNEL( + cross_entropy_grad2, ops::CrossEntropyGradientOpKernel2, + ops::CrossEntropyGradientOpKernel2, + ops::CrossEntropyGradientOpKernel2); diff --git a/paddle/fluid/operators/cross_entropy2_op.h b/paddle/fluid/operators/cross_entropy2_op.h new file mode 100644 index 0000000000..3d209f7c5c --- /dev/null +++ b/paddle/fluid/operators/cross_entropy2_op.h @@ -0,0 +1,188 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/cross_entropy.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +HOSTDEVICE inline platform::float16 RealLog(platform::float16 x) { +#ifdef __NVCC__ + return static_cast(logf(static_cast(x))); +#else + return static_cast(std::log(static_cast(x))); +#endif +} + +HOSTDEVICE inline float RealLog(float x) { +#ifdef __NVCC__ + return logf(x); +#else + return std::log(x); +#endif +} + +HOSTDEVICE inline double RealLog(double x) { +#ifdef __NVCC__ + return log(x); +#else + return std::log(x); +#endif +} + +HOSTDEVICE inline platform::float16 RealExp(platform::float16 x) { +#ifdef __NVCC__ + return static_cast(expf(static_cast(x))); +#else + return static_cast(std::exp(static_cast(x))); +#endif +} + +HOSTDEVICE inline float RealExp(float x) { +#ifdef __NVCC__ + return expf(x); +#else + return std::exp(x); +#endif +} + +HOSTDEVICE inline double RealExp(double x) { +#ifdef __NVCC__ + return exp(x); +#else + return std::exp(x); +#endif +} + +template +struct CrossEntropyForwardFunctor { + CrossEntropyForwardFunctor(const T *x, T *y, const int64_t *label, + int64_t ignore_index, int64_t feature_size) + : x_(x), + y_(y), + label_(label), + ignore_index_(ignore_index), + feature_size_(feature_size) {} + + HOSTDEVICE void operator()(int64_t row_idx) const { + auto col_idx = label_[row_idx]; + if (col_idx != ignore_index_) { + y_[row_idx] = -math::TolerableValue()( + RealLog(x_[row_idx * feature_size_ + col_idx])); + } else { + y_[row_idx] = 0; + } + } + + const T *x_; + T *y_; + const int64_t *label_; + int64_t ignore_index_; + int64_t feature_size_; +}; + +template +struct CrossEntropyBackwardFunctor { + CrossEntropyBackwardFunctor(T *dx, const T *y, const T *dy, + const int64_t *label, int64_t ignore_index, + int64_t feature_size) + : dx_(dx), + y_(y), + dy_(dy), + label_(label), + ignore_index_(ignore_index), + feature_size_(feature_size) {} + + HOSTDEVICE void operator()(int64_t idx) const { + auto row_idx = idx / feature_size_; + auto col_idx = idx % feature_size_; + auto label = label_[row_idx]; + if (label == col_idx && label != ignore_index_) { + dx_[idx] = -dy_[row_idx] * RealExp(y_[row_idx]); + } else { + dx_[idx] = 0; + } + } + + T *dx_; + const T *y_; + const T *dy_; + const int64_t *label_; + int64_t ignore_index_; + int64_t feature_size_; +}; + +template +class CrossEntropyOpKernel2 : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto *label = ctx.Input("Label"); + auto *y = ctx.Output("Y"); + + auto *p_y = y->mutable_data(ctx.GetPlace()); + auto *p_x = x->data(); + auto *p_label = label->data(); + + int rank = x->dims().size(); + int64_t feature_size = x->dims()[rank - 1]; + int64_t batch_size = framework::product(x->dims()) / feature_size; + + int64_t ignore_index = ctx.Attr("ignore_index"); + + platform::ForRange for_range( + ctx.template device_context(), batch_size); + for_range(CrossEntropyForwardFunctor(p_x, p_y, p_label, ignore_index, + feature_size)); + } +}; + +template +class CrossEntropyGradientOpKernel2 : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *dx = ctx.Output(framework::GradVarName("X")); + auto *y = ctx.Input("Y"); + auto *dy = ctx.Input(framework::GradVarName("Y")); + auto *label = ctx.Input("Label"); + + auto *p_dx = dx->mutable_data(ctx.GetPlace()); + auto *p_y = y->data(); + auto *p_dy = dy->data(); + auto *p_label = label->data(); + + int64_t ignore_index = ctx.Attr("ignore_index"); + int rank = dx->dims().size(); + int64_t feature_size = dx->dims()[rank - 1]; + int64_t batch_size = framework::product(dx->dims()) / feature_size; + + platform::ForRange for_range( + ctx.template device_context(), + batch_size * feature_size); + for_range(CrossEntropyBackwardFunctor(p_dx, p_y, p_dy, p_label, + ignore_index, feature_size)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9d1d5fe093..4f384ce37d 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1432,6 +1432,8 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): predict = fluid.layers.fc(input=net, size=classdim, act='softmax') cost = fluid.layers.cross_entropy(input=predict, label=label) """ + if not soft_label: + return cross_entropy2(input, label, ignore_index) helper = LayerHelper('cross_entropy', **locals()) out = helper.create_variable_for_type_inference(dtype=input.dtype) helper.append_op( @@ -1444,6 +1446,20 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): return out +def cross_entropy2(input, label, ignore_index=kIgnoreIndex): + helper = LayerHelper('cross_entropy2', **locals()) + out = helper.create_variable_for_type_inference(dtype=input.dtype) + xshape = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type='cross_entropy2', + inputs={'X': [input], + 'Label': [label]}, + outputs={'Y': [out], + 'XShape': [xshape]}, + attrs={'ignore_index': ignore_index}) + return out + + def bpr_loss(input, label, name=None): """ Bayesian Personalized Ranking Loss Operator. From cfd012e2cb82dc0a2f4ddcc0d23eeefbb28aff0a Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 11 Mar 2019 04:05:00 +0000 Subject: [PATCH 039/230] add unittest test=develop --- paddle/fluid/operators/expand_op.cc | 18 ++++- .../tests/unittests/test_cross_entropy2_op.py | 79 +++++++++++++++++++ .../tests/unittests/test_dist_transpiler.py | 20 ++--- 3 files changed, 106 insertions(+), 11 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index 44a2f37b66..ce3d9a7aac 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -138,12 +138,28 @@ class ExpandGradOp : public framework::OperatorWithKernel { } }; +class ExpandGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("expand_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::ExpandGradOpDescMaker); REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp); REGISTER_OP_CPU_KERNEL( expand, ops::ExpandKernel, diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py b/python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py new file mode 100644 index 0000000000..c29d422361 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py @@ -0,0 +1,79 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from op_test import OpTest +import unittest +import numpy as np +import six + + +class CrossEntropy2OpTestBase(OpTest): + def initParameters(self): + return [32, 64], 'float32', -100 + + def calc_output(self, logits, label, ignore_index): + ret = np.zeros(shape=label.shape, dtype=logits.dtype) + for idx in six.moves.range(label.shape[0]): + if label[idx] == ignore_index: + continue + ret[idx] = -np.log(logits[idx][label[idx]]) + return ret + + def setUp(self): + self.shape, self.dtype, self.ignore_index = self.initParameters() + self.op_type = 'cross_entropy2' + feature_size = int(self.shape[-1]) + batch_size = int(np.prod(self.shape) / feature_size) + logits = (np.random.random(size=self.shape) + 1).astype(self.dtype) + label = np.random.random_integers( + low=0, high=feature_size - 1, + size=self.shape[0:-1] + [1]).astype('int64') + outputs = self.calc_output( + np.reshape(logits, [batch_size, feature_size]), + np.reshape(label, [batch_size, 1]), self.ignore_index) + self.inputs = {'X': logits, 'Label': label} + self.outputs = { + 'Y': np.reshape(outputs, label.shape), + 'XShape': np.zeros( + shape=logits.shape, dtype=logits.dtype) + } + self.attrs = {'ignore_index': self.ignore_index} + + def test_check_output(self): + self.check_output(no_check_set=['XShape']) + + def test_check_grad(self): + self.check_grad( + inputs_to_check=['X'], + output_names=['Y'], + no_grad_set=['XShape', 'Label']) + + +class CrossEntropy2OpTest2(CrossEntropy2OpTestBase): + def initParameters(self): + return [32, 64], 'float64', 3 + + +class CrossEntropy2OpTest3(CrossEntropy2OpTestBase): + def initParameters(self): + return [4, 8, 16, 32], 'float32', -100 + + +class CrossEntropy2OpTest4(CrossEntropy2OpTestBase): + def initParameters(self): + return [4, 8, 16, 32], 'float32', 3 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 12132477d2..f81d4fda50 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -524,8 +524,8 @@ class TestLocalLookupTable(TestDistLookupTableBase): ops = [ 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add', - 'cross_entropy', 'mean', 'fill_constant', 'mean_grad', - 'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad', + 'cross_entropy2', 'mean', 'fill_constant', 'mean_grad', + 'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', 'split_selected_rows', 'send', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', @@ -564,8 +564,8 @@ class TestDistLookupTable(TestDistLookupTableBase): ops = [ 'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul', - 'elementwise_add', 'cross_entropy', 'mean', 'fill_constant', - 'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send', + 'elementwise_add', 'cross_entropy2', 'mean', 'fill_constant', + 'mean_grad', 'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', 'split_selected_rows', 'send', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', @@ -612,8 +612,8 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase): ops = [ 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add', - 'cross_entropy', 'mean', 'fill_constant', 'mean_grad', - 'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad', + 'cross_entropy2', 'mean', 'fill_constant', 'mean_grad', + 'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', 'split_selected_rows', 'send', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', @@ -652,8 +652,8 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase): ops = [ 'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul', - 'elementwise_add', 'cross_entropy', 'mean', 'fill_constant', - 'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send', + 'elementwise_add', 'cross_entropy2', 'mean', 'fill_constant', + 'mean_grad', 'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', 'split_selected_rows', 'send', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', @@ -841,8 +841,8 @@ class TestRemoteLookupTable(TestDistLookupTableBase): ops = [ 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add', - 'cross_entropy', 'mean', 'fill_constant', 'mean_grad', - 'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad', + 'cross_entropy2', 'mean', 'fill_constant', 'mean_grad', + 'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', 'split_selected_rows', 'send', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', From 24fbe6d6109b5db9bfa4d4d2445d0b4b89746091 Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Mon, 11 Mar 2019 04:05:25 +0000 Subject: [PATCH 040/230] test=develop, replace sce --- python/paddle/fluid/layers/nn.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9d1d5fe093..d0bff52e43 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -10704,8 +10704,9 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002): similarity_matrix = matmul( anchor, positive, transpose_x=False, transpose_y=True) - softmax_value = softmax(similarity_matrix) - cross_entropy = -1 * reduce_sum(labels * log(softmax_value), 0) + softmax_ce = softmax_with_cross_entropy( + logits=similarity_matrix, label=labels, soft_label=True) + cross_entropy = reduce_sum(labels * softmax_ce, 0) celoss = reduce_mean(cross_entropy) return l2loss + celoss From aa2335c218a45804b35b760896d894e4f4f46842 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 11 Mar 2019 04:14:21 +0000 Subject: [PATCH 041/230] add py_reader doc --- python/paddle/fluid/layers/io.py | 41 +++++++++++++++++++------------- python/paddle/fluid/reader.py | 32 ++++++++++++++----------- 2 files changed, 43 insertions(+), 30 deletions(-) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index a9b391fd53..da3ffc9a60 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -563,22 +563,26 @@ def _py_reader(capacity, def start_provide_thread(func): def __provider_thread__(): - for tensors in func(): - array = core.LoDTensorArray() - for item in tensors: - if not isinstance(item, core.LoDTensor): - tmp = core.LoDTensor() - tmp.set(item, core.CPUPlace()) - item = tmp - - array.append(item) - - if reader.exited: - break - feed_queue.push(array) - if reader.exited: - break - feed_queue.close() + try: + for tensors in func(): + array = core.LoDTensorArray() + for item in tensors: + if not isinstance(item, core.LoDTensor): + tmp = core.LoDTensor() + tmp.set(item, core.CPUPlace()) + item = tmp + + array.append(item) + + if reader.exited: + break + feed_queue.push(array) + if reader.exited: + break + feed_queue.close() + except Exception as ex: + feed_queue.close() + raise ex reader.thread = threading.Thread(target=__provider_thread__) reader.thread.daemon = True @@ -692,6 +696,11 @@ def py_reader(capacity, >>> exe.run(fetch_list=[loss.name]) >>> except fluid.core.EOFException: >>> reader.reset() + >>> + >>> ... + >>> + >>> fluid.io.save_inference_model(dirname='./model', feeded_var_names=[img, label], + >>> target_vars=[loss], executor=fluid.Executor(fluid.CUDAPlace(0))) 2. When training and testing are both performed, two different :code:`py_reader` should be created with different names, e.g.: diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 49ea1b83b5..ef212014b5 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -250,20 +250,24 @@ class PyReader(object): def _start(self): def __thread_main__(): - for tensors in self._tensor_reader(): - array = core.LoDTensorArray() - for item in tensors: - if not isinstance(item, core.LoDTensor): - tmp = core.LoDTensor() - tmp.set(item, core.CPUPlace()) - item = tmp - - array.append(item) - - if not self._queue.push(array): - break - - self._queue.close() + try: + for tensors in self._tensor_reader(): + array = core.LoDTensorArray() + for item in tensors: + if not isinstance(item, core.LoDTensor): + tmp = core.LoDTensor() + tmp.set(item, core.CPUPlace()) + item = tmp + + array.append(item) + + if not self._queue.push(array): + break + + self._queue.close() + except Exception as ex: + self._queue.close() + raise ex self._thread = threading.Thread(target=__thread_main__) self._thread.daemon = True From fc12f38394426c98cf31e3c44f6cbecc6ed7e2f2 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 11 Mar 2019 05:15:37 +0000 Subject: [PATCH 042/230] add API.spec test=develop --- paddle/fluid/API.spec | 535 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 535 insertions(+) create mode 100644 paddle/fluid/API.spec diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec new file mode 100644 index 0000000000..fcdc98b0c4 --- /dev/null +++ b/paddle/fluid/API.spec @@ -0,0 +1,535 @@ +paddle.fluid.Program.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.Program.block (ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None), ('document', 'af5346376065ff4cf6832a8ac0ae0945')) +paddle.fluid.Program.clone (ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,)), ('document', 'ebb7765b2962bd2be041d19720e49d0f')) +paddle.fluid.Program.current_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '5e162d3bf8dd625703463d9e4be36adb')) +paddle.fluid.Program.global_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'cfb7e05a002b2e64650778cabde7301c')) +paddle.fluid.Program.list_vars (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '1c8647b14fe57c7824b1c9562394dd3c')) +paddle.fluid.Program.parse_from_string (ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None), ('document', 'b6a7ffb239a30bf2ce58cfaca8d8b8d5')) +paddle.fluid.Program.to_string (ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)), ('document', 'faec17e5a04af28e3776160e34504d15')) +paddle.fluid.default_startup_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '99e5d53d92d82797093332719c9e3ccd')) +paddle.fluid.default_main_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '5430f54ab4895f9f47db6bebbaf71659')) +paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b54f403e57825a1592aece03afe3afb6')) +paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ef753f5cec69fef9ae6ad8b867b33a2')) +paddle.fluid.cuda_places (ArgSpec(args=['device_ids'], varargs=None, keywords=None, defaults=(None,)), ('document', '7d9a51fc9cf3c5245b5227080a8064c3')) +paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', '4c0cd83f0b401fc2ff84c70974e5d210')) +paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd0c3ebd813c39958c92b78e3eef7e912')) +paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03')) +paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'aba8093edebf2d5c869b735b92811e45')) +paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'e148d3ab1ed8edf3e928212a375959c0')) +paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'b94d1f6bcc29c4fb58fc0058561250c2')) +paddle.fluid.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '292ab72977afbe58e6a3bde175452680')) +paddle.fluid.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '78f4949aedf317666a89ca74b3748ba8')) +paddle.fluid.DistributeTranspiler.get_startup_program (ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'd796fc0c8d51503b556fcf6dc15c4f0c')) +paddle.fluid.DistributeTranspiler.get_trainer_program (ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)), ('document', '736330e31a7a54abccc0c7fd9119d9ff')) +paddle.fluid.DistributeTranspiler.transpile (ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')), ('document', '06ce55338dfe96311ad1078235ab3bf4')) +paddle.fluid.memory_optimize (ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False)), ('document', 'eda17d0f1639bc6ca215cecf87f588a4')) +paddle.fluid.release_memory (ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ac4114d3df16264f1946deb3a8434a6f')) +paddle.fluid.DistributeTranspilerConfig.__init__ +paddle.fluid.ParallelExecutor.__init__ (ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.ParallelExecutor.run (ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '2cb4bd74481861345c70228a0f57620c')) +paddle.fluid.create_lod_tensor (ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None), ('document', '8e7bb21e83ff4604f5b379672e285b94')) +paddle.fluid.create_random_int_lodtensor (ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None), ('document', '368f638b99f1dfe59e9b02aa6f077752')) +paddle.fluid.DataFeedDesc.__init__ (ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.DataFeedDesc.desc (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '4294493e31c4bc9fc4bd48753044235f')) +paddle.fluid.DataFeedDesc.set_batch_size (ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '8d9f44601e0a99dd431f14fd9250cd21')) +paddle.fluid.DataFeedDesc.set_dense_slots (ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None), ('document', 'eb894b464bbcd1b4bc8038398954f766')) +paddle.fluid.DataFeedDesc.set_use_slots (ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None), ('document', '415c56600ce4e198c071cad01409a690')) +paddle.fluid.AsyncExecutor.__init__ (ArgSpec(args=['self', 'place', 'run_mode'], varargs=None, keywords=None, defaults=(None, '')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.AsyncExecutor.config_distributed_nodes (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '4810dbe1870452f16b3c60b6c5fd1459')) +paddle.fluid.AsyncExecutor.download_data (ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12)), ('document', '799a2066cc26819f1ed31f47c15ad083')) +paddle.fluid.AsyncExecutor.get_instance (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f8688f76a2db1243c7097a60c507b182')) +paddle.fluid.AsyncExecutor.init_model (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '504f39be2007404a17e5cabea1256c7d')) +paddle.fluid.AsyncExecutor.init_server (ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None), ('document', 'c403ab46c5d3ef25c0f7e94ae75dcb68')) +paddle.fluid.AsyncExecutor.init_worker (ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None), ('document', 'dcf08f4bf2f3282acf11391f5d39c536')) +paddle.fluid.AsyncExecutor.run (ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False)), ('document', '848fc53484e8326f6325feea87fe955c')) +paddle.fluid.AsyncExecutor.save_model (ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None), ('document', 'c8ac0dfcb3b187aba25d03af7fea56b2')) +paddle.fluid.AsyncExecutor.stop (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '5f23d043607bb5d55e466ec3f578e093')) +paddle.fluid.CompiledProgram.__init__ (ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', 'dbf542d1384741650a1238ddb05daa37')) +paddle.fluid.CompiledProgram.with_inference_optimize (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None), ('document', '9e5b009d850191a010e859189c127fd8')) +paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None +paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None +paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None +paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None +paddle.fluid.io.save_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b55d6193a1d4198d45b013fc5779e1f2')) +paddle.fluid.io.save_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '3a7a99abac3e1bf898871fe609354218')) +paddle.fluid.io.save_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '9141bb5f32caf7975eb3fd88c8a1b2da')) +paddle.fluid.io.load_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '0a5308f496632ab1ec3ba1f1377e6f95')) +paddle.fluid.io.load_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '41779819cef32f2246e83aebc5a002e2')) +paddle.fluid.io.load_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '28df5bfe26ca7a077f91156abb0fe6d2')) +paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)), ('document', '582d87b8df75a5a639a107db8ff86f9c')) +paddle.fluid.io.load_inference_model (ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '7a5255386075dac3c75b7058254fcdcb')) +paddle.fluid.io.PyReader.__init__ (ArgSpec(args=['self', 'feed_list', 'capacity', 'use_double_buffer', 'iterable'], varargs=None, keywords=None, defaults=(True, False)), ('document', '18211b287474b401bc460d3f73dbc1c7')) +paddle.fluid.io.PyReader.decorate_paddle_reader (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', 'faef298f73e91aedcfaf5d184f3109b7')) +paddle.fluid.io.PyReader.decorate_sample_generator (ArgSpec(args=['self', 'sample_generator', 'batch_size', 'drop_last', 'places'], varargs=None, keywords=None, defaults=(True, None)), ('document', 'd3fe49fc342e7778ed086e965f41bf12')) +paddle.fluid.io.PyReader.decorate_tensor_provider (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd10224fef1095247063b6976da793021')) +paddle.fluid.io.PyReader.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'ff1cc1e2beb8824d453656c72c28ddfb')) +paddle.fluid.io.PyReader.start (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'b7ea0a548991924e4cfe61a577b8e56d')) +paddle.fluid.initializer.ConstantInitializer.__init__ (ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.initializer.UniformInitializer.__init__ (ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.initializer.NormalInitializer.__init__ (ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.initializer.TruncatedNormalInitializer.__init__ (ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.initializer.XavierInitializer.__init__ (ArgSpec(args=['self', 'uniform', 'fan_in', 'fan_out', 'seed'], varargs=None, keywords=None, defaults=(True, None, None, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.initializer.BilinearInitializer.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'd389912dc079cbef432335a00017cec0')) +paddle.fluid.initializer.MSRAInitializer.__init__ (ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)), ('document', '53c757bed9345f2ad3361902531e7cf5')) +paddle.fluid.initializer.force_init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '6d0f3e22c90d9d500d36ff57daf056ee')) +paddle.fluid.initializer.init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'a6d7011ca3d8c0d454dac3a56eae0c29')) +paddle.fluid.initializer.NumpyArrayInitializer.__init__ (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '1929058262994f212620599c63aea6bd')) +paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', '89c2c55a0b0656b106064048e068e77a')) +paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', 'dfbb624f85015df29e994ca6999e8ff6')) +paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'b4b608b986eb9617aa0525e1be21d32d')) +paddle.fluid.layers.dynamic_gru (ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)), ('document', '4ec4845fd7d991bcac822f8b0dfc101f')) +paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)), ('document', 'e0e2439f7af069b57badca18a6ba60b8')) +paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,)), ('document', '7c49ef4bbf0adfd4b9a1d98e2e5f3fea')) +paddle.fluid.layers.crf_decoding (ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)), ('document', '7642373ab65d3fc3b96d16d10fef1538')) +paddle.fluid.layers.cos_sim (ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None), ('document', 'd740824aa7316b807c4b4a3c6c8c0bbe')) +paddle.fluid.layers.cross_entropy (ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)), ('document', '025b364dafb4b7975c801eb33e7831a1')) +paddle.fluid.layers.bpr_loss (ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '30add751a0f99347a6257634c03ff254')) +paddle.fluid.layers.square_error_cost (ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None), ('document', '44b6eef4a0f2bc15f7d9745782406736')) +paddle.fluid.layers.chunk_eval (ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ee152a7ba3036e7b9ede9184545179b4')) +paddle.fluid.layers.sequence_conv (ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)), ('document', 'b6543768e1afaa2ecb869709d6e9c7e2')) +paddle.fluid.layers.conv2d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '8ca6121acd6d23cd8806a93f493c2e17')) +paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '37042620f9bd3a2da6e5d3138b2f724b')) +paddle.fluid.layers.sequence_pool (ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,)), ('document', 'a194fb80614023f543df3949fbd0d0b8')) +paddle.fluid.layers.sequence_softmax (ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '19ef6f9cdd27feac8a1ae060f19c10b4')) +paddle.fluid.layers.softmax (ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'f19dd380864e61134ce3814e4be0de4b')) +paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', 'bbd84e855e660cd1084bb71a2fd0cdaa')) +paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '043de7333b79ee0ac55053c14ed81625')) +paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '859b887174d06f361658f69cb7c06d95')) +paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '120f4323a3d7ed9c0916f15a59f0e497')) +paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', 'c527b71b8a4c60dca8df8a745c2b598d')) +paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', 'e45e09e65a2658e07cad987222f0d9ab')) +paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b0b8d53821716cd50c42e09b593f3feb')) +paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '03993955ab1e6d3044c44e6f17fc85e9')) +paddle.fluid.layers.conv3d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', 'ec113c6a3686ac94f8fccd1a7953d445')) +paddle.fluid.layers.sequence_expand (ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '79c375214fa427faac504043d162dae9')) +paddle.fluid.layers.sequence_expand_as (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9d2611f84ab364c5da545e6a82f1770a')) +paddle.fluid.layers.sequence_pad (ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6a1adf3067b20f6e4bcb354d71c19184')) +paddle.fluid.layers.sequence_unpad (ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd12803c903c99aa36ec03aaac5f0cc5b')) +paddle.fluid.layers.lstm_unit (ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)), ('document', '027723966f3ef0d7bc598f22287a96cc')) +paddle.fluid.layers.reduce_sum (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'b69998ce3ff4980fb21da0df05565f1b')) +paddle.fluid.layers.reduce_mean (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'd4d80dd98a1a5839f41eeb3a0f85f370')) +paddle.fluid.layers.reduce_max (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '66a622db727551761ce4eb73eaa7f6a4')) +paddle.fluid.layers.reduce_min (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'd50ac552b5d131468ed466d08bb2d38c')) +paddle.fluid.layers.reduce_prod (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'fcd8301a0ce15f219c7a4bcd0c1e8eca')) +paddle.fluid.layers.sequence_first_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '2b290d3d77882bfe9bb8d331cac8cdd3')) +paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'c16a892f44f7fe71bfa5afc32d3f34ce')) +paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'fdcea0e8b5bc7d8d4b1b072c521014e6')) +paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', 'dc7042734c6d8b8ce97321f017f01d6f')) +paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '652625345c2acb900029c78cc75f8aa6')) +paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbf2adbd79683dc93db03454dfa18c2')) +paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)), ('document', '97f0262f97602644c83142789d784571')) +paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', '6e428384ce6a77207fa2c70d9f011990')) +paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', 'b4cbe1ac451005df6dad12e9ffdccca9')) +paddle.fluid.layers.topk (ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd3570c02f71bcd78e60b3f31dc8f5b32')) +paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False)), ('document', 'aaba49c038ba927f0a8e45c0c9a686ab')) +paddle.fluid.layers.sequence_reshape (ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None), ('document', 'a10ab9bf88d4a7e328882d411abb6fd1')) +paddle.fluid.layers.transpose (ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a1feac48b843d679db82312dc85885f4')) +paddle.fluid.layers.im2sequence (ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)), ('document', '3ce01160ede80b1c26f776f8fef9340f')) +paddle.fluid.layers.nce (ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)), ('document', 'fddad4896dee5193e1cdf70882c2a347')) +paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0)), ('document', '5db30b8a74e8c93687943a3e8d221da0')) +paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', '80641ee6810b1cdc3fd6e14fc89ecc9d')) +paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', 'b350b9a30a18e7efd7e1bb740eef6996')) +paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', '17485788fffe4e2d36dc58c2ac8d174e')) +paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '013795af319e2e86d3506741941078ee')) +paddle.fluid.layers.layer_norm (ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)), ('document', 'de6a906950bae9f3c245cb744d22b94e')) +paddle.fluid.layers.group_norm (ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)), ('document', '419c3a24a83cc89219a029cf4092788b')) +paddle.fluid.layers.spectral_norm (ArgSpec(args=['weight', 'dim', 'power_iters', 'eps', 'name'], varargs=None, keywords=None, defaults=(0, 1, 1e-12, None)), ('document', '3f536aafba30d793287b52d231baff1b')) +paddle.fluid.layers.softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, True, False)), ('document', 'bce1b75e3d95b75cacd1099655cbb3c3')) +paddle.fluid.layers.smooth_l1 (ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'c6b175d253c55baf4b9c0eca9b1dda88')) +paddle.fluid.layers.one_hot (ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None), ('document', '6148b6a555cbfb62fdcd030d8982c18c')) +paddle.fluid.layers.autoincreased_step_counter (ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)), ('document', '3f6c828594720c9b2da89c464be94478')) +paddle.fluid.layers.reshape (ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', '323c019f257e55ddea4a824a362de62f')) +paddle.fluid.layers.squeeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3229d06517f794e86ca3da14c38b1465')) +paddle.fluid.layers.unsqueeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbd62da391b1df984a1909d069a759b2')) +paddle.fluid.layers.lod_reset (ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'f122194c562bd674f6ecdccf33785f99')) +paddle.fluid.layers.lrn (ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None)), ('document', '0795e9940e42dcd62953514ff7e09f77')) +paddle.fluid.layers.pad (ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', '2f28153bdd2d5ea6f7bad5867bd03eeb')) +paddle.fluid.layers.pad_constant_like (ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', 'd2e1f45fef51b2c214e3f2aa8976c46c')) +paddle.fluid.layers.label_smooth (ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None)), ('document', '70c113658102a11cc5d8e3d45145737a')) +paddle.fluid.layers.roi_pool (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)), ('document', 'c317aa595deb31649083c8faa91cdb97')) +paddle.fluid.layers.roi_align (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)), ('document', '12c5bbb8b38c42e623fbc47611d766e1')) +paddle.fluid.layers.dice_loss (ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)), ('document', '1ba0508d573f65feecf3564dce22aa1d')) +paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)), ('document', '7a1966d7c3a48f1fc0881cdaf5d83b0b')) +paddle.fluid.layers.image_resize_short (ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)), ('document', '06211aefc50c5a3e940d7204d859cdf7')) +paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)), ('document', 'e4fb4ed511b2293b8f04f7e872afbfd7')) +paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True)), ('document', '735fa9758a6d7ff3b47d7b827f961c1d')) +paddle.fluid.layers.gather (ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None), ('document', '98f1c86716b9b7f4dda83f20e2adeee2')) +paddle.fluid.layers.scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '65f8e9d8ddfd0b412f940579c4faa342')) +paddle.fluid.layers.sequence_scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '15b522457dfef103f0c20ca9d397678b')) +paddle.fluid.layers.random_crop (ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c9ab9e460ef0a1823249935a30e82c66')) +paddle.fluid.layers.mean_iou (ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None), ('document', '35cbbdfa585d027bb490707c95a176b9')) +paddle.fluid.layers.relu (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '866ffa1cc93f29e23662b526a7596537')) +paddle.fluid.layers.selu (ArgSpec(args=['x', 'scale', 'alpha', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '9044c7fe667b76cb2d9264f2db11f417')) +paddle.fluid.layers.log (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '98247c59d1c9b40af6730001b2aea73d')) +paddle.fluid.layers.crop (ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '883104791204d3127e24234bb630b2e7')) +paddle.fluid.layers.rank_loss (ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c542e39ac6add24a6bef6e79bf5617e2')) +paddle.fluid.layers.margin_rank_loss (ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None)), ('document', '6d19dcc19917080b7ff3e03bde451bc8')) +paddle.fluid.layers.elu (ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', '463258ee9f8b60760eb1e26357cc9bfa')) +paddle.fluid.layers.relu6 (ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None)), ('document', '6f367339caf6c7124bc262fe1475df70')) +paddle.fluid.layers.pow (ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', 'a5117c1eb84aca2ac0b0abab337a4799')) +paddle.fluid.layers.stanh (ArgSpec(args=['x', 'scale_a', 'scale_b', 'name'], varargs=None, keywords=None, defaults=(0.6666666666666666, 1.7159, None)), ('document', '959936a477efc6c1447a9c8bf8ce94bb')) +paddle.fluid.layers.hard_sigmoid (ArgSpec(args=['x', 'slope', 'offset', 'name'], varargs=None, keywords=None, defaults=(0.2, 0.5, None)), ('document', 'c82059b6fea1aa730f9aac911807b756')) +paddle.fluid.layers.swish (ArgSpec(args=['x', 'beta', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', 'ef745e55a48763ee7b46b21a81dc7e84')) +paddle.fluid.layers.prelu (ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'f6acef7ff7d887e49ff499fbb1dad4a9')) +paddle.fluid.layers.brelu (ArgSpec(args=['x', 't_min', 't_max', 'name'], varargs=None, keywords=None, defaults=(0.0, 24.0, None)), ('document', '3db337c195e156e6ef2b8b4a57113600')) +paddle.fluid.layers.leaky_relu (ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(0.02, None)), ('document', 'f878486c82b576938151daad0de995a0')) +paddle.fluid.layers.soft_relu (ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(40.0, None)), ('document', '869adce548c342d6cc1bd88a948d83c9')) +paddle.fluid.layers.flatten (ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'cb295c13cb957db85cd9609269d7784d')) +paddle.fluid.layers.sequence_mask (ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None)), ('document', 'f0dd6eddd3bff015a3c05269d82fcbd8')) +paddle.fluid.layers.stack (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)), ('document', '367cfbb642839beacb5d117e2d2b4041')) +paddle.fluid.layers.pad2d (ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None)), ('document', '7f4d46320cc077ca2e8db600c35f4030')) +paddle.fluid.layers.unstack (ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None)), ('document', '98eb9d633116efcfc6f90c114bd44fd6')) +paddle.fluid.layers.sequence_enumerate (ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'f6028537085dc296103bbbd85fa7763d')) +paddle.fluid.layers.expand (ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '117d3607d1ffa0571835bbaebc7857ff')) +paddle.fluid.layers.sequence_concat (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3a1d155dd1bf6e72a0a3e3e1519591d1')) +paddle.fluid.layers.scale (ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None)), ('document', '30190413b2fa442e7466d6cf2ce5ea07')) +paddle.fluid.layers.elementwise_add (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '6bfbe72cbadc95ac7ab88c05ed5bf9f0')) +paddle.fluid.layers.elementwise_div (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', 'cc6e6cc1cb942a152dde3ef08d5f165c')) +paddle.fluid.layers.elementwise_sub (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', 'a12abdab09c3e57af5a6e1e9f138684a')) +paddle.fluid.layers.elementwise_mul (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '422c77dbfcff355a57b5fdd4ec876daa')) +paddle.fluid.layers.elementwise_max (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', 'f0bb0b2c454541cfafa761021a5cc776')) +paddle.fluid.layers.elementwise_min (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '8a9cdefefbccbf9f6b0991c0946a21e9')) +paddle.fluid.layers.elementwise_pow (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '1aea4e197c552a284f83888a3c67a32e')) +paddle.fluid.layers.uniform_random_batch_size_like (ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0)), ('document', '129e0a3257f1d532a948eedf9d5bf671')) +paddle.fluid.layers.gaussian_random (ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')), ('document', '389dafe36e099841b6a7fb18d11f1b4c')) +paddle.fluid.layers.sampling_id (ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')), ('document', '840fdac643d1341c1cae218d4511dbb9')) +paddle.fluid.layers.gaussian_random_batch_size_like (ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32')), ('document', '840026b4766613c5705e06563cd103b6')) +paddle.fluid.layers.sum (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'a418e3ccb5e2ac21bd60f5cc221d5860')) +paddle.fluid.layers.slice (ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None), ('document', '01dbb91e7c74cb11336cd531013de51a')) +paddle.fluid.layers.shape (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '17db0f814eb7bb5a3fac1ca6e60e16d8')) +paddle.fluid.layers.logical_and (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'cdcf20c494c92060d10feb9374532f42')) +paddle.fluid.layers.logical_or (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '0eae3f726a4afe590757552fa3ced012')) +paddle.fluid.layers.logical_xor (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b0daaa3fa4a0aa62f9b58c43d959eb25')) +paddle.fluid.layers.logical_not (ArgSpec(args=['x', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'cd1c8cf31e040427d4e05711044caeb6')) +paddle.fluid.layers.clip (ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b020b7aab59719be98a4ae229a76deba')) +paddle.fluid.layers.clip_by_norm (ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a1ea0bc5a926f427458c4254ca022749')) +paddle.fluid.layers.mean (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd638d915195ce86a8d7963b81110d4c8')) +paddle.fluid.layers.mul (ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)), ('document', 'ccd37fa6b53f074adbfb732d738c4c2d')) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits (ArgSpec(args=['x', 'label', 'ignore_index', 'name', 'normalize'], varargs=None, keywords=None, defaults=(-100, None, False)), ('document', '180c284317ea45ef89a460d8d79c0b72')) +paddle.fluid.layers.maxout (ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '891870d069a6aea746d34cc53b61690c')) +paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5f207ae10589ebe38a63575ef6ff8e1e')) +paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '51def402b8910e163cbace9d0c0526ed')) +paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '77a6d80aa5551ca70324fc975c44507f')) +paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None)), ('document', '2f46f1ff39a13ab00857e7b9f44b2fa7')) +paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '70e3b5182a18b40b47ecabd7c8490a35')) +paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', '9bb77f8dc002dd2ce75d4769eaaf5007')) +paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd256cba1c41a5ed92ce3f31e24a2ca6d')) +paddle.fluid.layers.log_loss (ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)), ('document', '4b5a2341023afe63157a066c14254f98')) +paddle.fluid.layers.add_position_encoding (ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '4b9c2e8af5817937d831820874b5aa77')) +paddle.fluid.layers.bilinear_tensor_product (ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'aa7540a0fa73ff69a02e11b4091aab75')) +paddle.fluid.layers.merge_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'dc63315b84f591ac79ecca0c3632027a')) +paddle.fluid.layers.get_tensor_from_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7ffc849e71f31dfe29030ff94e662de6')) +paddle.fluid.layers.lstm (ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)), ('document', 'd5e6c494ac35100e2ed4d4bd9a1ed932')) +paddle.fluid.layers.shuffle_channel (ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2fa6782d43d02ae64482d21235a82949')) +paddle.fluid.layers.py_func (ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None)), ('document', '8404e472ac12b4a30a505d3d3a3e5fdb')) +paddle.fluid.layers.psroi_pool (ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1546136806fef5c08f6918544bd9151d')) +paddle.fluid.layers.teacher_student_sigmoid_loss (ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)), ('document', '2f6ff96864054a31aa4bb659c6722c99')) +paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None), ('document', '431a4301c35032166ec029f7432c80a7')) +paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607')) +paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329')) +paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139')) +paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc')) +paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', 'b0a1c2fc51c27a106da28f3308c41f5e')) +paddle.fluid.layers.shuffle (ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None), ('document', 'f967a73426db26f970bc70bfb03cffca')) +paddle.fluid.layers.batch (ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', 'f563d376d35e1a4c4db100fd11b381a0')) +paddle.fluid.layers.double_buffer (ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '07e5b796674796eb1ef3fee9c10d24e3')) +paddle.fluid.layers.random_data_generator (ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,)), ('document', '9b7f0f86ec24bbc97643cadcb6499cff')) +paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '4357643685cfd65454ba5a15f0151709')) +paddle.fluid.layers.create_py_reader_by_data (ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)), ('document', '350f74d93fab9adb2ac4950f1c26416b')) +paddle.fluid.layers.Preprocessor.__init__ (ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.Preprocessor.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.Preprocessor.inputs (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.Preprocessor.outputs (ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.load (ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,)), ('document', '9d1a4bc97bbce9fa1d4f7a4200a771ff')) +paddle.fluid.layers.create_tensor (ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False)), ('document', 'c0c3d0194f83fff8ea99ce0820657dae')) +paddle.fluid.layers.create_parameter (ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', 'd62b866c899bc1fedb5385f95b88e1f8')) +paddle.fluid.layers.create_global_var (ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)), ('document', 'ab914fac893607e29ac6e52bbdbea1a4')) +paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '992eb42590fc1c380841a6db72ce78b3')) +paddle.fluid.layers.tensor_array_to_tensor (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'b12717d3d4567e6119589f7f655b0cbb')) +paddle.fluid.layers.concat (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b19b79be4f05e85d1d6cec642c9fb535')) +paddle.fluid.layers.sums (ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', '42912092418620b4be07f36af31e7816')) +paddle.fluid.layers.assign (ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b690184f3537df5501e4d9d8f31152a5')) +paddle.fluid.layers.fill_constant_batch_size_like (ArgSpec(args=['input', 'shape', 'dtype', 'value', 'input_dim_idx', 'output_dim_idx'], varargs=None, keywords=None, defaults=(0, 0)), ('document', 'd4059a2f5763036b07018d76429f9acb')) +paddle.fluid.layers.fill_constant (ArgSpec(args=['shape', 'dtype', 'value', 'force_cpu', 'out'], varargs=None, keywords=None, defaults=(False, None)), ('document', '1d8b14729639fa38509c79b9784740fa')) +paddle.fluid.layers.argmin (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)), ('document', '2778a1d34be49263a51211885599ea37')) +paddle.fluid.layers.argmax (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)), ('document', '04114996cfb98994ba222804a1a6109f')) +paddle.fluid.layers.argsort (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '68ec45c6fb6b93e47de9c9a0945fb98e')) +paddle.fluid.layers.ones (ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)), ('document', 'b402489c62e668df42e7daceb63c142b')) +paddle.fluid.layers.zeros (ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)), ('document', 'c155e2efc56ffa5ed4658cca0272e491')) +paddle.fluid.layers.reverse (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None), ('document', '8ee7cb6ca639e7460e825f953b65d94d')) +paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '8f8c0306117ea441f20dcbbdba1f0ecc')) +paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '2e53e83127dbfd86e7098bdfe9a549e8')) +paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '0a437011c3906079fd8947ed3e52d292')) +paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.Switch.case (ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None), ('document', 'f7c7160014c1b46cfeda9dd5808d1789')) +paddle.fluid.layers.Switch.default (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '50853ae884df03d9c36703bb46d9ef07')) +paddle.fluid.layers.increment (ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True)), ('document', '73bb96ec4783ec1a11e760e8851b0e77')) +paddle.fluid.layers.array_write (ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)), ('document', '40b6d15f4c86b2b09df340d7778ad713')) +paddle.fluid.layers.create_array (ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None), ('document', '2d4f20087080ba5105b55205ad5c5b6a')) +paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords=None, defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f')) +paddle.fluid.layers.equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '80c29b1dc64718f0116de90d1ac88a77')) +paddle.fluid.layers.array_read (ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None), ('document', '0275133f1dde2aed528b4d3230edf823')) +paddle.fluid.layers.array_length (ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None), ('document', 'ffb8b9578ec66db565b223d313aa82a2')) +paddle.fluid.layers.IfElse.__init__ (ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.IfElse.false_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.IfElse.input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.IfElse.output (ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.IfElse.true_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.DynamicRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.DynamicRNN.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6d3e0a5d9aa519a9773a36e1620ea9b7')) +paddle.fluid.layers.DynamicRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32')), ('document', 'b9174d4e91505b0c8ecc193eb51e248d')) +paddle.fluid.layers.DynamicRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'b439a176a3328de8a75bdc5c08eece4a')) +paddle.fluid.layers.DynamicRNN.static_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', 'f29ad2478b6b2ad4f413d2936a331ea0')) +paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '169d694d2224f62b4f3afdc3dbc19e95')) +paddle.fluid.layers.DynamicRNN.update_memory (ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None), ('document', '5d83987da13b98363d6a807a52d8024f')) +paddle.fluid.layers.StaticRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', 'c24e368e23afac1ed91a78a639d7a9c7')) +paddle.fluid.layers.StaticRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.StaticRNN.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '3545f529ef04e8f6ecb76b47fa3df01a')) +paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', '5fef91b0e21c93610785f2b1f7161732')) +paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519')) +paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '29a25ba78de79152076cacfc5443137d')) +paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3')) +paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b')) +paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9d586a0b5bd05f67ee78048f9d503b6')) +paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7')) +paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13')) +paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '072a8541e0f632366bba10f67cb0db27')) +paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '64650ac42cf82e9920cb0b172b1d29fd')) +paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad')) +paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973')) +paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '485f2686bcc2fe37a4bd893769c8a3e2')) +paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '01f1766aa76eff1df30147505b59f7c4')) +paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b47f5da13913d3e56bdb1e612a73f3f2')) +paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cc6ac2f14f03c52aaa83a59bf83b8d26')) +paddle.fluid.layers.square (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '48dfb45d773dbc30126c3a7f777de5ee')) +paddle.fluid.layers.softplus (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '459c5781e9d1dd88283b7c5769d7872a')) +paddle.fluid.layers.softsign (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '80846bcd4bd457207457a6d5411f4148')) +paddle.fluid.layers.uniform_random (ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', -1.0, 1.0, 0)), ('document', '308b619af849caa82bbc31e897f5e641')) +paddle.fluid.layers.hard_shrink (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c142f5884f3255e0d6075c286bbd531e')) +paddle.fluid.layers.cumsum (ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '944d7c03057f5fc88bc78acd4d82f926')) +paddle.fluid.layers.thresholded_relu (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '90566ea449ea4c681435546e2f70610a')) +paddle.fluid.layers.prior_box (ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)), ('document', '14cac0ee643fa6e026ad82aeeee75bd8')) +paddle.fluid.layers.density_prior_box (ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None)), ('document', 'a0d762bb08de9ce93bc780aa57cd5cd9')) +paddle.fluid.layers.multi_box_head (ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)), ('document', 'a6ab47a2fe681e52fabb7057ddf0efdd')) +paddle.fluid.layers.bipartite_match (ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '3ddb9b966f193900193a95a3df77c3c1')) +paddle.fluid.layers.target_assign (ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'c0b334f917828f95056f6ebe10907b1c')) +paddle.fluid.layers.detection_output (ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)), ('document', 'c33093a82a46e3091e789e5572588db1')) +paddle.fluid.layers.ssd_loss (ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)), ('document', '6d5028fd09d01ab82d296adc0ea95aee')) +paddle.fluid.layers.detection_map (ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral')), ('document', '1467d91b50c22cd52103b4aa1ee9d0a1')) +paddle.fluid.layers.rpn_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)), ('document', '1dddef3eb4b3cbd4df8e03ac480dbf97')) +paddle.fluid.layers.anchor_generator (ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)), ('document', '23337cc57bbf5be73884b6bd0f849603')) +paddle.fluid.layers.roi_perspective_transform (ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)), ('document', '5761f9ed83654314416e24372b33bb84')) +paddle.fluid.layers.generate_proposal_labels (ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True)), ('document', '87863717edeb7fe87a1268976cbc015d')) +paddle.fluid.layers.generate_proposals (ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)), ('document', '57ab49f3f324f310b7eed322e7c1057a')) +paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None), ('document', 'f73706a65468e9ca3e0bee4a31521b0a')) +paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1')) +paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e')) +paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b')) +paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691')) +paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e')) +paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0')) +paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d')) +paddle.fluid.layers.box_decoder_and_assign (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'box_score', 'box_clip', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '005a5ae47d6c8fff721931d69d072b9f')) +paddle.fluid.layers.accuracy (ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)), ('document', '9808534c12c5e739a10f73ebb0b4eafd')) +paddle.fluid.layers.auc (ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)), ('document', 'e0e95334fce92d16c2d9db6e7caffc47')) +paddle.fluid.layers.exponential_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', '98a5050bee8522fcea81aa795adaba51')) +paddle.fluid.layers.natural_exp_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', '676a7bc2a218691db50bca233903d21e')) +paddle.fluid.layers.inverse_time_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', 'd07e767d59c4a5e6c930f3e6756d3f82')) +paddle.fluid.layers.polynomial_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False)), ('document', '882634f420f626642f0874481263da40')) +paddle.fluid.layers.piecewise_decay (ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None), ('document', 'c717d9d1d78a53c809d01b8bc56f3cae')) +paddle.fluid.layers.noam_decay (ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None), ('document', 'd9a95746353fd574be36dc28d8726c28')) +paddle.fluid.layers.append_LARS (ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None), ('document', 'd24fa1e7d62ac8a534fc6a86002f84f8')) +paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', '9588c64c26ffaef3c466e404a6af9d9b')) +paddle.fluid.contrib.InitState.__init__ (ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.StateCell.__init__ (ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.StateCell.compute_state (ArgSpec(args=['self', 'inputs'], varargs=None, keywords=None, defaults=None), ('document', '92973b3f222081a1d17069c683cf4a99')) +paddle.fluid.contrib.StateCell.get_input (ArgSpec(args=['self', 'input_name'], varargs=None, keywords=None, defaults=None), ('document', '6f24a007cfa184e32f01a960703bfd70')) +paddle.fluid.contrib.StateCell.get_state (ArgSpec(args=['self', 'state_name'], varargs=None, keywords=None, defaults=None), ('document', '630a4945cfe659ea4f307598fbbce5d2')) +paddle.fluid.contrib.StateCell.out_state (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '7ad681dff0393ddf13a724194e720f28')) +paddle.fluid.contrib.StateCell.set_state (ArgSpec(args=['self', 'state_name', 'state_value'], varargs=None, keywords=None, defaults=None), ('document', 'd4e0e08cd5d9d9a571cbc52d114f5ae9')) +paddle.fluid.contrib.StateCell.state_updater (ArgSpec(args=['self', 'updater'], varargs=None, keywords=None, defaults=None), ('document', 'd5afe1b7665d94fb023b15cf913ca510')) +paddle.fluid.contrib.StateCell.update_states (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'fe0b0f1338723516a35a30247899c81b')) +paddle.fluid.contrib.TrainingDecoder.__init__ (ArgSpec(args=['self', 'state_cell', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.TrainingDecoder.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '98d88fa1c989748410a12517c6a585bf')) +paddle.fluid.contrib.TrainingDecoder.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'f0a457dee586559036202087ce2eff69')) +paddle.fluid.contrib.TrainingDecoder.static_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', 'a024c72664fe815068423ba630b7658a')) +paddle.fluid.contrib.TrainingDecoder.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '4659db7a888a2495e71c1838a0483909')) +paddle.fluid.contrib.BeamSearchDecoder.__init__ (ArgSpec(args=['self', 'state_cell', 'init_ids', 'init_scores', 'target_dict_dim', 'word_dim', 'input_var_dict', 'topk_size', 'sparse_emb', 'max_len', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=({}, 50, True, 100, 1, 1, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.BeamSearchDecoder.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '98d88fa1c989748410a12517c6a585bf')) +paddle.fluid.contrib.BeamSearchDecoder.decode (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '1e47c60f080c1343ebb6ceaef89656b2')) +paddle.fluid.contrib.BeamSearchDecoder.early_stop (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3a84a7454ed6707f79b9e954d92a7575')) +paddle.fluid.contrib.BeamSearchDecoder.read_array (ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False)), ('document', 'aa89eb8fd5e4cabaf5cc1bcae14665a4')) +paddle.fluid.contrib.BeamSearchDecoder.update_array (ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None), ('document', '5754e9b3212b7c09497151516a0de5a7')) +paddle.fluid.contrib.memory_usage (ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '8fcb2f93bb743693baa8d4860a5ccc47')) +paddle.fluid.contrib.op_freq_statistic (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4d43687113c4bf5b29d15aee2f4e4afa')) +paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000)), ('document', '14b39f1fcd5667ff556b1aad94357d1d')) +paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)), ('document', '909675a1ab055c69b436a7893fcae4fd')) +paddle.fluid.contrib.QuantizeTranspiler.training_transpile (ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6dd9909f10b283ba2892a99058a72884')) +paddle.fluid.contrib.Calibrator.__init__ (ArgSpec(args=['self'], varargs='args', keywords='kwargs', defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.Calibrator.sample_data (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3b8c85ca1e2cf753cc8c90a6c6992958')) +paddle.fluid.contrib.Calibrator.save_int8_model (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.reader.ctr_reader.ctr_reader (ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2ebf3de2a6ef1af2c3b88d2db7591ab')) +paddle.fluid.contrib.build_compressor (ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.CompressPass.__init__ (ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.CompressPass.add_strategy (ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None), ('document', '3bf6010b6f47d3c86df0ec8957be95e0')) +paddle.fluid.contrib.CompressPass.apply (ArgSpec(args=['self', 'graph'], varargs=None, keywords=None, defaults=None), ('document', 'a92bf85d4b59bd4f2ac1706d7c4899a6')) +paddle.fluid.contrib.ImitationGraph.__init__ (ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.ImitationGraph.all_parameters (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.SensitivePruneStrategy.__init__ (ArgSpec(args=['self', 'pruner', 'start_epoch', 'end_epoch', 'delta_rate', 'acc_loss_threshold', 'sensitivities'], varargs=None, keywords=None, defaults=(None, 0, 10, 0.2, 0.2, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.SensitivePruneStrategy.on_batch_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.SensitivePruneStrategy.on_batch_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.SensitivePruneStrategy.on_compress_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.SensitivePruneStrategy.on_compress_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.MagnitudePruner.__init__ (ArgSpec(args=['self', 'threshold'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.MagnitudePruner.prune (ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.RatioPruner.__init__ (ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e7a81a325b296a9ca502ee5adb4fc85d')) +paddle.fluid.contrib.RatioPruner.prune (ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,)), ('document', '358cbf2978c91028fb96a195a9884645')) +paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '11fbf7e8dd2289805de291b453a33ee7')) +paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '5b5577bb3d24070da819674255d16196')) +paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4efbd93876832d4d35497cdbc7a1e6d8')) +paddle.fluid.contrib.HDFSClient.__init__ (ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.HDFSClient.delete (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', 'c3721aa2d4d9ef5a857dd47b2681c03e')) +paddle.fluid.contrib.HDFSClient.download (ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'unzip'], varargs=None, keywords=None, defaults=(False, False)), ('document', 'ca55bde92184d3fd0f9f5c963b25e634')) +paddle.fluid.contrib.HDFSClient.is_dir (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,)), ('document', '45bde1bae02605a205c8245b58b9156d')) +paddle.fluid.contrib.HDFSClient.is_exist (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,)), ('document', 'be9c94bccff7ba0c1d95883ac62b5864')) +paddle.fluid.contrib.HDFSClient.ls (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', '808acac504870c7e46594b95674f8a86')) +paddle.fluid.contrib.HDFSClient.lsr (ArgSpec(args=['self', 'hdfs_path', 'only_file', 'sort'], varargs=None, keywords=None, defaults=(True, True)), ('document', 'fae835aa3354eb6a0434c0f9ba3c2747')) +paddle.fluid.contrib.HDFSClient.make_local_dirs (ArgSpec(args=['local_path'], varargs=None, keywords=None, defaults=None), ('document', 'e76b89c8e7f019b5da576c0026fcf689')) +paddle.fluid.contrib.HDFSClient.makedirs (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', '44d9972aae390aedf40aaea731a37e4b')) +paddle.fluid.contrib.HDFSClient.rename (ArgSpec(args=['self', 'hdfs_src_path', 'hdfs_dst_path', 'overwrite'], varargs=None, keywords=None, defaults=(False,)), ('document', '0eb133644d9a9f4da45bb39261ff0955')) +paddle.fluid.contrib.HDFSClient.upload (ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'retry_times'], varargs=None, keywords=None, defaults=(False, 5)), ('document', '7d053b4bfd6dcfdd2c9dda0e0dbd9665')) +paddle.fluid.contrib.multi_download (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,)), ('document', '100927be598ed8f9eaa1f3ef1b23568a')) +paddle.fluid.contrib.multi_upload (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True)), ('document', '183f34c83d30dbe16e09e8716c41958a')) +paddle.fluid.transpiler.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '292ab72977afbe58e6a3bde175452680')) +paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '78f4949aedf317666a89ca74b3748ba8')) +paddle.fluid.transpiler.DistributeTranspiler.get_startup_program (ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'd796fc0c8d51503b556fcf6dc15c4f0c')) +paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program (ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)), ('document', '736330e31a7a54abccc0c7fd9119d9ff')) +paddle.fluid.transpiler.DistributeTranspiler.transpile (ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')), ('document', '06ce55338dfe96311ad1078235ab3bf4')) +paddle.fluid.transpiler.memory_optimize (ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False)), ('document', 'eda17d0f1639bc6ca215cecf87f588a4')) +paddle.fluid.transpiler.release_memory (ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ac4114d3df16264f1946deb3a8434a6f')) +paddle.fluid.transpiler.HashName.__init__ (ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.transpiler.HashName.dispatch (ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.transpiler.HashName.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.transpiler.RoundRobin.__init__ (ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.transpiler.RoundRobin.dispatch (ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.transpiler.RoundRobin.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ +paddle.fluid.nets.simple_img_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)), ('document', 'e0f67f35abf27f666f81003113b90244')) +paddle.fluid.nets.sequence_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)), ('document', '48c434dd7bb827f69d90e5135d77470f')) +paddle.fluid.nets.glu (ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)), ('document', '08c1c57e1db6b20bf87b264cb7cf3ca8')) +paddle.fluid.nets.scaled_dot_product_attention (ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)), ('document', '921714c9bfb351b41403418265393203')) +paddle.fluid.nets.img_conv_group (ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)), ('document', '3802be78fbfb206dae64a2d9f8480970')) +paddle.fluid.optimizer.SGDOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.SGDOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.SGDOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.SGDOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.MomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.MomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.MomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.MomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.AdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name', 'initial_accumulator_value'], varargs=None, keywords=None, defaults=(1e-06, None, None, 0.0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.AdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.AdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.AdamOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.AdamOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.AdamOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdamOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.AdamaxOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.AdamaxOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdamaxOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.FtrlOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.FtrlOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.FtrlOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.FtrlOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.RMSPropOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.RMSPropOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.RMSPropOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.AdadeltaOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.AdadeltaOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdadeltaOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.ModelAverage.__init__ (ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.ModelAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '46234a5470590feb336346f70a3db715')) +paddle.fluid.optimizer.ModelAverage.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.ModelAverage.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.ModelAverage.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.ModelAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '18db9c70be9c4dd466f9844457b21bfe')) +paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.LarsMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '1a79bd7d10ae54ca763ec81bca36ba24')) +paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.regularizer.L2DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None 2. __init__(self: paddle.fluid.core.LoDTensor) -> None +paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> bool +paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core.LoDTensor) -> List[List[int]] +paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> List[List[int]] +paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None 2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None 3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None 4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None 5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None 6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None 7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None 8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CPUPlace) -> None 9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None 10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None 11. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None 12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None 13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None 14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None 15. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None 16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPlace) -> None 17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None 18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None 19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None 20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None 21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None 22. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None 23. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None 24. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPinnedPlace) -> None +paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, lod: List[List[int]]) -> None +paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, recursive_sequence_lengths: List[List[int]]) -> None +paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core.Tensor) -> List[int] +paddle.fluid.LoDTensorArray.__init__ __init__(self: paddle.fluid.core.LoDTensorArray) -> None +paddle.fluid.LoDTensorArray.append append(self: paddle.fluid.core.LoDTensorArray, tensor: paddle.fluid.core.LoDTensor) -> None +paddle.fluid.CPUPlace.__init__ __init__(self: paddle.fluid.core.CPUPlace) -> None +paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core.CUDAPlace, arg0: int) -> None +paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinnedPlace) -> None +paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.WeightNormParamAttr.__init__ (ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.DataFeeder.__init__ (ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', '0eed2f198dc73c08a41b61edbc755753')) +paddle.fluid.DataFeeder.feed (ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None), ('document', '459e316301279dfd82001b46f0b8ffca')) +paddle.fluid.DataFeeder.feed_parallel (ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)), ('document', '543863d1f9d4853758adb613b8659e85')) +paddle.fluid.clip.ErrorClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.clip.GradientClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.clip.GradientClipByNorm.__init__ (ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.clip.GradientClipByGlobalNorm.__init__ (ArgSpec(args=['self', 'clip_norm', 'group_name'], varargs=None, keywords=None, defaults=('default_group',)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.profiler.cuda_profiler (ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)), ('document', '2e2fb1cfc469a67f19fb578a2ed6be79')) +paddle.fluid.profiler.reset_profiler (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '397ce757fabbe5c622e0c3458c41fcd0')) +paddle.fluid.profiler.profiler (ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'bd3a07eeb68e384f4d2d416cb2e28d86')) +paddle.fluid.profiler.start_profiler (ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None), ('document', '88da8fb6dbebaee2f7520188a09574f9')) +paddle.fluid.profiler.stop_profiler (ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'a7500e39dd033f1e64f562e909333a8a')) +paddle.fluid.unique_name.generate (ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.unique_name.switch (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.recordio_writer.convert_reader_to_recordio_file (ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', '65c7523e86f0c50bb729b01667f36310')) +paddle.fluid.recordio_writer.convert_reader_to_recordio_files (ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', 'bc643f0f5f1b9db57ff0d8a57d379bd7')) +paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope +paddle.reader.cache (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '83b94750674c6a04b5f96599d4bf3105')) +paddle.reader.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d')) +paddle.reader.buffered (ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None), ('document', '0d6186f109feceb99f60ec50a0a624cb')) +paddle.reader.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None), ('document', '884291104e1c3f37f33aae44b7deeb0d')) +paddle.reader.chain (ArgSpec(args=[], varargs='readers', keywords=None, defaults=None), ('document', 'd22c34e379a53901ae67a6bca7f4def4')) +paddle.reader.shuffle (ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None), ('document', 'e42ea6fee23ce26b23cb142cd1d6522d')) +paddle.reader.ComposeNotAligned.__init__ +paddle.reader.firstn (ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None), ('document', 'c5bb8f7dd4f917f1569a368aab5b8aad')) +paddle.reader.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '283bc0b8a0e26ae186b8b9bee4aec560')) +paddle.reader.PipeReader.__init__ (ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.reader.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '5f80a7ed70052f01665e4c74acccfa69')) +paddle.reader.multiprocess_reader (ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)), ('document', '7d8b3a96e592107c893d5d51ce968ba0')) +paddle.reader.Fake.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.reader.creator.np_array (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '28d457fbc9a71efa4ac91a3be179cada')) +paddle.reader.creator.text_file (ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None), ('document', '44fe286ab6175a5464d3a961a68c266a')) +paddle.reader.creator.recordio (ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)), ('document', '11b3704ea42cfd537953387a7e58dae8')) From cfc83c14452a4776c2d8c4fe5c04c6e1ef05e945 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 11 Mar 2019 06:49:30 +0000 Subject: [PATCH 043/230] refine jitcodekey and enhance unit tests test=develop --- paddle/fluid/operators/jit/gen/act.cc | 1 + paddle/fluid/operators/jit/gen/blas.cc | 1 + paddle/fluid/operators/jit/gen/embseqpool.cc | 1 + paddle/fluid/operators/jit/gen/gru.cc | 1 + paddle/fluid/operators/jit/gen/hopv.cc | 1 + paddle/fluid/operators/jit/gen/lstm.cc | 1 + paddle/fluid/operators/jit/gen/matmul.cc | 2 +- paddle/fluid/operators/jit/gen/seqpool.cc | 1 + paddle/fluid/operators/jit/gen/sgd.cc | 1 + paddle/fluid/operators/jit/helper.h | 2 +- paddle/fluid/operators/jit/kernel_key.cc | 61 ++--- paddle/fluid/operators/jit/kernel_key.h | 2 +- paddle/fluid/operators/jit/kernel_pool.h | 6 +- paddle/fluid/operators/jit/registry.h | 1 + paddle/fluid/operators/jit/test.cc | 220 +++++++++++++++---- 15 files changed, 211 insertions(+), 91 deletions(-) diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc index 5cac219f95..ad68e792c7 100644 --- a/paddle/fluid/operators/jit/gen/act.cc +++ b/paddle/fluid/operators/jit/gen/act.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/gen/act.h" +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/gen/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc index e764a7983d..c126b9077a 100644 --- a/paddle/fluid/operators/jit/gen/blas.cc +++ b/paddle/fluid/operators/jit/gen/blas.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/gen/blas.h" +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/gen/embseqpool.cc b/paddle/fluid/operators/jit/gen/embseqpool.cc index 6e8ecc07e7..331a4b0d07 100644 --- a/paddle/fluid/operators/jit/gen/embseqpool.cc +++ b/paddle/fluid/operators/jit/gen/embseqpool.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/jit/gen/embseqpool.h" #include // offsetof +#include #include #include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones #include "paddle/fluid/operators/jit/registry.h" diff --git a/paddle/fluid/operators/jit/gen/gru.cc b/paddle/fluid/operators/jit/gen/gru.cc index 4bc9247f6f..b5b0cffa80 100644 --- a/paddle/fluid/operators/jit/gen/gru.cc +++ b/paddle/fluid/operators/jit/gen/gru.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/jit/gen/gru.h" #include // offsetof +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/gen/hopv.cc b/paddle/fluid/operators/jit/gen/hopv.cc index 3383f17df8..462ac68a93 100644 --- a/paddle/fluid/operators/jit/gen/hopv.cc +++ b/paddle/fluid/operators/jit/gen/hopv.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/gen/hopv.h" +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/gen/lstm.cc b/paddle/fluid/operators/jit/gen/lstm.cc index 5e7789aede..2c3bc985e9 100644 --- a/paddle/fluid/operators/jit/gen/lstm.cc +++ b/paddle/fluid/operators/jit/gen/lstm.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/jit/gen/lstm.h" #include // offsetof +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc index ca50f26ce5..d9955c8cc6 100644 --- a/paddle/fluid/operators/jit/gen/matmul.cc +++ b/paddle/fluid/operators/jit/gen/matmul.cc @@ -14,8 +14,8 @@ #include "paddle/fluid/operators/jit/gen/matmul.h" #include // offsetof +#include #include - #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc index ceca104cc9..d9e5904add 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.cc +++ b/paddle/fluid/operators/jit/gen/seqpool.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/gen/seqpool.h" +#include #include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/gen/sgd.cc b/paddle/fluid/operators/jit/gen/sgd.cc index a40da9b993..e65d3500b4 100644 --- a/paddle/fluid/operators/jit/gen/sgd.cc +++ b/paddle/fluid/operators/jit/gen/sgd.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/jit/gen/sgd.h" #include // offsetof +#include #include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index d98eada81c..1ac5318d46 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -36,7 +36,7 @@ inline typename std::enable_if< const Kernel*>::type GetJitCode(const typename KernelTuple::attr_type& attr) { using Attr = typename KernelTuple::attr_type; - size_t key = JitCodeKey(attr); + int64_t key = JitCodeKey(attr); auto& codes = JitCodePool::Instance(); if (codes.Has(key)) { return codes.AllKernels().at(key).get(); diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc index 6987c893de..1ad220b397 100644 --- a/paddle/fluid/operators/jit/kernel_key.cc +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -13,7 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/kernel_key.h" -#include +#include // XXH64: 13.8 GB/s #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -21,73 +21,46 @@ namespace operators { namespace jit { template <> -size_t JitCodeKey(const int& d) { +int64_t JitCodeKey(const int& d) { return d; } template <> -size_t JitCodeKey(const int64_t& d) { +int64_t JitCodeKey(const int64_t& d) { return d; } -// TODO(TJ): refine and benchmark JitCodeKey generatation -constexpr int act_type_shift = 3; // suppot 2^3 act types -static inline int act_type_convert(KernelType type) { - if (type == kVIdentity) { - return 0; - } else if (type == kVExp) { - return 1; - } else if (type == kVRelu) { - return 2; - } else if (type == kVSigmoid) { - return 3; - } else if (type == kVTanh) { - return 4; - } - PADDLE_THROW("Unsupported act type %d", type); - return 0; -} - template <> -size_t JitCodeKey(const lstm_attr_t& attr) { - // XXH64: 13.8 GB/s - - size_t key = attr.d; - int gate_key = act_type_convert(attr.act_gate) << 1; - int cand_key = act_type_convert(attr.act_cand) << (1 + act_type_shift); - int cell_key = act_type_convert(attr.act_cell) << (1 + act_type_shift * 2); - return (key << (1 + act_type_shift * 3)) + gate_key + cand_key + cell_key + - attr.use_peephole; +int64_t JitCodeKey(const gru_attr_t& attr) { + return XXH64(&attr, sizeof(gru_attr_t), 0); } template <> -size_t JitCodeKey(const gru_attr_t& attr) { - size_t key = attr.d; - return (key << (act_type_shift * 2)) + act_type_convert(attr.act_gate) + - (act_type_convert(attr.act_cand) << act_type_shift); +int64_t JitCodeKey(const lstm_attr_t& attr) { + int keys[5] = { + attr.d, static_cast(attr.act_gate), static_cast(attr.act_cand), + static_cast(attr.act_cell), static_cast(attr.use_peephole)}; + return XXH64(keys, sizeof(int) * 5, 0); } template <> -size_t JitCodeKey(const seq_pool_attr_t& attr) { - size_t key = attr.w; - constexpr int pool_type_shift = 3; - return (key << pool_type_shift) + static_cast(attr.type); +int64_t JitCodeKey(const seq_pool_attr_t& attr) { + int keys[2] = {attr.w, static_cast(attr.type)}; + return XXH64(keys, sizeof(int) * 2, 0); } template <> -size_t JitCodeKey(const matmul_attr_t& attr) { - size_t key = attr.m; - constexpr int shift = 21; - return (key << shift * 2) + ((static_cast(attr.n)) << shift) + attr.k; +int64_t JitCodeKey(const matmul_attr_t& attr) { + return XXH64(&attr, sizeof(int) * 3, 0); // m, n, k } template <> -size_t JitCodeKey(const emb_seq_pool_attr_t& attr) { +int64_t JitCodeKey(const emb_seq_pool_attr_t& attr) { return attr.table_width; } template <> -size_t JitCodeKey(const sgd_attr_t& attr) { +int64_t JitCodeKey(const sgd_attr_t& attr) { return attr.grad_width; } diff --git a/paddle/fluid/operators/jit/kernel_key.h b/paddle/fluid/operators/jit/kernel_key.h index 611a0210d6..b2cf92f23e 100644 --- a/paddle/fluid/operators/jit/kernel_key.h +++ b/paddle/fluid/operators/jit/kernel_key.h @@ -46,7 +46,7 @@ struct KernelKey { // Every JitCode should have a method to get the key from attribution template -size_t JitCodeKey(const Attr& attr); +int64_t JitCodeKey(const Attr& attr); } // namespace jit } // namespace operators diff --git a/paddle/fluid/operators/jit/kernel_pool.h b/paddle/fluid/operators/jit/kernel_pool.h index 3e15242af2..ec5c2be55b 100644 --- a/paddle/fluid/operators/jit/kernel_pool.h +++ b/paddle/fluid/operators/jit/kernel_pool.h @@ -30,7 +30,7 @@ namespace jit { template class JitCodePool { typedef std::unique_ptr GenBasePtr; - typedef std::unordered_map JitCodeMap; + typedef std::unordered_map JitCodeMap; public: JitCodePool() = default; @@ -41,9 +41,9 @@ class JitCodePool { const JitCodeMap& AllKernels() { return codes_; } - bool Has(size_t key) const { return codes_.find(key) != codes_.end(); } + bool Has(int64_t key) const { return codes_.find(key) != codes_.end(); } - void Insert(size_t key, GenBasePtr value) { + void Insert(int64_t key, GenBasePtr value) { codes_.emplace(key, std::move(value)); } diff --git a/paddle/fluid/operators/jit/registry.h b/paddle/fluid/operators/jit/registry.h index c8da92c0c5..567a903236 100644 --- a/paddle/fluid/operators/jit/registry.h +++ b/paddle/fluid/operators/jit/registry.h @@ -17,6 +17,7 @@ #include #include #include +#include // for std::move #include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/operators/jit/kernel_pool.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 898133a03b..068b0ba7ae 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -886,7 +886,11 @@ void TestKernelVBroadcast() { // test pool TEST(JITKernel_pool, jitcreator) { const auto& jitcreators = jit::JitCodeCreatorPool::Instance().AllCreators(); +#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) + EXPECT_EQ(jitcreators.size(), 0UL); +#else EXPECT_EQ(jitcreators.size(), 25UL); +#endif } TEST(JITKernel_pool, jitpool) { @@ -894,13 +898,25 @@ TEST(JITKernel_pool, jitpool) { const auto& kers = jit::JitCodePool().Instance().AllKernels(); EXPECT_EQ(kers.size(), 0UL); jit::GetAllCandidateKernels, CPUPlace>(3); - // after call GetAllCandidateKernels, it will create jitcode Automatically +// after call GetAllCandidateKernels, it will create jitcode Automatically +#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) + EXPECT_EQ(kers.size(), 0UL); +#else EXPECT_EQ(kers.size(), 1UL); +#endif } TEST(JITKernel_pool, more) { const auto& kers = jit::KernelPool::Instance().AllKernels(); +#if defined(__APPLE__) || defined(__OSX__) + EXPECT_EQ(kers.size(), 10UL); +#else +#ifdef PADDLE_WITH_MKLML EXPECT_EQ(kers.size(), 21UL); +#else + EXPECT_EQ(kers.size(), 8UL); +#endif +#endif } TEST(JITKernel_pool, refer) { @@ -915,7 +931,11 @@ TEST(JITKernel_helper, GetAllCandidateKernels) { #if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) EXPECT_GE(fp_kers.size(), 1UL); // refer #else +#ifdef PADDLE_WITH_MKLML EXPECT_GE(fp_kers.size(), 3UL); // jitcode, mkl, refer +#else + EXPECT_GE(fp_kers.size(), 2UL); // jitcode, refer +#endif #endif auto db_kers = @@ -923,18 +943,48 @@ TEST(JITKernel_helper, GetAllCandidateKernels) { #if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) EXPECT_GE(db_kers.size(), 1UL); // refer #else +#ifdef PADDLE_WITH_MKLML EXPECT_GE(db_kers.size(), 2UL); // mkl, refer +#else + EXPECT_GE(db_kers.size(), 1UL); // refer +#endif #endif } TEST(JITKernel_helper, GetAllCandidateFuncsWithTypes) { auto fp_kers = jit::GetAllCandidateFuncsWithTypes, CPUPlace>(10); +#if defined(__APPLE__) || defined(__OSX__) + EXPECT_GE(fp_kers.size(), 1UL); // refer +#else +#if !defined(PADDLE_WITH_MKLML) || defined(_WIN32) + EXPECT_GE(fp_kers.size(), 2UL); // jitcode/mkl, refer +#else EXPECT_GE(fp_kers.size(), 3UL); // jitcode, mkl, refer +#endif +#endif auto db_kers = jit::GetAllCandidateFuncsWithTypes, CPUPlace>(10); +#if defined(__APPLE__) || defined(__OSX__) || !defined(PADDLE_WITH_MKLML) + EXPECT_GE(db_kers.size(), 1UL); // refer +#else EXPECT_GE(db_kers.size(), 2UL); // mkl, refer +#endif +} + +TEST(JITKernel_helper, KernelFuncs) { + auto f1 = jit::KernelFuncs, CPUPlace>::Cache().At(3); + auto f2 = jit::KernelFuncs, CPUPlace>::Cache()[3]; + EXPECT_TRUE(f1 != nullptr); + EXPECT_TRUE(f1 == f2); + + auto f3 = jit::KernelFuncs, CPUPlace>::Cache()[5]; +#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) + EXPECT_TRUE(f2 == f3); +#else + EXPECT_TRUE(f2 != f3); +#endif } TEST(JITKernel_helper, GetAllCandidateFuncs) { @@ -1011,6 +1061,134 @@ TEST(JITKernel_helper, attr) { EXPECT_EQ(out.str().size(), 14); } +// test keys +TEST(JITKernel_key, int) { + EXPECT_TRUE(jit::JitCodeKey(2) == jit::JitCodeKey(2)); + EXPECT_TRUE(jit::JitCodeKey(2) == jit::JitCodeKey(2)); + EXPECT_TRUE(jit::JitCodeKey(2) != jit::JitCodeKey(3)); +} + +TEST(JITKernel_key, gru) { + jit::gru_attr_t attr1(8, jit::kVSigmoid, jit::kVTanh); + jit::gru_attr_t attr2(8, jit::kVSigmoid, jit::kVTanh); + jit::gru_attr_t attr3(9, jit::kVSigmoid, jit::kVTanh); + jit::gru_attr_t attr4(9, jit::kVSigmoid, jit::kVIdentity); + jit::gru_attr_t attr5(9, jit::kVTanh, jit::kVIdentity); + + auto key1 = jit::JitCodeKey(attr1); + auto key2 = jit::JitCodeKey(attr2); + auto key3 = jit::JitCodeKey(attr3); + auto key4 = jit::JitCodeKey(attr4); + auto key5 = jit::JitCodeKey(attr5); + + EXPECT_TRUE(key1 == key2); + EXPECT_TRUE(key2 != key3); + EXPECT_TRUE(key2 != key4); + EXPECT_TRUE(key2 != key5); + EXPECT_TRUE(key3 != key4); + EXPECT_TRUE(key3 != key5); + EXPECT_TRUE(key4 != key5); +} + +TEST(JITKernel_key, lstm) { + jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); + jit::lstm_attr_t attr2(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); + jit::lstm_attr_t attr3(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); + jit::lstm_attr_t attr4(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh); + jit::lstm_attr_t attr5(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true); + jit::lstm_attr_t attr6(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true); + + auto key1 = jit::JitCodeKey(attr1); + auto key2 = jit::JitCodeKey(attr2); + auto key3 = jit::JitCodeKey(attr3); + auto key4 = jit::JitCodeKey(attr4); + auto key5 = jit::JitCodeKey(attr5); + auto key6 = jit::JitCodeKey(attr6); + + EXPECT_TRUE(key1 == key2); + EXPECT_TRUE(key2 != key3); + EXPECT_TRUE(key2 != key4); + EXPECT_TRUE(key2 != key5); + EXPECT_TRUE(key3 != key4); + EXPECT_TRUE(key3 != key5); + EXPECT_TRUE(key4 != key5); + EXPECT_TRUE(key5 == key6); +} + +TEST(JITKernel_key, seq_pool) { + jit::seq_pool_attr_t attr1(2, jit::SeqPoolType::kSum, 1); + jit::seq_pool_attr_t attr2(2, jit::SeqPoolType::kSum, 3); + jit::seq_pool_attr_t attr3(3, jit::SeqPoolType::kSum, 3); + jit::seq_pool_attr_t attr4(3, jit::SeqPoolType::kAvg, 3); + + auto key1 = jit::JitCodeKey(attr1); + auto key2 = jit::JitCodeKey(attr2); + auto key3 = jit::JitCodeKey(attr3); + auto key4 = jit::JitCodeKey(attr4); + + EXPECT_TRUE(key1 == key2); + EXPECT_TRUE(key2 != key3); + EXPECT_TRUE(key2 != key4); + EXPECT_TRUE(key3 != key4); +} + +TEST(JITKernel_key, matmul) { + jit::matmul_attr_t attr1(1, 2, 3); + jit::matmul_attr_t attr2(1, 2, 3); + jit::matmul_attr_t attr3(1, 3, 3); + jit::matmul_attr_t attr4(2, 3, 4); + + auto key1 = jit::JitCodeKey(attr1); + auto key2 = jit::JitCodeKey(attr2); + auto key3 = jit::JitCodeKey(attr3); + auto key4 = jit::JitCodeKey(attr4); + + EXPECT_TRUE(key1 == key2); + EXPECT_TRUE(key2 != key3); + EXPECT_TRUE(key2 != key4); + EXPECT_TRUE(key3 != key4); +} + +TEST(JITKernel_key, emb_seq_pool) { + jit::emb_seq_pool_attr_t attr1(1, 2, 3, 4, 5, jit::SeqPoolType::kSum); + jit::emb_seq_pool_attr_t attr2(1, 2, 3, 4, 5, jit::SeqPoolType::kSum); + jit::emb_seq_pool_attr_t attr3(10, 2, 9, 8, 7, jit::SeqPoolType::kAvg); + jit::emb_seq_pool_attr_t attr4(10, 3, 9, 8, 7, jit::SeqPoolType::kSum); + jit::emb_seq_pool_attr_t attr5(1, 6, 3, 4, 5, jit::SeqPoolType::kSum); + + auto key1 = jit::JitCodeKey(attr1); + auto key2 = jit::JitCodeKey(attr2); + auto key3 = jit::JitCodeKey(attr3); + auto key4 = jit::JitCodeKey(attr4); + auto key5 = jit::JitCodeKey(attr5); + + EXPECT_TRUE(key1 == key2); + EXPECT_TRUE(key2 == key3); + EXPECT_TRUE(key2 != key4); + EXPECT_TRUE(key2 != key5); + EXPECT_TRUE(key4 != key5); +} + +TEST(JITKernel_key, sgd) { + jit::sgd_attr_t attr1(1, 2, 3, 4, 5); + jit::sgd_attr_t attr2(1, 2, 3, 4, 5); + jit::sgd_attr_t attr3(9, 8, 7, 4, 6); + jit::sgd_attr_t attr4(1, 2, 3, 6, 5); + jit::sgd_attr_t attr5(10, 9, 8, 7, 6); + + auto key1 = jit::JitCodeKey(attr1); + auto key2 = jit::JitCodeKey(attr2); + auto key3 = jit::JitCodeKey(attr3); + auto key4 = jit::JitCodeKey(attr4); + auto key5 = jit::JitCodeKey(attr5); + + EXPECT_TRUE(key1 == key2); + EXPECT_TRUE(key2 == key3); + EXPECT_TRUE(key3 != key4); + EXPECT_TRUE(key3 != key5); + EXPECT_TRUE(key4 != key5); +} + // test kernerls #define TestKernelVMul TestKernelXYZN #define TestKernelVAdd TestKernelXYZN @@ -1080,43 +1258,3 @@ TEST_CPU_KERNEL(MatMul); TEST_CPU_KERNEL(Softmax); TEST_CPU_KERNEL(Sgd); TEST_CPU_KERNEL(VBroadcast); - -TEST(JITKernel, kernel_func) { - auto f1 = jit::KernelFuncs, CPUPlace>::Cache().At(3); - auto f2 = jit::KernelFuncs, CPUPlace>::Cache()[3]; - EXPECT_TRUE(f1 != nullptr); - EXPECT_TRUE(f1 == f2); - // TODO(TJ): check not equal -} - -TEST(JITKernel_key, lstm) { - jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); - jit::lstm_attr_t attr2(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); - jit::lstm_attr_t attr3(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); - jit::lstm_attr_t attr4(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh); - - auto key1 = jit::JitCodeKey(attr1); - auto key2 = jit::JitCodeKey(attr2); - auto key3 = jit::JitCodeKey(attr3); - auto key4 = jit::JitCodeKey(attr4); - - EXPECT_TRUE(key1 != key2); - EXPECT_TRUE(key2 == key3); - EXPECT_TRUE(key3 != key4); -} - -TEST(JITKernel_key, gru) { - jit::gru_attr_t attr1(8, jit::kVSigmoid, jit::kVTanh); - jit::gru_attr_t attr2(9, jit::kVSigmoid, jit::kVTanh); - jit::gru_attr_t attr3(9, jit::kVSigmoid, jit::kVTanh); - jit::gru_attr_t attr4(9, jit::kVSigmoid, jit::kVIdentity); - - auto key1 = jit::JitCodeKey(attr1); - auto key2 = jit::JitCodeKey(attr2); - auto key3 = jit::JitCodeKey(attr3); - auto key4 = jit::JitCodeKey(attr4); - - EXPECT_TRUE(key1 != key2); - EXPECT_TRUE(key2 == key3); - EXPECT_TRUE(key3 != key4); -} From 1283833395645c8d52d7b603c2e8bc3092d4ef12 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Mon, 11 Mar 2019 15:18:32 +0800 Subject: [PATCH 044/230] zero_copy tensor support INT32 test=develop --- .../fluid/inference/api/details/zero_copy_tensor.cc | 5 +++++ paddle/fluid/inference/tests/api/tester_helper.h | 11 +++++++++++ 2 files changed, 16 insertions(+) diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index cf02901d96..9a40cf4b60 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -126,15 +126,20 @@ void ZeroCopyTensor::copy_to_cpu(T *data) { } template void ZeroCopyTensor::copy_from_cpu(const float *data); template void ZeroCopyTensor::copy_from_cpu(const int64_t *data); +template void ZeroCopyTensor::copy_from_cpu(const int32_t *data); template void ZeroCopyTensor::copy_to_cpu(float *data); template void ZeroCopyTensor::copy_to_cpu(int64_t *data); +template void ZeroCopyTensor::copy_to_cpu(int32_t *data); template float *ZeroCopyTensor::data(PaddlePlace *place, int *size) const; template int64_t *ZeroCopyTensor::data(PaddlePlace *place, int *size) const; +template int32_t *ZeroCopyTensor::data(PaddlePlace *place, + int *size) const; template float *ZeroCopyTensor::mutable_data(PaddlePlace place); template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); +template int32_t *ZeroCopyTensor::mutable_data(PaddlePlace place); void *ZeroCopyTensor::FindTensor() const { PADDLE_ENFORCE(!name_.empty(), diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 915ea772ed..a4881afe58 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -141,6 +141,15 @@ void CompareResult(const std::vector &outputs, } break; } + case PaddleDType::INT32: { + int32_t *pdata = static_cast(out.data.data()); + int32_t *pdata_ref = ref_out.data(&place, &ref_size); + EXPECT_EQ(size, ref_size); + for (size_t j = 0; j < size; ++j) { + EXPECT_EQ(pdata_ref[j], pdata[j]); + } + break; + } } } } @@ -253,6 +262,8 @@ void ConvertPaddleTensorToZeroCopyTensor( ZeroCopyTensorAssignData(tensor.get(), input.data); } else if (input.dtype == PaddleDType::FLOAT32) { ZeroCopyTensorAssignData(tensor.get(), input.data); + } else if (input.dtype == PaddleDType::INT32) { + ZeroCopyTensorAssignData(tensor.get(), input.data); } else { LOG(ERROR) << "unsupported feed type " << input.dtype; } From 4e052e0ac9af166869ad38b7649bdd0696cdf72a Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Mon, 11 Mar 2019 17:25:12 +0800 Subject: [PATCH 045/230] Disable inference download for WIN32 temporary. test=develop --- paddle/fluid/inference/tests/test.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index 5ceb630976..f551b322fe 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -42,8 +42,8 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME) endfunction() set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") -if (NOT EXISTS ${WORD2VEC_INSTALL_DIR}) - inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") +if(NOT EXISTS ${WORD2VEC_INSTALL_DIR} AND NOT WIN32) + inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") endif() set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") From d17bb4e6006880ed6331fed7e33f2e5d7624101e Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 11 Mar 2019 18:05:15 +0800 Subject: [PATCH 046/230] Add unit test for gru unit test=develop --- python/paddle/fluid/imperative/nn.py | 39 ++++--- .../paddle/fluid/tests/unittests/op_test.py | 107 +++++++++++++++++- .../fluid/tests/unittests/test_gru_op.py | 2 +- .../fluid/tests/unittests/test_layers.py | 41 +++++++ 4 files changed, 164 insertions(+), 25 deletions(-) diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py index 2d2b70e3f7..6681b42341 100644 --- a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/imperative/nn.py @@ -22,6 +22,7 @@ from . import layers from ..framework import Variable, OpProtoHolder from ..param_attr import ParamAttr from ..initializer import Normal, Constant + __all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit'] @@ -548,7 +549,7 @@ class GRUUnit(layers.Layer): """ def __init__(self, - hidden, + name_scope, size, param_attr=None, bias_attr=None, @@ -556,8 +557,8 @@ class GRUUnit(layers.Layer): gate_activation='sigmoid', origin_mode=False, dtype='float32'): + super(GRUUnit, self).__init__(name_scope) - super(GRUUnit, self).__init__() activation_dict = dict( identity=0, sigmoid=1, @@ -566,29 +567,27 @@ class GRUUnit(layers.Layer): activation = activation_dict[activation] gate_activation = activation_dict[gate_activation] - helper = LayerHelper('gru_unit', **locals()) - dtype = helper.input_dtype() + self._dtype = dtype size = size // 3 - # create weight - weight = helper.create_parameter( - attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype) + self._weight = self.create_parameter( + attr=param_attr, shape=[size, 3 * size], dtype=dtype) - gate = helper.create_variable_for_type_inference(dtype) - reset_hidden_pre = helper.create_variable_for_type_inference(dtype) - updated_hidden = helper.create_variable_for_type_inference(dtype) - inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': weight} # create bias - if helper.bias_attr: - bias_size = [1, 3 * size] - bias = helper.create_parameter( - attr=helper.bias_attr, - shape=bias_size, - dtype=dtype, - is_bias=True) - inputs['Bias'] = bias + bias_size = [1, 3 * size] + self._bias = self.create_parameter( + attr=bias_attr, shape=bias_size, dtype=dtype, is_bias=True) - def forward(self, input): + def forward(self, input, hidden): + inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': self._weight} + if self._bias: + inputs['Bias'] = self._bias + + gate = self._helper.create_variable_for_type_inference(self._dtype) + reset_hidden_pre = self._helper.create_variable_for_type_inference( + self._dtype) + updated_hidden = self._helper.create_variable_for_type_inference( + self._dtype) self._helper.append_op( type='gru_unit', inputs=inputs, diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 8234457243..9fa62a692e 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -22,6 +22,7 @@ import six import time import itertools import collections +from collections import defaultdict import paddle.fluid as fluid import paddle.fluid.core as core @@ -257,8 +258,65 @@ class OpTest(unittest.TestCase): outs, _ = self._calc_output(place) return outs - def _calc_output(self, place, parallel=False, no_check_set=None): + def _create_var_from_numpy(self, value): + if isinstance(value, tuple): + data = value[0] + lod = value[1] + v = fluid.imperative.base.to_variable(value=data) + v._ivar.value().get_tensor().set_recursive_sequence_lengths(lod) + return v + else: + return fluid.imperative.base.to_variable(value) + + def _calc_imperative_output(self, place, parallel=False, no_check_set=None): + with fluid.imperative.base.guard(place=place): + block = fluid.default_main_program().global_block() + + # prepare input variable + inputs = defaultdict(list) + for name, np_value in six.iteritems(self.inputs): + if not isinstance(np_value, list): + np_value = [np_value] + + for i in range(len(np_value)): + inputs[name].append( + self._create_var_from_numpy(np_value[i])) + + # prepare output variable + outputs = defaultdict(list) + for name, np_value in six.iteritems(self.outputs): + if not isinstance(np_value, list): + np_value = [np_value] + + for i in range(len(np_value)): + value = np_value[i] + if isinstance(value, tuple): + v = block.create_var( + name="%s_out%d" % (name, i), + dtype=value[0].dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=False) + v._ivar.value().get_tensor( + ).set_recursive_sequence_lengths(value[1]) + else: + v = block.create_var( + name="%s_out%d" % (name, i), + dtype=value.dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=False) + outputs[name].append(v) + + block.append_op( + type=self.op_type, + inputs=inputs, + outputs=outputs, + attrs=self.attrs) + + return outputs + def _calc_output(self, place, parallel=False, no_check_set=None): program = Program() block = program.global_block() self._append_ops(block) @@ -305,8 +363,13 @@ class OpTest(unittest.TestCase): place, atol, no_check_set=None, - equal_nan=False): + equal_nan=False, + check_imperative=False): + if check_imperative: + imperative_outs = self._calc_imperative_output( + place, no_check_set=no_check_set) outs, fetch_list = self._calc_output(place, no_check_set=no_check_set) + for out_name, out_dup in Operator.get_op_outputs(self.op_type): if out_name not in self.outputs: continue @@ -330,6 +393,10 @@ class OpTest(unittest.TestCase): type(sub_out)) for item in sub_out: sub_out_name, expect = item[0], item[1] + if check_imperative: + imperative_actual = imperative_outs[sub_out_name][0] + imperative_actual_t = np.array( + imperative_actual._ivar.value().get_tensor()) idx = find_actual(sub_out_name, fetch_list) actual = outs[idx] actual_t = np.array(actual) @@ -340,12 +407,24 @@ class OpTest(unittest.TestCase): actual_t, expect_t, atol=atol, equal_nan=equal_nan), "Output (" + sub_out_name + ") has diff at " + str(place)) + self.assertTrue( + np.allclose( + imperative_actual_t, + expect_t, + atol=atol, + equal_nan=equal_nan), + "Output (" + sub_out_name + ") has diff at " + + str(place) + " in imperative mode") if isinstance(expect, tuple): self.assertListEqual( actual.recursive_sequence_lengths(), expect[1], "Output (" + sub_out_name + ") has different lod at " + str(place)) else: + if check_imperative: + imperative_actual = imperative_outs[out_name][0] + imperative_actual_t = np.array( + imperative_actual._ivar.value().get_tensor()) idx = find_actual(out_name, fetch_list) actual = outs[idx] actual_t = np.array(actual) @@ -357,10 +436,25 @@ class OpTest(unittest.TestCase): "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + str(expect_t) + "\n" + "But Got" + str(actual_t) + " in class " + self.__class__.__name__) + self.assertTrue( + np.allclose( + imperative_actual_t, + expect_t, + atol=atol, + equal_nan=equal_nan), + "Output (" + out_name + ") has diff at " + str(place) + + "\nExpect " + str(expect_t) + "\n" + "But Got" + + str(imperative_actual_t) + " in class " + + self.__class__.__name__) if isinstance(expect, tuple): self.assertListEqual(actual.recursive_sequence_lengths(), expect[1], "Output (" + out_name + ") has different lod at " + str(place)) + if check_imperative: + self.assertListEqual( + imperative_actual._ivar.value().get_tensor() + .recursive_sequence_lengths(), expect[1], "Output (" + + out_name + ") has different lod at " + str(place)) def _get_places(self): if self.dtype == np.float16: @@ -383,10 +477,15 @@ class OpTest(unittest.TestCase): places.append(core.CUDAPlace(0)) return places - def check_output(self, atol=1e-5, no_check_set=None, equal_nan=False): + def check_output(self, + atol=1e-5, + no_check_set=None, + equal_nan=False, + check_imperative=False): places = self._get_places() for place in places: - self.check_output_with_place(place, atol, no_check_set, equal_nan) + self.check_output_with_place(place, atol, no_check_set, equal_nan, + check_imperative) def check_output_customized(self, checker): places = self._get_places() diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py index 6606162733..848c9a4952 100644 --- a/python/paddle/fluid/tests/unittests/test_gru_op.py +++ b/python/paddle/fluid/tests/unittests/test_gru_op.py @@ -156,7 +156,7 @@ class TestGRUOp(OpTest): } def test_check_output(self): - self.check_output(atol=1e-8) + self.check_output(atol=1e-8, check_imperative=True) def test_check_grad(self): self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden']) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index b29ad25870..5b186ae038 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -112,6 +112,47 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) self.assertTrue(np.allclose(static_ret, static_ret2)) + def test_gru_unit(self): + lod = [[2, 4, 3]] + D = 5 + T = sum(lod[0]) + N = len(lod[0]) + + input = np.random.rand(T, 3 * D).astype('float32') + hidden_input = np.random.rand(T, D).astype('float32') + + with self.static_graph(): + x = layers.data(name='x', shape=[-1, D * 3], dtype='float32') + hidden = layers.data(name='hidden', shape=[-1, D], dtype='float32') + updated_hidden, reset_hidden_pre, gate = layers.gru_unit( + input=x, hidden=hidden, size=D * 3) + static_ret = self.get_static_graph_result( + feed={'x': input, + 'hidden': hidden_input}, + fetch_list=[updated_hidden, reset_hidden_pre, gate]) + + with self.static_graph(): + x = layers.data(name='x', shape=[-1, D * 3], dtype='float32') + hidden = layers.data(name='hidden', shape=[-1, D], dtype='float32') + updated_hidden, reset_hidden_pre, gate = layers.gru_unit( + input=x, hidden=hidden, size=D * 3) + gru = nn.GRUUnit('gru', size=D * 3) + updated_hidden, reset_hidden_pre, gate = gru(x, hidden) + + static_ret2 = self.get_static_graph_result( + feed={'x': input, + 'hidden': hidden_input}, + fetch_list=[updated_hidden, reset_hidden_pre, gate]) + + with self.dynamic_graph(): + gru = nn.GRUUnit('gru', size=D * 3) + dy_ret = gru( + base.to_variable(input), base.to_variable(hidden_input)) + + for i in range(len(static_ret)): + self.assertTrue(np.allclose(static_ret[i], static_ret2[i])) + self.assertTrue(np.allclose(static_ret[i], dy_ret[i]._numpy())) + class TestBook(unittest.TestCase): def test_fit_a_line(self): From 14d871121b0d1f9ba17c219ecfb95a49836bbe73 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 11 Mar 2019 10:48:57 +0000 Subject: [PATCH 047/230] enhance jitkernel unit test test=develop --- paddle/fluid/operators/jit/kernel_pool.h | 1 + paddle/fluid/operators/jit/test.cc | 36 +++++++++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/jit/kernel_pool.h b/paddle/fluid/operators/jit/kernel_pool.h index ec5c2be55b..04710a54ac 100644 --- a/paddle/fluid/operators/jit/kernel_pool.h +++ b/paddle/fluid/operators/jit/kernel_pool.h @@ -17,6 +17,7 @@ #include // for unique_ptr #include #include +#include // for move #include #include "paddle/fluid/operators/jit/gen_base.h" #include "paddle/fluid/operators/jit/kernel_base.h" diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 068b0ba7ae..6c099a7a06 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -1003,9 +1003,43 @@ TEST(JITKernel_helper, GetAllCandidateFuncs) { } } +TEST(JITKernel_helper, pack_weights) { + const int N = 8 * 60, K = 2; + float src[K][N], yref[K][N], y[K * N]; + float* x = &(src[0][0]); + float* ref = &(yref[0][0]); + for (int i = 0; i < N * K; ++i) { + *(x + i) = static_cast(i); + } + int block = 0; + std::vector groups; + if (paddle::platform::MayIUse(paddle::platform::avx512f)) { + block = ZMM_FLOAT_BLOCK; + groups.push_back(30); + } else { + block = YMM_FLOAT_BLOCK; + groups.insert(groups.end(), {14, 14, 14, 14, 4}); + } + + int offset = 0; + int acc = 0; + for (int g : groups) { + g = g * block; + for (int k = 0; k < K; ++k) { + for (int i = 0; i < g; ++i) { + *(ref + offset) = src[k][i + acc]; + offset++; + } + } + acc += g; + } + + jit::pack_weights(x, y, N, K); + ExpectEQ(y, ref, N * K); +} + TEST(JITKernel_helper, attr) { std::ostringstream out; - // KernelTypes out << jit::to_string(jit::kNone) << jit::to_string(jit::kCRFDecoding) << jit::to_string(jit::kEmbSeqPool) << jit::to_string(jit::kGRUH1) From 31ccaf091641b991af885427eb3071a276ccc70e Mon Sep 17 00:00:00 2001 From: luotao1 Date: Mon, 11 Mar 2019 19:58:41 +0800 Subject: [PATCH 048/230] add all_kernels_must_compute_runtime_shape example for speedup infershape test=develop --- paddle/fluid/framework/operator.cc | 11 +++++++++-- .../operators/fused/fused_embedding_seq_pool_op.cc | 11 ++++++++--- paddle/fluid/operators/hash_op.cc | 11 ++++++++--- .../operators/sequence_ops/sequence_enumerate_op.cc | 11 ++++++++--- 4 files changed, 33 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index df1689764d..9f48b8cb9e 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -926,8 +926,15 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(expected_kernel_key.place_); } - RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx); - this->InferShape(&infer_shape_ctx); + // If Op has attribute all_kernels_must_compute_runtime_shape, + // all the kernels of this Op would compute runtime shape, + // and skip infershape in runtime for speedup. + // TODO(luotao): Note that it is a temporal attribute, after all ops + // implement computing runtime shape, this attribute would be deleted. + if (!HasAttr("all_kernels_must_compute_runtime_shape")) { + RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx); + this->InferShape(&infer_shape_ctx); + } // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext // not Scope. Imperative mode only pass inputs and get outputs. kernel_iter->second( diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc index 80caf70b08..17a81d3e88 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc @@ -23,9 +23,6 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (ctx->IsRuntime()) { - return; - } PADDLE_ENFORCE(ctx->HasInput("W"), "Input W of FusedEmbeddingSeqPoolOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Ids"), @@ -91,6 +88,14 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker { "(boolean, default false) " "Sparse update.") .SetDefault(false); + AddAttr( + "all_kernels_must_compute_runtime_shape", + "(boolean, default true) " + "An attribute to speed up OperatorWithKernel::RunImpl." + "If true, all the kernels of this Op would compute runtime " + "shape, but skip infershape in runtime. Note that it is a temporal " + "attribute, please do DOT set it in python layer.") + .SetDefault(true); AddComment(R"DOC( FusedEmbeddingSeqPool Operator. diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc index 7a29f80ff1..b39eba081e 100644 --- a/paddle/fluid/operators/hash_op.cc +++ b/paddle/fluid/operators/hash_op.cc @@ -26,9 +26,6 @@ class HashOp : public framework::OperatorWithKernel { : OperatorWithKernel(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { - if (ctx->IsRuntime()) { - return; - } PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of HashOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), @@ -57,6 +54,14 @@ $$Out = scale * X$$ )DOC"); AddAttr("num_hash", "").SetDefault(1); AddAttr("mod_by", "").SetDefault(100000); + AddAttr( + "all_kernels_must_compute_runtime_shape", + "(boolean, default true) " + "An attribute to speed up OperatorWithKernel::RunImpl." + "If true, all the kernels of this Op would compute runtime " + "shape, but skip infershape in runtime. Note that it is a temporal " + "attribute, please do DOT set it in python layer.") + .SetDefault(true); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc index d3dcd1f96a..63e95e8654 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc @@ -22,9 +22,6 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (ctx->IsRuntime()) { - return; - } PADDLE_ENFORCE( ctx->HasInput("X"), "Input(X) of SequecceEnumerate operator should not be null."); @@ -62,6 +59,14 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker { }); AddAttr("pad_value", "(int) The enumerate sequence padding value.") .SetDefault(0); + AddAttr( + "all_kernels_must_compute_runtime_shape", + "(boolean, default true) " + "An attribute to speed up OperatorWithKernel::RunImpl." + "If true, all the kernels of this Op would compute runtime " + "shape, but skip infershape in runtime. Note that it is a temporal " + "attribute, please do DOT set it in python layer.") + .SetDefault(true); AddComment(R"DOC( Sequence Enumerate Operator. From 1f4aa7a202c8fe0e2418ad9424ba39b475b33994 Mon Sep 17 00:00:00 2001 From: Qiyang Min Date: Mon, 11 Mar 2019 22:24:05 +0800 Subject: [PATCH 049/230] Imperative remove all descs (#16045) * Remove Desc in Forward Pass * Refactor VarBase * Add dbg info * Only check type in imperative mode * Polish code and support optimizer test=develop * Fix stop gradient problem in PyLayer test=develop --- paddle/fluid/imperative/layer.cc | 65 +-- paddle/fluid/imperative/layer.h | 127 +++-- paddle/fluid/imperative/tracer.cc | 237 +++++---- paddle/fluid/imperative/tracer.h | 7 +- .../memory/allocation/legacy_allocator.cc | 1 + paddle/fluid/pybind/imperative.cc | 16 +- paddle/fluid/pybind/imperative.h | 3 + paddle/fluid/pybind/protobuf.cc | 92 +--- paddle/fluid/pybind/pybind.cc | 70 +-- paddle/fluid/pybind/pybind_boost_headers.h | 115 +++++ python/paddle/fluid/framework.py | 480 ++++++++++-------- python/paddle/fluid/imperative/layers.py | 2 +- python/paddle/fluid/imperative/tracer.py | 11 +- python/paddle/fluid/optimizer.py | 9 +- .../tests/unittests/test_imperative_basic.py | 2 +- .../tests/unittests/test_imperative_resnet.py | 4 +- 16 files changed, 724 insertions(+), 517 deletions(-) create mode 100644 paddle/fluid/pybind/pybind_boost_headers.h diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 012dfc1c7f..5530823b90 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -159,10 +159,9 @@ class Autograd { for (auto it : candidate->pre_ops_) { for (OpBase* pre_op : it.second) { if (!pre_op) continue; - VLOG(5) << "op dep " << candidate->op_desc_->Type() << " trace id " + VLOG(5) << "op dep " << candidate->Type() << " trace id " << candidate->trace_id_ << " <---- " << it.first << " <---- " - << pre_op->op_desc_->Type() << " trace id " - << pre_op->trace_id_; + << pre_op->Type() << " trace id " << pre_op->trace_id_; if (visited.find(pre_op) == visited.end()) { visited.insert(pre_op); queue.push_back(pre_op); @@ -180,10 +179,12 @@ std::unique_ptr VarBase::NewVarBase(const platform::Place& dst_place, PADDLE_ENFORCE(var_->IsInitialized(), "Variable must be initialized when getting numpy tensor"); - std::unique_ptr new_var(new VarBase()); + // TODO(minqiyang): change this after move unique_name generator to CXX + const framework::LoDTensor& self_tensor = var_->Get(); + std::unique_ptr new_var(new VarBase( + "Itmp", self_tensor.type(), self_tensor.dims(), dst_place, true, false)); framework::LoDTensor* tensor = new_var->var_->GetMutable(); - tensor->Resize(var_->Get().dims()); tensor->set_lod(var_->Get().lod()); if (blocking) { @@ -199,52 +200,62 @@ std::unique_ptr VarBase::NewVarBase(const platform::Place& dst_place, } if (platform::is_gpu_place(dst_place)) { - VLOG(3) << "copy tensor " << var_desc_->Name() << " from gpu"; + VLOG(3) << "copy tensor " << Name() << " from gpu"; } return new_var; } framework::LoDTensor& VarBase::GradValue() { - VLOG(3) << "get var grad " << var_desc_->Name(); + VLOG(3) << "get var grad " << Name(); + PADDLE_ENFORCE_NOT_NULL(grads_, + "Could not get grad value from no grad variable"); return *(grads_->var_->GetMutable()); } std::map> OpBase::ApplyGrad() { if (grad_op_descs_.empty() && backward_id_ <= 0) { - VLOG(3) << "op with no grad: " << op_desc_->Type(); + VLOG(3) << "op with no grad: " << Type(); return {}; } - VLOG(3) << "apply op grad: " << op_desc_->Type(); - std::vector grad_outputs; + VLOG(3) << "apply op grad: " << Type(); + std::vector tmp_grad_outputs; if (backward_id_ > 0) { VLOG(3) << "py_layer_grad"; - grad_outputs.resize(1); - grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] = + tmp_grad_outputs.resize(1); + tmp_grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] = PyLayer::ApplyGrad( backward_id_, grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]); } else { - grad_outputs.resize(grad_op_descs_.size()); - for (size_t k = 0; k < grad_op_descs_.size(); ++k) { + const size_t grad_op_count = grad_op_descs_.size(); + + tmp_grad_outputs.resize(grad_op_count); + for (size_t k = 0; k < grad_op_count; ++k) { framework::OpDesc* grad_op_desc = grad_op_descs_[k]; - VLOG(3) << "op grad " << grad_op_desc->Type(); - for (auto it : grad_output_vars_[k]) { - auto& outputs = grad_outputs[k][it.first]; + auto& grad_output_variable_map = grad_output_vars_[k]; + + VLOG(3) << "apply grad op " << grad_op_desc->Type(); + + // Allocate tmp grad output variable + for (auto it : grad_output_variable_map) { + auto& outputs = tmp_grad_outputs[k][it.first]; + outputs.reserve(it.second.size()); for (size_t i = 0; i < it.second.size(); ++i) { // Allocate a new variable Variable* tmp_var = new framework::Variable(); tmp_var->GetMutable(); - outputs.push_back(tmp_var); + outputs.emplace_back(tmp_var); } } - framework::RuntimeContext ctx(grad_input_vars_[k], grad_outputs[k]); + // Run grad op + framework::RuntimeContext ctx(grad_input_vars_[k], tmp_grad_outputs[k]); // No need to do compile time infer shape here. // grad_op_desc_->InferShape(*block_); - grad_op_desc->InferVarType(block_); + // grad_op_desc->InferVarType(block_); std::unique_ptr opbase = framework::OpRegistry::CreateOp(*grad_op_desc); @@ -260,9 +271,10 @@ std::map> OpBase::ApplyGrad() { } } + // Add tmp grad outputs to original grad vars for (size_t k = 0; k < grad_output_vars_.size(); ++k) { for (auto it : grad_output_vars_[k]) { - auto& outputs = grad_outputs[k][it.first]; + auto& outputs = tmp_grad_outputs[k][it.first]; auto& origin_outputs = it.second; PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); @@ -316,19 +328,14 @@ void PyLayer::RegisterFunc(int func_id, const py::object& py_func) { int PyLayer::NumFuncs() { return py_funcs_.size(); } -std::vector PyLayer::Apply(int func_id, - const std::vector& inputs) { +std::vector PyLayer::Apply(int func_id, + const std::vector& inputs) { std::vector invars; for (const VarBase* in : inputs) { invars.push_back(in->var_); } PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); - std::vector outvars = CallPythonFunc(py_funcs_[func_id], invars); - std::vector ret; - for (Variable* v : outvars) { - ret.push_back(new VarBase(v, new VarBase(true))); - } - return ret; + return CallPythonFunc(py_funcs_[func_id], invars); } std::vector PyLayer::ApplyGrad( diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 7a9f33dc1e..618a5b7a03 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -112,31 +112,53 @@ class OpBase; */ class VarBase { public: - VarBase() : VarBase(new framework::Variable(), new VarBase(true)) {} - - explicit VarBase(bool stop_gradient) - : VarBase(new framework::Variable(), - stop_gradient ? nullptr : new VarBase(true), stop_gradient) {} - - VarBase(framework::Variable* var, VarBase* grad) - : VarBase(var, grad, false) {} + // Internal interface, create VarBase from exist variable + VarBase(const std::string& name, framework::Variable* var, VarBase* grad, + bool stop_gradient) + : VarBase(name, var->Get().type(), + var->Get().dims(), + var->Get().place(), var, grad, + stop_gradient, false) {} + + // Python interface + VarBase(const std::string& name, const framework::proto::VarType::Type dtype, + const std::vector& shape, const platform::Place& place, + bool stop_gradient, bool persistable) + : VarBase(name, dtype, framework::make_ddim(shape), place, stop_gradient, + persistable) {} + + // Internal interface, create VarBase from with ddim + VarBase(const std::string& name, const framework::proto::VarType::Type dtype, + const framework::DDim& shape, const platform::Place& place, + bool stop_gradient, bool persistable) + : VarBase(name, dtype, shape, place, nullptr, nullptr, stop_gradient, + persistable) {} private: - VarBase(framework::Variable* var, VarBase* grad, bool stop_gradient) - : name_(), - var_desc_(nullptr), + VarBase(const std::string& name, framework::proto::VarType::Type dtype, + const framework::DDim& shape, const platform::Place& place, + framework::Variable* var, VarBase* grad, bool stop_gradient, + bool persistable) + : name_(name), + dtype_(dtype), + place_(place), var_(var), grads_(grad), - block_(nullptr), - persistable_(false), stop_gradient_(stop_gradient), + persistable_(persistable), pre_op_(nullptr), pre_op_out_name_(), - pre_op_out_idx_(-1) {} + pre_op_out_idx_(-1) { + if (!var_) { + var_ = new framework::Variable(); + auto tensor = var_->GetMutable(); + tensor->Resize(shape); + tensor->mutable_data(place_, dtype_); + } + } public: virtual ~VarBase() { - // TODO(minqiyang): remove var desc from block desc if (var_) { delete var_; var_ = nullptr; @@ -151,14 +173,30 @@ class VarBase { pre_op_out_idx_ = -1; } - inline OpBase* PreOp() const { return pre_op_; } - inline int PreOpOutIdx() const { return pre_op_out_idx_; } + inline void SetName(const std::string& name) { name_ = name; } + inline std::string Name() const { return name_; } + + inline std::vector Shape() const { + if (var_->IsInitialized()) { + return framework::vectorize(var_->Get().dims()); + } else { + return {}; + } + } + + inline framework::proto::VarType::Type DType() const { return dtype_; } inline void SetStopGradient(bool stop_gradient) { stop_gradient_ = stop_gradient; } inline bool IsStopGradient() const { return stop_gradient_; } + inline void SetPersistable(bool persistable) { persistable_ = persistable; } + inline bool IsPersistable() const { return persistable_; } + + inline OpBase* PreOp() const { return pre_op_; } + inline int PreOpOutIdx() const { return pre_op_out_idx_; } + void RunBackward(); inline void ResetPreOp(OpBase* op) { @@ -180,7 +218,7 @@ class VarBase { } void ClearGradient() { - VLOG(1) << "clear gradient of " << var_desc_->Name(); + VLOG(1) << "clear gradient of " << Name(); if (grads_ && grads_->var_ && grads_->var_->IsInitialized()) { auto grads_t = grads_->var_->GetMutable(); operators::math::set_constant( @@ -196,23 +234,20 @@ class VarBase { const bool blocking) const; inline std::string GradName() const { - PADDLE_ENFORCE( - var_desc_, - "Couldn't get gradient variable's name, please call backward() first"); - return string::Sprintf("%s@IGrad", var_desc_->Name()); + return string::Sprintf("%s@IGrad", Name()); } std::string name_; - framework::VarDesc* var_desc_; + framework::proto::VarType::Type dtype_; + platform::Place place_; framework::Variable* var_; VarBase* grads_; - framework::BlockDesc* block_; - bool persistable_; - private: bool stop_gradient_; + bool persistable_; + OpBase* pre_op_; std::string pre_op_out_name_; int pre_op_out_idx_; @@ -223,11 +258,11 @@ class VarBase { */ class PYBIND11_HIDDEN OpBase { public: - OpBase() - : op_desc_(nullptr), + OpBase(const std::string& type) + : type_(type), + trace_id_(-1), forward_id_(-1), backward_id_(-1), - trace_id_(-1), place_(platform::CPUPlace()), backward_hooks_() {} @@ -249,13 +284,34 @@ class PYBIND11_HIDDEN OpBase { std::map> ApplyGrad(); + inline std::string Type() const { return type_; } + inline std::string GradOpType(size_t index) const { + PADDLE_ENFORCE_NOT_NULL(grad_op_descs_[index]); + return grad_op_descs_[index]->Type(); + } + void RegisterBackwardHooks(const py::object& callable); void InvokeBackwardHooks(); - // One of `op_desc_` or `forward_id_` is set, not both. - // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_. - framework::OpDesc* op_desc_; + void TrackPreOp(const VarBase* inp_var, const std::string& inp_name) { + if (inp_var->PreOp() && !inp_var->IsStopGradient()) { + VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot " + << inp_name; + pre_ops_[inp_name].push_back(inp_var->PreOp()); + pre_ops_out_idx_[inp_name].push_back(inp_var->PreOpOutIdx()); + } else { + VLOG(3) << "no pre op in slot " << inp_name + << " input var stop_gradient: " << inp_var->IsStopGradient(); + pre_ops_[inp_name].push_back(nullptr); + // pre_ops_out_idx_[inp_name].push_back(-1); + } + } + + std::string type_; + // One of `trace_id_` or `forward_id_` is set, not both. + // For pure python PyLayer, use `forward_id_`, otherwise, use trace_id_. + int trace_id_; int forward_id_; // When has backward, one of `grad_op_descs_` or `backward_id_` is set, @@ -263,7 +319,6 @@ class PYBIND11_HIDDEN OpBase { // Note: each fwd op corresponds to a vector of bwd ops. std::vector grad_op_descs_; int backward_id_; - int trace_id_; platform::Place place_; @@ -277,8 +332,6 @@ class PYBIND11_HIDDEN OpBase { // Outputs to a vector of bwd ops. std::vector grad_output_vars_; - framework::BlockDesc* block_; - std::vector backward_hooks_; }; @@ -303,8 +356,8 @@ class PyLayer { static int NumFuncs(); - static std::vector Apply(int func_id, - const std::vector& inputs); + static std::vector Apply( + int func_id, const std::vector& inputs); static std::vector ApplyGrad( int func_id, const std::vector& inputs); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 0cb1676372..7ee92b4d8c 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -56,15 +56,19 @@ void CreateGradOp(const framework::OpDesc& op_desc, } } -void InitVar(framework::Variable* var, framework::Variable* grad_var, - platform::DeviceContext* dev_ctx) { +void InitGrad(VarBase* var, platform::DeviceContext* dev_ctx) { + PADDLE_ENFORCE_NOT_NULL(var, "Could not get valid var base"); PADDLE_ENFORCE_NOT_NULL(dev_ctx, "Could not get valid device from forward op"); - auto& var_t = var->Get(); - grad_var->GetMutable()->mutable_data( - var_t.dims(), dev_ctx->GetPlace()); - operators::math::set_constant( - *dev_ctx, grad_var->GetMutable(), 0.0); + + if (var->grads_ == nullptr) { + auto& var_t = var->var_->Get(); + var->grads_ = new VarBase(var->GradName(), framework::proto::VarType::FP32, + framework::vectorize(var_t.dims()), + dev_ctx->GetPlace(), true, false); + auto grad_t = var->grads_->var_->GetMutable(); + operators::math::set_constant(*dev_ctx, grad_t, 0.0); + } } platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) { @@ -85,6 +89,62 @@ platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) { return result; } +framework::VariableNameMap CreateInputVarNameMap( + const OpBase* op, const VarBasePtrMap& varbase_map) { + framework::VariableNameMap result; + + auto& info_map = framework::OpInfoMap::Instance(); + auto* op_info = info_map.GetNullable(op->Type()); + if (op_info == nullptr || op_info->proto_ == nullptr) { + return result; + } + + for (auto& in : op_info->Proto().inputs()) { + auto it = varbase_map.find(in.name()); + if (it == varbase_map.end()) { + PADDLE_ENFORCE(in.dispensable()); + result[in.name()] = {}; + } else { + auto var_vector = it->second; + std::vector args; + args.reserve(var_vector.size()); + for (VarBase* var_base : var_vector) { + args.emplace_back(var_base->Name()); + } + result[in.name()] = args; + } + } + return result; +} + +framework::VariableNameMap CreateOutputVarNameMap( + const OpBase* op, const VarBasePtrMap& varbase_map) { + framework::VariableNameMap result; + + auto& info_map = framework::OpInfoMap::Instance(); + auto* op_info = info_map.GetNullable(op->Type()); + if (op_info == nullptr || op_info->proto_ == nullptr) { + return result; + } + + for (auto& out : op_info->Proto().outputs()) { + auto it = varbase_map.find(out.name()); + if (it == varbase_map.end()) { + PADDLE_ENFORCE(out.dispensable()); + result[out.name()] = {}; + } else { + auto var_vector = it->second; + std::vector args; + args.reserve(var_vector.size()); + for (VarBase* var_base : var_vector) { + args.emplace_back(var_base->Name()); + } + result[out.name()] = args; + } + } + return result; +} + Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { if (!FLAGS_tracer_profile_fname.empty()) { std::call_once(gTracerProfileOnce, [] { @@ -101,7 +161,7 @@ Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, const VarBasePtrMap& outputs, - framework::BlockDesc* block, + framework::AttributeMap attrs_map, const platform::Place expected_place, const bool stop_gradient) { #ifdef WITH_GPERFTOOLS @@ -110,40 +170,27 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, } #endif - std::map vars; - - framework::OpDesc* op_desc = op->op_desc_; - VLOG(3) << "tracer tracing " << op_desc->Type() << " trace id " - << op->trace_id_; - op_desc->InferShape(*block); - op_desc->InferVarType(block); - - std::unique_ptr op_base = - framework::OpRegistry::CreateOp(*op_desc); - framework::VariableValueMap invars_map; framework::VariableValueMap outvars_map; + // Construct input_vars_map and output_vars_map + std::map current_vars_map; op->input_vars_ = inputs; for (auto it : op->input_vars_) { auto& invars = invars_map[it.first]; invars.reserve(it.second.size()); for (VarBase* inp : it.second) { - PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", - op->op_desc_->Type(), inp->var_desc_->Name()); + PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", op->Type(), + inp->Name()); invars.emplace_back(inp->var_); - vars[inp->var_desc_->Name()] = inp; - if (inp->PreOp() && !inp->IsStopGradient()) { - op->pre_ops_[it.first].push_back(inp->PreOp()); - op->pre_ops_out_idx_[it.first].push_back(inp->PreOpOutIdx()); - VLOG(3) << "add pre op " << inp->PreOp()->op_desc_->Type(); - } else { - op->pre_ops_[it.first].push_back(nullptr); + op->TrackPreOp(inp, it.first); + if (!stop_gradient) { + current_vars_map[inp->Name()] = inp; } - VLOG(3) << "input vname " << inp->var_desc_->Name() << " " - << inp->var_->IsInitialized() << " stop_gradient " - << inp->IsStopGradient(); + VLOG(3) << "input var name: " << inp->Name() + << " inited: " << inp->var_->IsInitialized() + << " stop_grad: " << inp->IsStopGradient(); } } @@ -152,25 +199,38 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, auto& outvars = outvars_map[it.first]; const std::vector& outputs = it.second; outvars.reserve(outputs.size()); - for (size_t i = 0; i < outputs.size(); ++i) { + for (size_t i = 0U; i < outputs.size(); ++i) { VarBase* out = outputs[i]; outvars.emplace_back(out->var_); - vars[out->var_desc_->Name()] = out; - - framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name()); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - out->var_->GetMutable(); - } else { - LOG(ERROR) << "tracer doesn't support yet"; - } out->TrackPreOp(op, it.first, i, stop_gradient); + if (!stop_gradient) { + current_vars_map[out->Name()] = out; + } - VLOG(3) << "output vname " << out->var_desc_->Name() << " " - << out->var_->IsInitialized(); + VLOG(3) << "input var name: " << out->Name() + << " inited: " << out->var_->IsInitialized() + << " stop_grad: " << out->IsStopGradient(); } } - VLOG(3) << "tracer running " << op_desc->Type(); + // Check attrs and create op + framework::VariableNameMap invars_name_map = + CreateInputVarNameMap(op, inputs); + framework::VariableNameMap outvars_name_map = + CreateOutputVarNameMap(op, outputs); + + auto& info = framework::OpInfoMap::Instance().Get(op->Type()); + if (info.Checker() != nullptr) { + info.Checker()->Check(&attrs_map); + } + + std::unique_ptr op_base = + framework::OpRegistry::CreateOp(op->Type(), invars_name_map, + outvars_name_map, attrs_map); + + // TODO(minqiyang): Support infer var type in imperative mode + // Run forward op + VLOG(3) << "tracer running " << op->Type(); framework::RuntimeContext ctx(invars_map, outvars_map); // TODO(panyx0718): Cache p. @@ -186,36 +246,44 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, framework::ExecutionContext(prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx, prepared_op.kernel_configs)); + // construct backward op std::set vars_saved_for_backward; - if (!stop_gradient) { + VLOG(5) << "start construct backward op"; + + // construct grad op descs + std::unique_ptr fwd_op_desc(new framework::OpDesc( + op->Type(), invars_name_map, outvars_name_map, attrs_map)); std::unique_ptr> grad_to_var( new std::unordered_map()); - CreateGradOp(*op_desc, {}, {block}, &op->grad_op_descs_, grad_to_var.get()); + // NOTE(minqiyang): We don't support control flow op in imperative now + // Add grad_block_ when we want to support it + CreateGradOp(*fwd_op_desc, {}, {}, &op->grad_op_descs_, grad_to_var.get()); - op->grad_input_vars_.resize(op->grad_op_descs_.size()); - op->grad_output_vars_.resize(op->grad_op_descs_.size()); + VLOG(5) << "create grad op desc: " << op->grad_op_descs_[0]->Type(); - for (size_t i = 0; i < op->grad_op_descs_.size(); ++i) { + const size_t grad_op_count = op->grad_op_descs_.size(); + + op->grad_input_vars_.resize(grad_op_count); + op->grad_output_vars_.resize(grad_op_count); + + for (size_t i = 0; i < grad_op_count; ++i) { framework::OpDesc* grad_op_desc = op->grad_op_descs_[i]; for (auto it : grad_op_desc->Inputs()) { auto& grad_in_vars = op->grad_input_vars_[i][it.first]; + grad_in_vars.reserve(it.second.size()); for (const std::string& grad_invar : it.second) { - block->FindRecursiveOrCreateVar(grad_invar); auto var_it = grad_to_var->find(grad_invar); if (var_it == grad_to_var->end()) { - auto fwd_var_it = vars.find(grad_invar); - PADDLE_ENFORCE(fwd_var_it != vars.end()); + auto fwd_var_it = current_vars_map.find(grad_invar); + PADDLE_ENFORCE(fwd_var_it != current_vars_map.end()); // Forward inputs or outputs. - grad_in_vars.push_back(fwd_var_it->second->var_); + grad_in_vars.emplace_back(fwd_var_it->second->var_); } else { - VarBase* var = vars[var_it->second]; - if (!var->grads_->var_->IsInitialized()) { - InitVar(var->var_, var->grads_->var_, - prepared_op.GetDeviceContext()); - } + VarBase* var = current_vars_map[var_it->second]; + InitGrad(var, prepared_op.GetDeviceContext()); // Douts. - grad_in_vars.push_back(var->grads_->var_); + grad_in_vars.emplace_back(var->grads_->var_); } vars_saved_for_backward.insert(it.first); @@ -225,48 +293,48 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, for (auto it : grad_op_desc->Outputs()) { auto& grad_out_vars = op->grad_output_vars_[i][it.first]; for (const std::string& grad_outvar : it.second) { - block->FindRecursiveOrCreateVar(grad_outvar); auto var_it = grad_to_var->find(grad_outvar); PADDLE_ENFORCE(var_it != grad_to_var->end(), "Could not found the grad op output var, should this " "operator %s's stop gradient be True", - op_desc->Type()); - VarBase* var = vars[var_it->second]; - if (!var->grads_->var_->IsInitialized()) { - InitVar(var->var_, var->grads_->var_, - prepared_op.GetDeviceContext()); - } + op->Type()); + VarBase* var = current_vars_map[var_it->second]; + InitGrad(var, prepared_op.GetDeviceContext()); grad_out_vars.push_back(var->grads_->var_); } } } } - op->block_ = block; return vars_saved_for_backward; } std::vector Tracer::PyTrace(OpBase* op, const std::vector& inputs, bool stop_gradient) { - VLOG(3) << "py_trace"; + VLOG(3) << "py_trace " << op->Type(); + op->input_vars_[PyLayer::kFwdInp] = inputs; - op->output_vars_[PyLayer::kFwdOut] = PyLayer::Apply(op->forward_id_, inputs); + + std::vector ret_vars = + PyLayer::Apply(op->forward_id_, inputs); + for (VarBase* inp : inputs) { - if (inp->PreOp() && !inp->IsStopGradient()) { - op->pre_ops_[PyLayer::kFwdInp].push_back(inp->PreOp()); - op->pre_ops_out_idx_[PyLayer::kFwdInp].push_back(inp->PreOpOutIdx()); - } else { - op->pre_ops_[PyLayer::kFwdInp].push_back(nullptr); - } + op->TrackPreOp(inp, PyLayer::kFwdInp); } - auto& outputs = op->output_vars_[PyLayer::kFwdOut]; - for (size_t i = 0; i < outputs.size(); ++i) { - VarBase* out = outputs[i]; + std::vector& outputs = op->output_vars_[PyLayer::kFwdOut]; + outputs.reserve(ret_vars.size()); + for (size_t i = 0U; i != ret_vars.size(); ++i) { + framework::Variable* v = ret_vars[i]; + VarBase* out = new VarBase(string::Sprintf("%s_out_%d", op->Type(), i), v, + nullptr, stop_gradient); + outputs.emplace_back(out); out->TrackPreOp(op, PyLayer::kFwdOut, i, stop_gradient); } + if (!stop_gradient) { + VLOG(5) << "start construct backward op"; op->grad_input_vars_.resize(1); op->grad_output_vars_.resize(1); auto& grad_input_vars = @@ -281,23 +349,16 @@ std::vector Tracer::PyTrace(OpBase* op, grad_input_vars.push_back(out->var_); } + // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now platform::CPUPlace place; for (VarBase* out : outputs) { + InitGrad(out, platform::DeviceContextPool::Instance().Get(place)); grad_input_vars.push_back(out->grads_->var_); - if (!grad_input_vars.back()->IsInitialized()) { - // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now - InitVar(out->var_, grad_input_vars.back(), - platform::DeviceContextPool::Instance().Get(place)); - } } - for (const VarBase* inp : inputs) { + for (VarBase* inp : inputs) { + InitGrad(inp, platform::DeviceContextPool::Instance().Get(place)); grad_output_vars.push_back(inp->grads_->var_); - if (!grad_output_vars.back()->IsInitialized()) { - // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now - InitVar(inp->var_, grad_output_vars.back(), - platform::DeviceContextPool::Instance().Get(place)); - } } } return outputs; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 8a0267c37f..7b65d55e9e 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include "paddle/fluid/framework/op_desc.h" @@ -34,7 +36,8 @@ void CreateGradOp(const framework::OpDesc& op_desc, framework::OpDesc** grad_op_desc, std::unordered_map* grad_to_var); -void InitVar(framework::Variable* var, framework::Variable* grad_var); +void InitVar(const VarBase* var, framework::Variable* grad_var, + platform::DeviceContext* dev_ctx); platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs); @@ -46,7 +49,7 @@ class Tracer { std::set Trace(OpBase* op, const VarBasePtrMap& inputs, const VarBasePtrMap& outputs, - framework::BlockDesc* block, + framework::AttributeMap attrs_map, const platform::Place expected_place, const bool stop_gradient = false); diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 1936f9d4cd..a97d54a191 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/memory/allocation/legacy_allocator.h" +#include #include #include #include diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index aeabed19ab..6bbda69297 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -13,10 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/pybind/imperative.h" + +#include +#include +#include +#include + #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/pybind/pybind_boost_headers.h" + namespace paddle { namespace pybind { @@ -31,20 +39,20 @@ void BindTracer(pybind11::module* m) { [](imperative::Tracer& self, imperative::OpBase* op, const imperative::VarBasePtrMap& inputs, const imperative::VarBasePtrMap& outputs, - framework::BlockDesc* block, + framework::AttributeMap attrs_map, const platform::CPUPlace expected_place, const bool stop_gradient = false) { - return self.Trace(op, inputs, outputs, block, expected_place, + return self.Trace(op, inputs, outputs, attrs_map, expected_place, stop_gradient); }) .def("trace", [](imperative::Tracer& self, imperative::OpBase* op, const imperative::VarBasePtrMap& inputs, const imperative::VarBasePtrMap& outputs, - framework::BlockDesc* block, + framework::AttributeMap attrs_map, const platform::CUDAPlace expected_place, const bool stop_gradient = false) { - return self.Trace(op, inputs, outputs, block, expected_place, + return self.Trace(op, inputs, outputs, attrs_map, expected_place, stop_gradient); }) .def("py_trace", &imperative::Tracer::PyTrace, diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h index 8c48b2a715..8496cbfcb1 100644 --- a/paddle/fluid/pybind/imperative.h +++ b/paddle/fluid/pybind/imperative.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include +#include #include #include "paddle/fluid/imperative/layer.h" #include "pybind11/pybind11.h" @@ -36,6 +37,8 @@ class Layer : public imperative::Layer { class PYBIND11_HIDDEN PyOpBase : public imperative::OpBase { public: using imperative::OpBase::OpBase; // Inherit constructors + + PyOpBase(const std::string& name) : OpBase(name) {} }; class PyVarBase : public imperative::VarBase { diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index e729be4a95..7b5e417504 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -23,97 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/var_desc.h" -// Cast boost::variant for PyBind. -// Copy from -// https://github.com/pybind/pybind11/issues/576#issuecomment-269563199 -namespace pybind11 { -namespace detail { - -#if !defined(PYBIND11_HIDDEN) -#ifdef _WIN32 -#define PYBIND11_HIDDEN __declspec(dllexport) -#else -#define PYBIND11_HIDDEN __attribute__((visibility("hidden"))) -#endif -#endif - -// Can be replaced by a generic lambda in C++14 -struct PYBIND11_HIDDEN paddle_variant_caster_visitor - : public boost::static_visitor { - return_value_policy policy; - handle parent; - - paddle_variant_caster_visitor(return_value_policy policy, handle parent) - : policy(policy), parent(parent) {} - - template - handle operator()(T const &src) const { - return make_caster::cast(src, policy, parent); - } -}; - -template -struct paddle_variant_caster; - -template