diff --git a/Dockerfile b/Dockerfile index 84e1edbee9..716b164ab8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -94,52 +94,52 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. -RUN pip3 install -U wheel && \ - pip3 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ - pip3.6 install -U wheel && \ - pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \ - pip3.7 install -U wheel && \ - pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \ +RUN pip3 --no-cache-dir install -U wheel && \ + pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.6 --no-cache-dir install -U wheel && \ + pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.7 --no-cache-dir install -U wheel && \ + pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ easy_install -U pip && \ - pip install -U pip setuptools wheel && \ - pip install -U docopt PyYAML sphinx==1.5.6 && \ - pip install sphinx-rtd-theme==0.1.9 recommonmark - -RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3 install opencv-python && \ - pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.6 install opencv-python && \ - pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.7 install opencv-python && \ - pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip install opencv-python + pip --no-cache-dir install -U pip setuptools wheel && \ + pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark + +RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3 --no-cache-dir install opencv-python && \ + pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.6 --no-cache-dir install opencv-python && \ + pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.7 --no-cache-dir install opencv-python && \ + pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip --no-cache-dir install opencv-python #For docstring checker -RUN pip3 install pylint pytest astroid isort -RUN pip3.6 install pylint pytest astroid isort -RUN pip3.7 install pylint pytest astroid isort -RUN pip install pylint pytest astroid isort LinkChecker +RUN pip3 --no-cache-dir install pylint pytest astroid isort +RUN pip3.6 --no-cache-dir install pylint pytest astroid isort +RUN pip3.7 --no-cache-dir install pylint pytest astroid isort +RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker COPY ./python/requirements.txt /root/ -RUN pip3 install -r /root/requirements.txt -RUN pip3.6 install -r /root/requirements.txt -RUN pip3.7 install -r /root/requirements.txt -RUN pip install -r /root/requirements.txt +RUN pip3 --no-cache-dir install -r /root/requirements.txt +RUN pip3.6 --no-cache-dir install -r /root/requirements.txt +RUN pip3.7 --no-cache-dir install -r /root/requirements.txt +RUN pip --no-cache-dir install -r /root/requirements.txt # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 -RUN apt-get install -y libssl-dev libffi-dev -RUN pip3 install certifi urllib3[secure] -RUN pip3.6 install certifi urllib3[secure] -RUN pip3.7 install certifi urllib3[secure] -RUN pip install certifi urllib3[secure] +RUN apt-get install -y libssl-dev libffi-dev && apt-get clean -y +RUN pip3 --no-cache-dir install certifi urllib3[secure] +RUN pip3.6 --no-cache-dir install certifi urllib3[secure] +RUN pip3.7 --no-cache-dir install certifi urllib3[secure] +RUN pip --no-cache-dir install certifi urllib3[secure] # Install woboq_codebrowser to /woboq diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index c29375cd05..a9b99e9ab8 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -106,10 +106,10 @@ else(WIN32) SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB} COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} - DEPENDS mkldnn) + DEPENDS mkldnn shared_mkldnn) endif(WIN32) ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB}) - +ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn) IF(WITH_C_API) INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib) ENDIF() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 48279bc809..3e11d332ff 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -136,7 +136,7 @@ if (WITH_MKLDNN) copy(mkldnn_lib SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} DSTS ${dst_dir} ${dst_dir}/lib - DEPS mkldnn + DEPS mkldnn_shared_lib ) endif () diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 86096d4fea..566dc75fda 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -57,46 +57,43 @@ int main() return 0; }" SSE3_FOUND) -# disable AVX by default on windows -if(NOT WIN32) - # Check AVX - set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) - set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); - __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); - __m256 result = _mm256_add_ps (a, b); - return 0; - }" AVX_FOUND) +# Check AVX +set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) +set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); + __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 result = _mm256_add_ps (a, b); + return 0; +}" AVX_FOUND) - # Check AVX 2 - set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) - set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); - __m256i result = _mm256_abs_epi32 (a); - return 0; - }" AVX2_FOUND) +# Check AVX 2 +set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) +set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); + __m256i result = _mm256_abs_epi32 (a); + return 0; +}" AVX2_FOUND) - # Check AVX512F - set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) - set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, - 13, -5, 6, -7, 9, 2, -6, 3); - __m512i result = _mm512_abs_epi32 (a); - return 0; - }" AVX512F_FOUND) -endif(NOT WIN32) +# Check AVX512F +set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) +set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, + 13, -5, 6, -7, 9, 2, -6, 3); + __m512i result = _mm512_abs_epi32 (a); + return 0; +}" AVX512F_FOUND) set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/paddle/contrib/float16/float16_transpiler.py b/paddle/contrib/float16/float16_transpiler.py index 8d95dc0591..500f64bed9 100644 --- a/paddle/contrib/float16/float16_transpiler.py +++ b/paddle/contrib/float16/float16_transpiler.py @@ -60,7 +60,7 @@ class Float16Transpiler: raise TypeError("place should be as CPUPlace/CUDAPlace type") if scope is None: scope = global_scope() - if not isinstance(scope, core.Scope): + if not isinstance(scope, core._Scope): raise TypeError("scope should be as Scope type or None") self.scope = scope diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index b6974c6af2..9872631553 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -464,11 +464,7 @@ paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, ke paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) -paddle.fluid.Scope.__init__ __init__(self: paddle.fluid.core.Scope) -> None -paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None -paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable -paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope -paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable +paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None) paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None) paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 412bc9cbe8..21e640cdf2 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -7,27 +7,17 @@ function(windows_symbolic TARGET) cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH}) foreach(src ${windows_symbolic_SRCS}) - get_filename_component(src ${src} NAME_WE) - if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu) - message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") - endif() - -#only copy the xx.cu to.xx.cu when the content are modified - set(copy_flag 1) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu TARGET_STR) - if (SOURCE_STR STREQUAL TARGET_STR) - set(copy_flag 0) - endif() - endif() - if (copy_flag) - add_custom_command(OUTPUT .${src}.cu - COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu" - COMMENT "create hidden file of ${src}.cu") - endif(copy_flag) - add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) + get_filename_component(src ${src} NAME_WE) + if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu) + message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") + endif() + + file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc) + + add_custom_command(OUTPUT ${final_path}/.${src}.cu + COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu" + COMMENT "create hidden file of ${src}.cu") + add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) endforeach() endfunction() @@ -48,10 +38,10 @@ if(WITH_GPU) nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context) add_dependencies(tensor tensor_util) else() - nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context) + nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context ) endif(WIN32) else() - cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context) + cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context ) endif() cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) @@ -78,17 +68,23 @@ cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memor cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) cc_test(reader_test SRCS reader_test.cc DEPS reader) -cc_test(variable_test SRCS variable_test.cc) - cc_library(threadpool SRCS threadpool.cc DEPS enforce) cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) -cc_library(scope SRCS scope.cc DEPS glog threadpool) +cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto) +if (WITH_GPU) + target_link_libraries(var_type_traits dynload_cuda) +endif() +cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits) + +cc_library(scope SRCS scope.cc DEPS glog threadpool var_type_traits) +cc_library(scope_pool SRCS scope_pool.cc DEPS scope) cc_test(scope_test SRCS scope_test.cc DEPS scope) +cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits) cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) nv_test(data_device_transform_test SRCS data_device_transform_test.cu - DEPS operator op_registry device_context math_function) + DEPS operator op_registry device_context math_function scope) if(WITH_GPU) if (WIN32) diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h index d9c76881b7..67054eccb3 100644 --- a/paddle/fluid/framework/attribute.h +++ b/paddle/fluid/framework/attribute.h @@ -165,7 +165,7 @@ template class GreaterThanChecker { public: explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} - void operator()(T& value) const { + void operator()(const T& value) const { PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails."); } @@ -177,7 +177,7 @@ template class EqualGreaterThanChecker { public: explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} - void operator()(T& value) const { + void operator()(const T& value) const { PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails."); } @@ -193,7 +193,7 @@ class DefaultValueSetter { public: explicit DefaultValueSetter(T default_value) : default_value_(default_value) {} - void operator()(T& value) const { value = default_value_; } // NOLINT + void operator()(T* value) const { *value = default_value_; } private: T default_value_; @@ -203,7 +203,7 @@ template class EnumInContainer { public: explicit EnumInContainer(const std::unordered_set& c) : container_(c) {} - void operator()(T& val) const { + void operator()(const T& val) const { PADDLE_ENFORCE(container_.find(val) != container_.end(), "Value %s is not in enum container %s", val, ContainerDebugString()); @@ -232,7 +232,8 @@ class EnumInContainer { // an attribute can have more than one limits template class TypedAttrChecker { - typedef std::function ValueChecker; + typedef std::function DefaultValueChecker; + typedef std::function ValueChecker; public: explicit TypedAttrChecker(const std::string& attr_name) @@ -268,17 +269,17 @@ class TypedAttrChecker { return *this; } - void operator()(AttributeMap& attr_map) const { // NOLINT - if (!attr_map.count(attr_name_)) { + void operator()(AttributeMap* attr_map) const { + if (!attr_map->count(attr_name_)) { // user do not set this attr PADDLE_ENFORCE(!default_value_setter_.empty(), "Attribute '%s' is required!", attr_name_); // default_value_setter_ has no more than one element T val; - (default_value_setter_[0])(val); - attr_map[attr_name_] = val; + (default_value_setter_[0])(&val); + (*attr_map)[attr_name_] = val; } - Attribute& attr = attr_map.at(attr_name_); + Attribute& attr = attr_map->at(attr_name_); ExtractAttribute extract_attr(attr_name_); T* attr_value = extract_attr(attr); for (const auto& checker : value_checkers_) { @@ -289,12 +290,12 @@ class TypedAttrChecker { private: std::string attr_name_; std::vector value_checkers_; - std::vector default_value_setter_; + std::vector default_value_setter_; }; // check whether op's all attributes fit their own limits class OpAttrChecker { - typedef std::function AttrChecker; + typedef std::function AttrChecker; public: template @@ -304,7 +305,7 @@ class OpAttrChecker { return *(checker.target>()); } - void Check(AttributeMap& attr_map) const { // NOLINT + void Check(AttributeMap* attr_map) const { for (const auto& checker : attr_checkers_) { checker(attr_map); } diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index c9ec5e7a7b..96a2f9250f 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index abacb11e3b..03fbfd7f24 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -88,7 +88,7 @@ void EagerDeletionOpHandle::RunImpl() { } } else { PADDLE_THROW("Type %s of %s is not supported eager deletion", - var->Type().name(), name); + framework::ToTypeName(var->Type()), name); } } diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 949510e037..872bc5d654 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -120,6 +120,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( ClearFetchOp(graph_.get(), &fetch_ops); return fetches; } + void FastThreadedSSAGraphExecutor::RunOpAsync( std::unordered_map> *op_deps, OpHandleBase *op, diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 036cef1daa..5b9a818117 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -42,6 +42,12 @@ namespace { typedef std::vector GraphOps; const char kGraphOps[] = "ops"; +bool OpHaveRole(const ir::Node &node, const framework::OpRole &role) { + return boost::get( + node.Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == + static_cast(role); +} + void PolishGraphToSupportDataHazards(ir::Graph *graph) { for (auto &var_map : graph->Get(kGraphVars)) { for (auto &name_pair : var_map) { @@ -147,6 +153,7 @@ void MultiDevSSAGraphBuilder::Init() const { #endif balance_vars_.resize(places_.size(), 0); + if (strategy_.enable_data_balance_ && places_.size() == 1) { LOG(WARNING) << "It is no need to enable data balance when there is only " "one place. enable_data_balance is set to False."; @@ -154,145 +161,16 @@ void MultiDevSSAGraphBuilder::Init() const { } } -void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, - ir::Node *node, - size_t place_id) const { - auto p = places_[place_id]; - auto *op_handle = result->Get(kGraphOps).back(); - op_handle->SetDeviceContext(p, - platform::DeviceContextPool::Instance().Get(p)); - - for (ir::Node *input : node->inputs) { - VarHandle *var = CreateOrGetLatestVarHandle(result, input, p, place_id); - op_handle->AddInput(var); - } - - for (ir::Node *output : node->outputs) { - ir::Node *new_node = nullptr; - if (output->Var()) { - new_node = result->CreateVarNode(output->Var()); - } else { - new_node = - result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); - } - CreateOpOutput(result, op_handle, new_node, p, place_id); - } -} - -std::vector MultiDevSSAGraphBuilder::FindDistTrainSendVars( - const std::vector &nodes) const { - std::vector send_vars; - // since parameters are all in block 0, - // it's enough to only scan send ops in block 0 - for (auto &node : nodes) { - OpDesc *op = node->Op(); - // TODO(Yancey1989): use a graceful method to find send op, - // instead of the the hard code string - if (op->Type() == "send") { - auto op_vars = op->InputArgumentNames(); - send_vars.reserve(send_vars.size() + - std::distance(op_vars.begin(), op_vars.end())); - send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end()); - } - } - return send_vars; -} - -std::vector MultiDevSSAGraphBuilder::FindDistTrainRecvVars( - const std::vector &nodes) const { - std::vector recv_vars; - for (auto &node : nodes) { - OpDesc *op = node->Op(); - // TODO(Yancey1989): use a graceful method to find recv op, - // instead of the hard code string - if (op->Type() == "recv") { - auto op_vars = op->OutputArgumentNames(); - recv_vars.reserve(recv_vars.size() + - std::distance(op_vars.begin(), op_vars.end())); - recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end()); - } - } - return recv_vars; -} - -size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( - const std::vector &var_names) const { - int64_t numel_sum = 0; - for (auto var_name : var_names) { - if (all_vars_.find(var_name) == all_vars_.end()) continue; - auto var_desc = all_vars_.at(var_name); - PADDLE_ENFORCE_NOT_NULL(var_desc); - auto dim = framework::make_ddim(var_desc->GetShape()); - int64_t numel = framework::product(dim); - PADDLE_ENFORCE_GT(numel, 0); - numel_sum += numel; - } - - auto smallest = - std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); - size_t dev_id = - static_cast(std::distance(std::begin(balance_vars_), smallest)); - balance_vars_[dev_id] += numel_sum; - return dev_id; -} - -// Topology sort the graph nodes from inputs to outputs. -// Since SSAGraphBuilder depends on forward/backward nodes to assign devices -// to parameter/gradients before optimizer ops, topo sort is insufficient. ( -// some optimizer ops might not depend on any nodes), we manually move all -// optimizer nodes after last backward nodes. -// However, the assumption by SSAGraphBuilder should be relaxed in the future. -std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { - std::vector ret = ir::TopologySortOperations(graph); - size_t last_backward = 0; - for (size_t i = 0; i < ret.size(); ++i) { - if (boost::get( - ret[i]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kBackward)) { - last_backward = i; - } - } - - std::vector optimize_ops; - std::vector sorted_ret; - for (size_t i = 0; i < ret.size(); ++i) { - if (i < last_backward) { - if (static_cast(boost::get(ret[i]->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kOptimize))) { - optimize_ops.push_back(ret[i]); - } else { - sorted_ret.push_back(ret[i]); - } - } else if (i == last_backward) { - sorted_ret.push_back(ret[i]); - // Verify that no operations before optimize ops depends on optimize ops. - std::unordered_set optimize_set(optimize_ops.begin(), - optimize_ops.end()); - for (ir::Node *n : sorted_ret) { - for (ir::Node *in : n->inputs) { - for (ir::Node *pre_n : in->inputs) { - PADDLE_ENFORCE(optimize_set.find(pre_n) == optimize_set.end(), - "optimize operations cannot be depended by forward " - "or backward node %s -> %s", - pre_n->Name(), n->Name()); - } - } - } - sorted_ret.insert(sorted_ret.end(), optimize_ops.begin(), - optimize_ops.end()); - } else { - sorted_ret.push_back(ret[i]); - } - } - return sorted_ret; -} - std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( std::unique_ptr graph) const { Init(); // Give the topology sort order and rebuild the graph structure. - std::vector sorted_ops = SortOpsAndDelayOptimizeOp(*graph); + std::vector sorted_ops = ir::TopologySortOperations(*graph); + + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { + sorted_ops = SortForReduceMode(sorted_ops); + } + auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; @@ -303,31 +181,22 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( all_vars_.emplace(node->Name(), node->Var()); } } - std::unordered_set og_has_been_broadcast; // We cannot invoke resize. It is a bug of GCC 4.8 result.Set(kGraphVars, new GraphVars(places_.size())); result.Set(kGraphDepVars, new GraphDepVars); result.Set(kGraphOps, new GraphOps); - // find send/recv vars so that we can place the distributed training - // related op in the place 0 - auto send_vars = FindDistTrainSendVars(sorted_ops); - auto recv_vars = FindDistTrainRecvVars(sorted_ops); - std::vector> bcast_var_name_set; bcast_var_name_set.resize(places_.size()); - size_t cur_device_id = 0; bool is_forwarding = true; bool is_dist_train = false; std::unordered_map sharded_var_device; for (ir::Node *node : sorted_ops) { - if (boost::get( - node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kRPC)) { + if (OpHaveRole(*node, OpRole::kRPC)) { int op_dev_id = CreateRPCOp(&result, node, &sharded_var_device); PADDLE_ENFORCE(op_dev_id != -1, "Can not schedule the RPC operator to the right place."); @@ -341,9 +210,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } } is_dist_train = true; - } else if (boost::get(node->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kDist)) { + } else if (OpHaveRole(*node, OpRole::kDist)) { int op_dev_id = CreateDistTrainOp(&result, node, &sharded_var_device); if (node->Op()->Type() == "concat") { auto origin_param_name = node->Op()->OutputArgumentNames()[0]; @@ -355,7 +222,9 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( BuildStrategy::GradientScaleStrategy::kCustomized) { // TODO(paddle-dev): Why is there no input for this op_handle? auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; - CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0]); + auto out_dtype = all_vars_.at(loss_grad_name)->GetDataType(); + CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0], + out_dtype); } // This assumes the backward generating code will ensure IsScaleLossOp // is true only for the op that scale the final scalar loss. @@ -363,7 +232,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( // the block. is_forwarding = false; } else { - int op_dev_id = GetOpDeviceID(result, node, sharded_var_device); + int op_dev_id = GetOpDeviceID(node, sharded_var_device); if (op_dev_id != -1) { // This op only runs on one specific device. CreateComputationalOp(&result, node, op_dev_id); for (ir::Node *n : node->outputs) { @@ -383,47 +252,48 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. - if (static_cast(boost::get(node->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kBackward))) { - try { - auto backward_vars = boost::get>( - node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); - - for (size_t i = 0; i < backward_vars.size(); i += 2) { - auto &p_name = backward_vars[i]; - auto &g_name = backward_vars[i + 1]; - VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; - - switch (strategy_.reduce_) { - case BuildStrategy::ReduceStrategy::kReduce: - cur_device_id = GetAppropriateDeviceID({g_name}); - CreateReduceOp(&result, g_name, cur_device_id); - sharded_var_device.emplace(g_name, cur_device_id); - if (!is_dist_train) { - bcast_var_name_set[cur_device_id].emplace(p_name); - } - break; - case BuildStrategy::ReduceStrategy::kAllReduce: - if (IsSparseGradient(g_name)) { - CreateReduceOp(&result, g_name, 0); - CreateBroadcastOp(&result, g_name, 0); - } else { - InsertAllReduceOp(&result, g_name); - } - break; - default: - LOG(FATAL) << "Unknown reduce strategy "; - break; - } + try { + auto backward_vars = boost::get>( + node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &p_name = backward_vars[i]; + auto &g_name = backward_vars[i + 1]; + VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; + size_t cur_device_id = -1; + switch (strategy_.reduce_) { + case BuildStrategy::ReduceStrategy::kReduce: + cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(&result, g_name, cur_device_id); + sharded_var_device.emplace(g_name, cur_device_id); + if (!is_dist_train) { + bcast_var_name_set[cur_device_id].emplace(p_name); + } + break; + case BuildStrategy::ReduceStrategy::kAllReduce: + if (IsSparseGradient(g_name)) { + CreateReduceOp(&result, g_name, 0); + CreateBroadcastOp(&result, g_name, 0); + } else { + InsertAllReduceOp(&result, g_name); + } + break; + default: + LOG(FATAL) << "Unknown reduce strategy "; + break; } - } catch (boost::bad_get e) { } + } catch (boost::bad_get e) { } } } @@ -467,12 +337,108 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( return graph; } -bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { - PADDLE_ENFORCE(all_vars_.count(og) != 0); - if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { - return true; +std::vector MultiDevSSAGraphBuilder::SortForReduceMode( + const std::vector &topo_ops) const { + std::unordered_map sharded_var_device; + std::vector sorted_ops; + std::unordered_map> delayed_op; + sorted_ops.reserve(topo_ops.size()); + + auto insert_delayed_op = [&](const std::string &var_name, int dev_id) { + sharded_var_device.emplace(var_name, dev_id); + if (delayed_op.count(var_name)) { + auto &ops = delayed_op.at(var_name); + sorted_ops.insert(sorted_ops.end(), ops.begin(), ops.end()); + delayed_op.at(var_name).clear(); + } + }; + + for (ir::Node *node : topo_ops) { + int op_dev_id = GetOpDeviceID(node, sharded_var_device, &delayed_op); + if (op_dev_id > -1) { + // This op only runs on one specific device. + sorted_ops.emplace_back(node); + for (ir::Node *n : node->outputs) { + insert_delayed_op(n->Name(), op_dev_id); + } + } else if (op_dev_id == -1) { + // This op runs on all devices, and its output may have parameter's + // gradients. + sorted_ops.emplace_back(node); + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. + std::vector backward_vars; + try { + backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + } catch (boost::bad_get e) { + } + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &g_name = backward_vars[i + 1]; + size_t cur_device_id = GetAppropriateDeviceID({g_name}); + insert_delayed_op(g_name, static_cast(cur_device_id)); + } + } else if (op_dev_id == -2) { + // The Op on which the Op depends has not yet been generated. + } } - return false; + + PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size()); + return sorted_ops; +} + +void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, + ir::Node *node, + size_t place_id) const { + auto p = places_[place_id]; + auto *op_handle = result->Get(kGraphOps).back(); + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); + + for (ir::Node *input : node->inputs) { + VarHandle *var = CreateOrGetLatestVarHandle(result, input, p, place_id); + op_handle->AddInput(var); + } + + for (ir::Node *output : node->outputs) { + ir::Node *new_node = nullptr; + if (output->Var()) { + new_node = result->CreateVarNode(output->Var()); + } else { + new_node = + result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); + } + CreateOpOutput(result, op_handle, new_node, p, place_id); + } +} + +size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( + const std::vector &var_names) const { + int64_t numel_sum = 0; + for (auto var_name : var_names) { + if (all_vars_.find(var_name) == all_vars_.end()) continue; + auto var_desc = all_vars_.at(var_name); + PADDLE_ENFORCE_NOT_NULL(var_desc); + auto dim = framework::make_ddim(var_desc->GetShape()); + int64_t numel = framework::product(dim); + PADDLE_ENFORCE_GT(numel, 0); + numel_sum += numel; + } + + auto smallest = + std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); + size_t dev_id = + static_cast(std::distance(std::begin(balance_vars_), smallest)); + balance_vars_[dev_id] += numel_sum; + return dev_id; } void MultiDevSSAGraphBuilder::SetCommunicationContext( @@ -623,28 +589,52 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp( } int MultiDevSSAGraphBuilder::GetOpDeviceID( - const ir::Graph &graph, ir::Node *node, + ir::Node *node, + const std::unordered_map &sharded_var_device, + std::unordered_map> *delay_ops) const { + if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { + return -1; + } + + if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { + return -1; + } + + auto param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(param_grad[1], sharded_var_device); + + if (dev_id == -1) { + (*delay_ops)[param_grad[1]].push_back(node); + return -2; + } + return dev_id; +} + +int MultiDevSSAGraphBuilder::GetOpDeviceID( + ir::Node *node, const std::unordered_map &sharded_var_device) const { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { return -1; } - int op_role = boost::get( - node->Op()->GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName())); - if (op_role != static_cast(framework::OpRole::kOptimize)) { + + if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { return -1; } auto param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); PADDLE_ENFORCE_EQ(param_grad.size(), 2U); - int dev_id = GetVarDeviceID(graph, param_grad[1], sharded_var_device); + int dev_id = GetVarDeviceID(param_grad[1], sharded_var_device); PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", node->Op()->Type(), param_grad[0], param_grad[1]); return dev_id; } int MultiDevSSAGraphBuilder::GetVarDeviceID( - const ir::Graph &graph, const std::string &varname, + const std::string &varname, const std::unordered_map &sharded_var_device) const { auto got = sharded_var_device.find(varname); if (got == sharded_var_device.end()) { @@ -658,13 +648,13 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID( void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( ir::Graph *result, const std::string &loss_grad_name, - ir::Node *out_var_node) const { + ir::Node *out_var_node, proto::VarType::Type dtype) const { for (size_t i = 0; i < places_.size(); ++i) { // Insert ScaleCost OpHandle auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]); auto *op_handle = new ScaleLossGradOpHandle( result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), - local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx); + local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx, dtype); result->Get(kGraphOps).emplace_back(op_handle); // FIXME: Currently ScaleLossGradOp only use device_count as scale @@ -738,8 +728,7 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp( node->Op()->Type() == "split_selected_rows" || node->Op()->Type() == "split_ids") { // TODO(paddle-dev): getting the first var is not safe. - op_dev_id = - GetVarDeviceID(*result, input_var_names[0], *sharded_var_device); + op_dev_id = GetVarDeviceID(input_var_names[0], *sharded_var_device); if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { op_dev_id = GetAppropriateDeviceID(input_var_names); for (auto &varname : input_var_names) { @@ -750,8 +739,7 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp( sharded_var_device->emplace(varname, op_dev_id); } } else if (node->Op()->Type() == "concat") { - op_dev_id = - GetVarDeviceID(*result, input_var_names[0], *sharded_var_device); + op_dev_id = GetVarDeviceID(input_var_names[0], *sharded_var_device); for (auto &varname : output_var_names) { sharded_var_device->emplace(varname, op_dev_id); } @@ -792,8 +780,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. - op_dev_id = - GetVarDeviceID(*result, node->inputs[0]->Name(), *sharded_var_device); + op_dev_id = GetVarDeviceID(node->inputs[0]->Name(), *sharded_var_device); PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), "This hack no longer holds, please fix."); // the variable name which contains .block means it was splited by @@ -823,8 +810,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( auto recv_param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); if (recv_param_grad.size() == 2U) { - op_dev_id = - GetVarDeviceID(*result, recv_param_grad[1], *sharded_var_device); + op_dev_id = GetVarDeviceID(recv_param_grad[1], *sharded_var_device); VLOG(10) << "recv param " << recv_param_grad[0] << " get grad place: " << recv_param_grad[1] << " place: " << op_dev_id; @@ -859,8 +845,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( for (ir::Node *output : node->outputs) { int outvar_dev_id = op_dev_id; if (node->Op()->Type() == "fetch_barrier") { - outvar_dev_id = - GetVarDeviceID(*result, output->Name(), *sharded_var_device); + outvar_dev_id = GetVarDeviceID(output->Name(), *sharded_var_device); PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name()); } p = places_[outvar_dev_id]; @@ -877,6 +862,14 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( return op_dev_id; } +bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { + PADDLE_ENFORCE(all_vars_.count(og) != 0); + if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { + return true; + } + return false; +} + bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { return boost::get( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 0556232aa4..7029e9dc18 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -45,7 +45,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass { #endif int GetVarDeviceID( - const ir::Graph &graph, const std::string &varname, + const std::string &varname, const std::unordered_map &sharded_var_device) const; bool IsScaleLossOp(ir::Node *node) const; @@ -57,18 +57,13 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ir::Graph *result, ir::Node *node, std::unordered_map *sharded_var_device) const; - std::vector FindDistTrainSendVars( - const std::vector &nodes) const; - - std::vector FindDistTrainRecvVars( - const std::vector &nodes) const; - void CreateComputationalOps(ir::Graph *result, ir::Node *node, size_t num_places) const; void CreateScaleLossGradOp(ir::Graph *result, const std::string &loss_grad_name, - ir::Node *out_var_node) const; + ir::Node *out_var_node, + proto::VarType::Type dtype) const; VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const; @@ -76,7 +71,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass { int dev_id) const; int GetOpDeviceID( - const ir::Graph &graph, ir::Node *node, + ir::Node *node, const std::unordered_map &sharded_var_device) const; void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; @@ -99,6 +94,15 @@ class MultiDevSSAGraphBuilder : public ir::Pass { void SetCommunicationContext(OpHandleBase *op_handle, const platform::Place &p) const; + std::vector SortForReduceMode( + const std::vector &) const; + + int GetOpDeviceID( + ir::Node *node, + const std::unordered_map &shared_var_device, + std::unordered_map> *delay_ops) + const; + mutable std::string loss_var_name_; mutable std::vector places_; mutable std::vector local_scopes_; diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index ef16265997..e1b8e8fe05 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -22,39 +22,66 @@ namespace details { ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope, platform::Place place, - platform::DeviceContext *dev_ctx) + platform::DeviceContext *dev_ctx, + proto::VarType::Type dtype) : OpHandleBase(node), coeff_(static_cast(1.0 / num_dev)), scope_(scope), - place_(place) { + place_(place), + out_dtype_(dtype) { this->SetDeviceContext(place_, dev_ctx); } ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {} +struct ScaleLossGradFunctor { + float coeff_; + Tensor *out_; + platform::Place place_; + OpHandleBase *op_handle_; + proto::VarType::Type out_dtype_; + platform::DeviceContext *ctx_; + + ScaleLossGradFunctor(float coeff, Tensor *out, platform::Place place, + OpHandleBase *op_handle, proto::VarType::Type dtype, + platform::DeviceContext *ctx) + : coeff_(coeff), out_(out), place_(place), out_dtype_(dtype), ctx_(ctx) {} + + template + void apply() const { + auto *out_data = out_->mutable_data(place_); + if (platform::is_cpu_place(place_)) { + *out_data = static_cast(coeff_); + } else { +#ifdef PADDLE_WITH_CUDA + OutT cast_coeff = static_cast(coeff_); + auto stream = static_cast(ctx_)->stream(); + memory::Copy(boost::get(place_), out_data, + platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_), + stream); + VLOG(10) << place_ << "RUN Scale loss grad op"; + +#endif + } + } +}; + void ScaleLossGradOpHandle::RunImpl() { // Doesn't wait any event std::string var_name = static_cast(this->outputs_[0])->name_; auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get(); - float *tmp = local_scope.FindVar(var_name) - ->GetMutable() - ->mutable_data(make_ddim({1}), place_); + auto *tensor = local_scope.FindVar(var_name)->GetMutable(); + tensor->Resize(make_ddim({1})); - if (platform::is_cpu_place(place_)) { - *tmp = coeff_; - } else { #ifdef PADDLE_WITH_CUDA - this->RunAndRecordEvent([&] { - auto stream = static_cast( - this->dev_ctxes_.at(place_)) - ->stream(); - memory::Copy(boost::get(place_), tmp, - platform::CPUPlace(), &coeff_, sizeof(float), stream); - VLOG(10) << place_ << "RUN Scale loss grad op"; - }); + ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, + this->dev_ctxes_.at(place_)); + this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); }); +#else + ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, nullptr); + framework::VisitDataType(out_dtype_, func); #endif - } } std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; } diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h index 523b55724c..8bedd1643e 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h @@ -26,8 +26,8 @@ namespace details { struct ScaleLossGradOpHandle : public OpHandleBase { ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope, - platform::Place place, - platform::DeviceContext *context); + platform::Place place, platform::DeviceContext *context, + proto::VarType::Type dtype); ~ScaleLossGradOpHandle() final; @@ -40,6 +40,7 @@ struct ScaleLossGradOpHandle : public OpHandleBase { float coeff_; Scope *scope_; platform::Place place_; + proto::VarType::Type out_dtype_; }; } // namespace details diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc index 3dfd14419d..134f759081 100644 --- a/paddle/fluid/framework/details/variable_visitor.cc +++ b/paddle/fluid/framework/details/variable_visitor.cc @@ -24,7 +24,7 @@ static void VisitVariable(Variable* var, Func* func) { } else if (var->IsType()) { (*func)(var->GetMutable()); } else { - PADDLE_THROW("Not supported type %s", var->Type().name()); + PADDLE_THROW("Not supported type %s", ToTypeName(var->Type())); } } @@ -35,7 +35,7 @@ static void VisitVariable(const Variable& var, Func* func) { } else if (var.IsType()) { (*func)(var.Get()); } else { - PADDLE_THROW("Not supported type %s", var.Type().name()); + PADDLE_THROW("Not supported type %s", ToTypeName(var.Type())); } } diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index da9556c6c1..594fbb48a6 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -119,7 +119,7 @@ static void DeleteUnusedTensors( } } else { PADDLE_THROW("Type %s of %s is not supported eager deletion", - var->Type().name(), name); + framework::ToTypeName(var->Type()), name); } } } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index b7f7e2ee8e..6d795e1e2d 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -45,6 +45,7 @@ pass_library(is_test_pass base) pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add_fuse_pass inference) +pass_library(conv_affine_channel_fuse_pass inference) if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base) pass_library(depthwise_conv_mkldnn_pass base) diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc new file mode 100644 index 0000000000..a7bfb8cf1e --- /dev/null +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc @@ -0,0 +1,222 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_CONV_BN_NODES(pattern_name) \ + /* OPERATORS */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \ + /* CONV inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \ + /* CONV outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \ + /* Affine Channel inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \ + /* Affine channel outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */ + +void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, + const ir::Node& ac_scale, + const LoDTensor& ac_bias_tensor, + LoDTensor* eltwise_y_in_tensor) { + using EigenVectorArrayMap = + Eigen::Map>; + using ConstEigenVectorArrayMap = + Eigen::Map>; + using EigenMatrixArrayMap = Eigen::Map< + Eigen::Array>; + + // Re-compute bias of conv2d from AffineChannel + PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), ac_bias_tensor.dims()); + + auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable(); + + ConstEigenVectorArrayMap scale_array(scale_tensor->data(), + scale_tensor->numel(), 1); + ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data(), + ac_bias_tensor.numel(), 1); + + EigenVectorArrayMap eltwise_y_in_array( + eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 1); + + eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array; + + // Re-compute weight of conv2d from AffineChannel + auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); + auto weights_shape = weights->dims(); + auto weights_shape_2d = flatten_to_2d(weights_shape, 1); + + EigenMatrixArrayMap weights_array_2d( + weights->mutable_data(platform::CPUPlace()), weights_shape_2d[0], + weights_shape_2d[1]); + + weights_array_2d.colwise() *= scale_array; +} + +std::unique_ptr ConvAffineChannelFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init(name_scope_, graph.get()); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), + name_scope_); + conv_ac_pattern(conv_input, false /*with_eltwise_add*/); + + int found_conv_ac_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvAffineChannel fuse"; + + GET_CONV_BN_NODES(conv_ac_pattern); + + // check if fuse can be done and if MKL-DNN should be used + FuseOptions fuse_option = FindFuseOption(*conv, *affine_channel); + if (fuse_option == DO_NOT_FUSE) { + VLOG(3) << "do not perform conv+affinechannel fuse"; + return; + } + + // Create eltwise_y (conv bias) variable + VarDesc eltwise_y_in_desc( + patterns::PDNodeName(name_scope_, "eltwise_y_in")); + eltwise_y_in_desc.SetPersistable(true); + auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc); + auto* eltwise_y_in_tensor = + scope->Var(eltwise_y_in_node->Name())->GetMutable(); + + // Get affine_channel bias + auto* ac_bias_tensor = + scope->FindVar(ac_bias->Name())->GetMutable(); + + // Initialize eltwise_y + eltwise_y_in_tensor->Resize(ac_bias_tensor->dims()); + std::fill_n(eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 0.0f); + + // update weights and biases + recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, + eltwise_y_in_tensor); + + // create an elementwise add node. + OpDesc desc; + desc.SetInput("X", std::vector({conv_out->Name()})); + desc.SetInput("Y", std::vector({eltwise_y_in_node->Name()})); + desc.SetOutput("Out", std::vector({ac_out->Name()})); + desc.SetType("elementwise_add"); + desc.SetAttr("axis", 1); + auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. + + GraphSafeRemoveNodes(graph.get(), {ac_scale, ac_bias, affine_channel}); + + IR_NODE_LINK_TO(conv_out, eltwise_op); + IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); + IR_NODE_LINK_TO(eltwise_op, ac_out); + found_conv_ac_count++; + }; + + gpd(graph.get(), handler); + + AddStatis(found_conv_ac_count); + return graph; +} + +std::unique_ptr ConvEltwiseAddAffineChannelFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init(name_scope_, graph.get()); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), + name_scope_); + conv_ac_pattern(conv_input, true /*with_eltwise_add*/); + + int found_conv_ac_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvBN fuse"; + + GET_CONV_BN_NODES(conv_ac_pattern); + // OPERATORS + GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern); + // BIAS inputs + GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern); + // BIAS outputs + GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern); + + // Get eltwise_y (conv bias) variable + auto* eltwise_y_in_tensor = + scope->FindVar(eltwise_y_in->Name())->GetMutable(); + + // Get batch norm bias + auto* ac_bias_tensor = + scope->FindVar(ac_bias->Name())->GetMutable(); + + recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, + eltwise_y_in_tensor); + + // Update the elementwise_add node + eltwise->Op()->SetAttr("axis", 1); + eltwise->Op()->SetOutput("Out", std::vector({ac_out->Name()})); + + GraphSafeRemoveNodes(graph.get(), + {ac_scale, ac_bias, affine_channel, eltwise_out}); + + IR_NODE_LINK_TO(eltwise, ac_out); + + found_conv_ac_count++; + }; + + gpd(graph.get(), handler); + AddStatis(found_conv_ac_count); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_affine_channel_fuse_pass, + paddle::framework::ir::ConvAffineChannelFusePass); +REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass, + paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass); diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h new file mode 100644 index 0000000000..ad966e11e6 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the Conv and ConvAffineChannel. + */ +class ConvAffineChannelFusePass : public FusePassBase { + public: + virtual ~ConvAffineChannelFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + const std::string name_scope_{"conv_affine_channel_fuse"}; +}; + +class ConvEltwiseAddAffineChannelFusePass : public FusePassBase { + public: + virtual ~ConvEltwiseAddAffineChannelFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index 23f343f631..c6121777e8 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -40,18 +40,20 @@ framework::proto::OpDesc PrepareOpDesc( const std::string& output) { auto proto = base_desc; framework::OpDesc desc(proto, nullptr); + desc.SetType("conv2d_fusion"); desc.SetInput("Bias", {bias}); desc.SetInput("ResidualData", {bias1}); desc.SetAttr("activation", activation); desc.SetOutput("Output", {output}); desc.SetAttr("is_test", true); - + desc.SetAttr("use_cudnn", false); + desc.Flush(); return *desc.Proto(); } std::unique_ptr ConvElementwiseAdd2ActFusePass::ApplyImpl( std::unique_ptr graph) const { - const std::string pattern_name = "conv_elementwise_add_act_fuse"; + const std::string pattern_name = "conv_elementwise_add2_act_fuse"; FusePassBase::Init(pattern_name, graph.get()); GraphPatternDetector gpd; @@ -76,22 +78,23 @@ std::unique_ptr ConvElementwiseAdd2ActFusePass::ApplyImpl( framework::OpDesc new_op_desc(new_op_proto, nullptr); // Create a new node for the fused op. - graph->CreateOpNode(&new_op_desc); + auto* new_conv_op = graph->CreateOpNode(&new_op_desc); // Link inputs and outputs. PADDLE_ENFORCE(subgraph.count(x)); auto* conv_in_node = subgraph.at(x); - IR_NODE_LINK_TO(conv_in_node, conv_op); // Input - IR_NODE_LINK_TO(conv_filter, conv_op); // Filter - IR_NODE_LINK_TO(conv_op, conv_out); // Output - IR_NODE_LINK_TO(elementwise_add_in_y, conv_op); // Bias - IR_NODE_LINK_TO(elementwise_add_in_y_1, conv_op); // Bias + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op); // Bias + IR_NODE_LINK_TO(new_conv_op, act_out); // Output // Delete the unneeded nodes. - GraphSafeRemoveNodes(graph.get(), - {conv_op, elementwise_add_op, elementwise_add_op_1, - elementwise_add_out}); + GraphSafeRemoveNodes( + graph.get(), + {conv_op, conv_out, elementwise_add_op, elementwise_add_op_1, + elementwise_add_out, elementwise_add_out_1, act_op}); }; gpd(graph.get(), handler); return graph; diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 8670dcfed7..3eb5bdba3b 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -23,66 +23,8 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { -namespace { - -void CheckProgram(const ProgramDesc &program) { -#define _INT(role) static_cast(role) - - std::map visit; - for (OpDesc *op : program.Block(0).AllOps()) { - // For backward compatibility, some program doesn't have role added. - if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; - int role_id = - boost::get(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); - visit[role_id] = true; - switch (role_id) { - case _INT(OpRole::kForward): - if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { - LOG(ERROR) << "Cannot add backward operator before forward operator " - << op->Type(); - } - break; - case _INT(OpRole::kBackward): - case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add backward operator %s after optimize operator.", - op->Type()); - break; - case _INT(OpRole::kForward) | _INT(OpRole::kLoss): - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | - _INT(OpRole::kLoss)) == visit.end(), - "Cannot add backward|loss operator before " - "forward|loss operator %s.", - op->Type()); - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add forward|loss operator %s after optimize operator.", - op->Type()); - break; - case _INT(OpRole::kOptimize): - case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), - "Optimize operators %s must follow backward operator.", - op->Type()); - break; - case _INT(OpRole::kLRSched): - case _INT(OpRole::kDist): - case _INT(OpRole::kRPC): - case _INT(OpRole::kNotSpecified): - break; - default: - LOG(FATAL) << "Unknown operator role. Don't add new role because " - "you don't know what you are doing."; - } - } - -#undef _INT -} -} // namespace Graph::Graph(const ProgramDesc &program) : program_(program) { - CheckProgram(program_); auto var_nodes = InitFromProgram(program_); ResolveHazard(var_nodes); } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 13d752e516..c513fe2dd8 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1101,9 +1101,7 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { return out_var; } -std::unordered_set conv_act_set({"identity", "sigmoid", "relu", - "relu6", "relux", "tanh", - "band_pass"}); +std::unordered_set conv_act_set({"identity", "relu"}); PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) { conv_in->AsInput(); @@ -1169,13 +1167,13 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { ->AsInput(); auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) ->assert_is_op_output("elementwise_add") - ->assert_is_op_input("elementwise_add", "X") + ->assert_is_op_input("elementwise_add", "Y") ->AsIntermediate(); auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr()) ->assert_is_op("elementwise_add"); auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr()) - ->assert_is_op_input("elementwise_add", "Y") + ->assert_is_op_input("elementwise_add", "X") ->AsInput(); auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr()) ->assert_is_op_output("elementwise_add") @@ -1203,8 +1201,8 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out}); elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) .LinksTo({elementwise_add_out}); - elementwise_add_op_1->LinksFrom( - {elementwise_add_out, elementwise_add_in_y_1}); + elementwise_add_op_1->LinksFrom({elementwise_add_out, elementwise_add_in_y_1}) + .LinksTo({elementwise_add_out_1}); act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out}); return act_out; } @@ -1236,6 +1234,78 @@ PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) { return elementwise_add_out; } +PDNode *patterns::ConvAffineChannel::operator()( + paddle::framework::ir::PDNode *conv_input, bool with_eltwise_add) { + // Create Operators + conv_input->assert_is_op_input("conv2d", "Input"); + auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + + PDNode *eltwise_op = nullptr; + if (with_eltwise_add) { + eltwise_op = + pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add"); + } + + auto *affine_channel_op = + pattern->NewNode(affine_channel_repr())->assert_is_op("affine_channel"); + // Create variables + // Conv Filter + auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Filter"); + + auto *conv_out_var = pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("conv2d"); + + PDNode *eltwise_y_in_var = nullptr; + PDNode *eltwise_out_var = nullptr; + if (with_eltwise_add) { + // Conv output as Bias input + conv_out_var->assert_is_op_input("elementwise_add", "X"); + // Bias + eltwise_y_in_var = pattern->NewNode(eltwise_y_in_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + eltwise_out_var = pattern->NewNode(eltwise_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("elementwise_add"); + } else { + // Conv output as AffineChannel input + conv_out_var->assert_is_op_input("affine_channel", "X"); + } + + // AC Scale + auto *ac_scale_var = pattern->NewNode(ac_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("affine_channel", "Scale"); + // AC Bias + auto *ac_bias_var = pattern->NewNode(ac_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("affine_channel", "Bias"); + + // AC output + auto *ac_out_var = pattern->NewNode(ac_out_repr()) + ->AsOutput() + ->assert_is_op_output("affine_channel"); + + conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); + + if (with_eltwise_add) { + eltwise_op->LinksFrom({conv_out_var, eltwise_y_in_var}) + .LinksTo({eltwise_out_var}); + affine_channel_op->LinksFrom({eltwise_out_var, ac_scale_var, ac_bias_var}) + .LinksTo({ac_out_var}); + } else { + affine_channel_op->LinksFrom({conv_out_var, ac_scale_var, ac_bias_var}) + .LinksTo({ac_out_var}); + } + return ac_out_var; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index eaedd9d08e..61a5300344 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -734,6 +734,38 @@ struct ConvElementwiseadd : public PatternBase { PATTERN_DECL_NODE(elementwise_add_out); }; +// Conv with affine_channel +// op: conv + (elementwise_add +) affine_channel +// named nodes: +// conv_weight, conv_out, conv, +// ac_x, ac_scale, ac_bias +// affine_channel, ac_out +struct ConvAffineChannel : public PatternBase { + ConvAffineChannel(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_affine_channel") {} + + PDNode* operator()(PDNode* conv_input, bool with_eltwise_add); + + // declare operator node's name + PATTERN_DECL_NODE(conv); + PATTERN_DECL_NODE(affine_channel); + PATTERN_DECL_NODE(eltwise); // ELEMENTWISE_ADD + // CONV inputs + PATTERN_DECL_NODE(conv_weight); // Filter + // CONV outputs + PATTERN_DECL_NODE(conv_out); // tmp + // ELTWISE inputs + PATTERN_DECL_NODE(eltwise_y_in); + // ELTWISE outputs + PATTERN_DECL_NODE(eltwise_out); // tmp + + // AC(Affine_Channel) inputs + PATTERN_DECL_NODE(ac_scale); + PATTERN_DECL_NODE(ac_bias); + // AC outputs + PATTERN_DECL_NODE(ac_out); // Out +}; + } // namespace patterns // Link two ir::Nodes from each other. diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc index bd5b76426e..9e77f98e9e 100644 --- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc @@ -75,6 +75,7 @@ std::unique_ptr BatchMergePass::ApplyImpl( std::vector optimize_ops; std::vector lr_ops; // ops other than forward/backward/optimize std::unordered_set grad_names; + std::unordered_map gradname2paramname; std::vector nodes = TopologySortOperations(*graph); auto origin_nodes = graph->ReleaseNodes(); @@ -99,6 +100,7 @@ std::unique_ptr BatchMergePass::ApplyImpl( auto op_role_vars = boost::get>(op_role_var); for (size_t i = 0; i < op_role_vars.size(); i += 2) { grad_names.insert(op_role_vars[i + 1]); + gradname2paramname[op_role_vars[i + 1]] = op_role_vars[i]; } } else if (op_role & static_cast(framework::OpRole::kLRSched)) { lr_ops.push_back(node); @@ -109,7 +111,7 @@ std::unique_ptr BatchMergePass::ApplyImpl( // 2. copy forward backward ir::Node* prev_repeat_last_op_node = nullptr; - // record origin_grad -> repeated grad list map. + // record origin_grad -> repeated_grad_list map. std::map> grad_repeated_map; std::map> created; std::unordered_set bn_vars_need_rename; @@ -124,10 +126,16 @@ std::unique_ptr BatchMergePass::ApplyImpl( if (grad_names.find(outname) != grad_names.end()) { std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i); repeated_op.RenameOutput(outname, new_gname); + // remove op_role_var for backward ops that outputs grad for a + // parameter. + repeated_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(), + std::vector()); } } // 3.5 let batch_norm ops use independent vars, note batch_norm_grad do - // not need this update + // not need this update, because only moving mean and variance should be + // differ, trainable parameter scale and bias is the same as other + // parameters. if (node->Name() == "batch_norm") { // NOTE: assume bn op created by layers use save var as output mean and // variance @@ -224,16 +232,25 @@ std::unique_ptr BatchMergePass::ApplyImpl( var->inputs.push_back(repeated_node); } } - } + } // end copy forward backward - // 5. create GRAD merge op node + // 5. create GRAD merge op node: sum(repeat.0...repeat.n) -> + // scale(1/num_repeats) for (auto kv : grad_repeated_map) { OpDesc sum_op; sum_op.SetType("sum"); std::vector repeated_grad_names; + std::vector param_grad_op_role_var; for (auto r : kv.second) { repeated_grad_names.push_back(r->Var()->Name()); } + // NOTE: use op_role_var to control allreduce op appending in + // multi_devices_graph_pass, we want to append op_role_var + // only once for the merged gradient, so break after first call. + param_grad_op_role_var.push_back( + gradname2paramname.at(kv.first->Var()->Name())); // param + param_grad_op_role_var.push_back(kv.first->Var()->Name()); // grad + sum_op.SetInput("X", repeated_grad_names); sum_op.SetOutput("Out", {kv.first->Var()->Name()}); sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), @@ -256,6 +273,10 @@ std::unique_ptr BatchMergePass::ApplyImpl( scale_op.SetAttr("scale", static_cast(1.0f / num_repeats)); scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), static_cast(OpRole::kBackward)); + + scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(), + param_grad_op_role_var); + auto scale_op_node = result.CreateOpNode(&scale_op); scale_op_node->inputs.push_back(sum_out_var_node); sum_out_var_node->outputs.push_back(scale_op_node); diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 6940250c3f..c3a044d22c 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -215,8 +215,8 @@ class Vector { auto stream = dev_ctx->stream(); void *src = gpu_->ptr(); void *dst = cpu_.data(); - memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, - gpu_->size(), stream); + paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, + gpu_->size(), stream); dev_ctx->Wait(); } @@ -261,8 +261,8 @@ class Vector { auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto stream = dev_ctx->stream(); - memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, - gpu_->size(), stream); + paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, + gpu_->size(), stream); } void ImmutableCPU() const { @@ -284,7 +284,7 @@ class Vector { bool IsInCPU() const { return flag_ & kDataInCPU; } mutable std::vector cpu_; - mutable memory::AllocationPtr gpu_; + mutable paddle::memory::AllocationPtr gpu_; mutable int flag_; mutable std::mutex mtx_; diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 2fe1c94ec0..0e7b0cbeb9 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -643,7 +643,7 @@ void OpDesc::CheckAttrs() { // not by users. return; } - checker->Check(attrs_); + checker->Check(&attrs_); } void OpDesc::InferShape(const BlockDesc &block) const { diff --git a/paddle/fluid/framework/op_registry.cc b/paddle/fluid/framework/op_registry.cc index bfc411ca2c..346d14d408 100644 --- a/paddle/fluid/framework/op_registry.cc +++ b/paddle/fluid/framework/op_registry.cc @@ -24,7 +24,7 @@ std::unique_ptr OpRegistry::CreateOp( const VariableNameMap& outputs, AttributeMap attrs) { auto& info = OpInfoMap::Instance().Get(type); if (info.Checker() != nullptr) { - info.Checker()->Check(attrs); + info.Checker()->Check(&attrs); } auto op = info.Creator()(type, inputs, outputs, attrs); return std::unique_ptr(op); diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 6d39bb3c52..2c1648c81f 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -23,7 +23,8 @@ limitations under the License. */ #include #include -#include "glog/logging.h" // For VLOG() +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index fec311e3ee..f10da22aec 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include - #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -181,11 +180,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { } bool OperatorBase::HasInputs(const std::string& name) const { - if (inputs_.find(name) != inputs_.end()) { - return true; - } else { - return false; - } + return inputs_.find(name) != inputs_.end(); } std::string OperatorBase::Input(const std::string& name) const { @@ -384,7 +379,7 @@ const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) { return &(var.Get().value()); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var.Type().name()); + ToTypeName(var.Type())); } } @@ -395,7 +390,7 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) { return var->GetMutable()->mutable_value(); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var->Type().name()); + ToTypeName(var->Type())); } } @@ -489,7 +484,7 @@ const std::vector ExecutionContext::MultiInput( PADDLE_ENFORCE( var->IsType(), "should be LoDTensor, but the received type is %s", - var->Type().name()); + ToTypeName(var->Type())); return &(var->Get()); }); return res; @@ -508,7 +503,7 @@ const std::vector ExecutionContext::LegacyMultiInput( PADDLE_ENFORCE( var->IsType(), "%s should be LoDTensor, but the received type is %s", - sub_name, var->Type().name()); + sub_name, ToTypeName(var->Type())); return &(var->Get()); }); return res; @@ -537,7 +532,7 @@ std::vector ExecutionContext::MultiOutput( PADDLE_ENFORCE( var->IsType(), "%s should be LoDTensor, but the received type is %s", - sub_name, var->Type().name()); + sub_name, ToTypeName(var->Type())); return var->GetMutable(); }); return res; @@ -779,7 +774,7 @@ class RuntimeInferShapeContext : public InferShapeContext { PADDLE_THROW( "Only LoDTensor/SelectedRows support 'GetDim', but Variables " "type_id is %s.", - var->Type().name()); + ToTypeName(var->Type())); } } @@ -802,7 +797,7 @@ class RuntimeInferShapeContext : public InferShapeContext { var->GetMutable()->set_height(dim[0]); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var->Type().name()); + ToTypeName(var->Type())); } } @@ -1045,12 +1040,11 @@ Scope* OperatorWithKernel::PrepareData( proto::VarType::Type OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { - auto& scope = ctx.scope(); int data_type = -1; - std::string last_input_name; for (auto& input : this->inputs_) { - for (auto& ipt_name : input.second) { - auto* var = scope.FindVar(ipt_name); + const std::vector vars = ctx.MultiInputVar(input.first); + for (size_t i = 0; i < vars.size(); ++i) { + const Variable* var = vars[i]; if (var != nullptr) { const Tensor* t = nullptr; if (var->IsType()) { @@ -1061,15 +1055,14 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get().value()); } if (t != nullptr) { - PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s", - ipt_name, DebugString()); + PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu)is not initialized", + input.first, i); int tmp = static_cast(t->type()); PADDLE_ENFORCE( tmp == data_type || data_type == -1, - "DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)", - Type(), last_input_name, data_type, ipt_name, tmp); + "DataType of Paddle Op %s must be the same. Get (%d) != (%d)", + Type(), data_type, tmp); data_type = tmp; - last_input_name = ipt_name; } } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 1fe2daacf1..e2bedc60d2 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -49,6 +49,8 @@ constexpr char kTempVarName[] = "@TEMP@"; /// e.g. Variable "x@GRAD" is the gradient of varibale "x". constexpr char kGradVarSuffix[] = "@GRAD"; +constexpr size_t kGradVarSuffixSize = 5U; + /// Variables with this suffix are supposed to be filled up with zeros. constexpr char kZeroVarSuffix[] = "@ZERO"; @@ -60,7 +62,11 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@"; extern std::vector> kKernelPriority; inline std::string GradVarName(const std::string& var_name) { - return var_name + kGradVarSuffix; + std::string result; + result.reserve(var_name.size() + kGradVarSuffixSize); + result += var_name; + result += kGradVarSuffix; + return result; } proto::VarType::Type GetDataTypeOfVar(const Variable* var); @@ -75,6 +81,10 @@ class RuntimeContext { RuntimeContext(const VariableNameMap& innames, const VariableNameMap& outnames, const Scope& scope); + RuntimeContext(const VariableValueMap& invars, + const VariableValueMap& outvars) + : inputs(invars), outputs(outvars) {} + VariableValueMap inputs; VariableValueMap outputs; }; @@ -110,8 +120,8 @@ class OperatorBase { bool HasAttr(const std::string& name) const { return attrs_.count(name); } template inline const T& Attr(const std::string& name) const { - PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", - name); + PADDLE_ENFORCE(attrs_.find(name) != attrs_.end(), + "%s should be in AttributeMap", name); return boost::get(attrs_.at(name)); } const AttributeMap& Attrs() const { return attrs_; } @@ -441,8 +451,9 @@ class OperatorWithKernel : public OperatorBase { void RuntimeInferShape(const Scope& scope, const platform::Place& place, const RuntimeContext& ctx) const override; - protected: virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; + + protected: virtual OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, const OpKernelType& expected_kernel_type) const; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index a921f469f5..e14b74a873 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -320,6 +320,7 @@ void ParallelExecutor::BCastParamsToDevices( if (paddle::platform::is_gpu_place(main_tensor.place())) { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::vector buffers; + buffers.reserve(member_->places_.size()); size_t numel = main_tensor.numel(); ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); for (size_t i = 0; i < member_->places_.size(); ++i) { @@ -353,9 +354,7 @@ void ParallelExecutor::BCastParamsToDevices( #endif } else { platform::CPUPlace cpu; - for (size_t i = 0; i < member_->places_.size(); ++i) { - if (i == 0) continue; - + for (size_t i = 1; i < member_->places_.size(); ++i) { auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 6fa5e99f9f..750b626603 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -165,11 +165,9 @@ std::string Scope::Rename(const std::string& origin_name) const { Variable* Scope::VarInternal(const std::string& name) { auto* v = FindVarLocally(name); if (v != nullptr) return v; - v = new Variable(); - vars_[name].reset(v); + vars_.emplace(name, std::unique_ptr(v)); VLOG(3) << "Create variable " << name; - v->name_ = &(vars_.find(name)->first); return v; } diff --git a/paddle/fluid/framework/scope_pool.cc b/paddle/fluid/framework/scope_pool.cc new file mode 100644 index 0000000000..5cb241a7a3 --- /dev/null +++ b/paddle/fluid/framework/scope_pool.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/scope_pool.h" +#include "paddle/fluid/framework/threadpool.h" + +namespace paddle { +namespace framework { + +ScopePool &ScopePool::Instance() { // NOLINT + static ScopePool pool; + return pool; +} + +void ScopePool::DeleteScope(Scope *scope) { delete scope; } + +void ScopePool::Insert(std::unique_ptr &&s) { + std::lock_guard guard(mtx_); + scopes_.insert(s.release()); +} + +void ScopePool::Remove(Scope *s) { + size_t has_scope; + { + std::lock_guard guard(mtx_); + has_scope = scopes_.erase(s); + } + PADDLE_ENFORCE(has_scope > 0, "Delete non-existing global scope"); + DeleteScope(s); +} + +ScopePool::~ScopePool() { Clear(); } + +void ScopePool::Clear() { + std::lock_guard guard(mtx_); + for (auto *s : scopes_) { + DeleteScope(s); + } + scopes_.clear(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/scope_pool.h b/paddle/fluid/framework/scope_pool.h new file mode 100644 index 0000000000..a8b468699a --- /dev/null +++ b/paddle/fluid/framework/scope_pool.h @@ -0,0 +1,46 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include // NOLINT +#include +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { + +class ScopePool { + public: + static ScopePool &Instance(); // NOLINT + + void Insert(std::unique_ptr &&s); + + void Remove(Scope *s); + + void Clear(); + + ~ScopePool(); + + private: + ScopePool() = default; + + static void DeleteScope(Scope *scope); + + std::unordered_set scopes_; + std::mutex mtx_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 5b09cad06c..ef096c2b81 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -28,8 +28,7 @@ void Tensor::check_memory_size() const { "or maybe the required data-type mismatches the data already stored."); } -Tensor::Tensor(std::type_index type) - : type_(framework::ToDataType(type)), offset_(0) {} +Tensor::Tensor(const proto::VarType::Type& dtype) : type_(dtype), offset_(0) {} size_t Tensor::memory_size() const { return holder_ == nullptr ? 0UL : holder_->size() - offset_; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 2e110133a3..40606d9b06 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -69,7 +69,7 @@ class Tensor { public: Tensor() : type_(proto::VarType::FP32), offset_(0) {} - explicit Tensor(std::type_index type); + explicit Tensor(const proto::VarType::Type&); /*! Return a pointer to mutable memory block. */ template diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index cab6d9b67e..871c7bd2a7 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/temporary_allocator.h" namespace paddle { namespace framework { @@ -151,5 +152,26 @@ void TensorToVector(const Tensor& src, std::vector* dst) { src_ptr, size); } +template +paddle::framework::Tensor GetTensor( + memory::allocation::AllocationPtr temp_allocation_ptr, + const framework::DDim& dim) { + auto& deleter = temp_allocation_ptr.get_deleter(); + auto* allocation_ptr = temp_allocation_ptr.release(); + auto shared_allocation = + std::shared_ptr(allocation_ptr, deleter); + + PADDLE_ENFORCE( + dynamic_cast(allocation_ptr) != nullptr, + "The AllocationPtr must be TemporaryAllocation."); + PADDLE_ENFORCE_EQ(allocation_ptr->size(), + framework::product(dim) * sizeof(T)); + + paddle::framework::Tensor temp_tensor( + framework::ToDataType(std::type_index(typeid(T)))); + temp_tensor.Resize(dim); + temp_tensor.ResetHolder(std::move(shared_allocation)); + return temp_tensor; +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 3b6f1cdb8f..73be446f71 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -19,52 +19,50 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/framework/variable.h" namespace paddle { namespace framework { template -inline bool IsType(const std::type_index& type_index) { - return type_index == std::type_index(typeid(T)); +inline bool IsType(const std::type_index& type) { + return type == typeid(T); } -inline proto::VarType::Type ToVarType(std::type_index type) { - if (IsType(type)) { - return proto::VarType_Type_LOD_TENSOR; - } else if (IsType(type)) { - return proto::VarType_Type_LOD_RANK_TABLE; - } else if (IsType(type)) { - return proto::VarType_Type_LOD_TENSOR_ARRAY; - } else if (IsType(type)) { - return proto::VarType_Type_SELECTED_ROWS; - } else if (IsType(type)) { - return proto::VarType_Type_READER; - } else { - PADDLE_THROW("ToVarType:Unsupported type %s", type.name()); +inline proto::VarType::Type ToVarType(int type) { + switch (type) { + case proto::VarType::LOD_TENSOR: + case proto::VarType::SELECTED_ROWS: + case proto::VarType::LOD_RANK_TABLE: + case proto::VarType::LOD_TENSOR_ARRAY: + case proto::VarType::READER: + return static_cast(type); + default: + PADDLE_THROW("ToVarType:Unsupported type %d", type); } } template inline void VisitVarType(const framework::Variable& var, Visitor visitor) { - switch (ToVarType(var.Type())) { - case proto::VarType_Type_LOD_TENSOR: + switch (var.Type()) { + case proto::VarType::LOD_TENSOR: visitor(var.Get()); return; - case proto::VarType_Type_LOD_RANK_TABLE: + case proto::VarType::LOD_RANK_TABLE: visitor(var.Get()); return; - case proto::VarType_Type_LOD_TENSOR_ARRAY: + case proto::VarType::LOD_TENSOR_ARRAY: visitor(var.Get()); return; - case proto::VarType_Type_SELECTED_ROWS: + case proto::VarType::SELECTED_ROWS: visitor(var.Get()); return; - case proto::VarType_Type_READER: + case proto::VarType::READER: visitor(var.Get()); return; default: - PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type())); + PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type())); } } diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc index 7842168f60..2a75394fca 100644 --- a/paddle/fluid/framework/var_type_inference_test.cc +++ b/paddle/fluid/framework/var_type_inference_test.cc @@ -108,7 +108,7 @@ TEST(InferVarType, sum_op_without_infer_var_type) { op->InferVarType(prog.MutableBlock(0)); - ASSERT_EQ(proto::VarType_Type_LOD_TENSOR, + ASSERT_EQ(proto::VarType::LOD_TENSOR, prog.MutableBlock(0)->Var("test2_out")->GetType()); } diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc new file mode 100644 index 0000000000..c3c5bab23b --- /dev/null +++ b/paddle/fluid/framework/var_type_traits.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/platform/macros.h" +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#include +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#endif + +namespace paddle { +namespace framework { + +// Besides registering variable type id, it is helpful to register a +// var_id -> std::type_index map (for example, get type names according to id) +namespace detail { + +template +struct VarIdToTypeIndexMapInitializerImpl { + template + static void Init(MapType1 *id_to_type, MapType2 *type_to_id) { + using Type = + typename std::tuple_element::type; + static_assert(!std::is_same::value, "Type cannot be void"); + constexpr int kId = VarTypeTrait::kId; + auto type = std::type_index(typeid(Type)); + PADDLE_ENFORCE(id_to_type->count(kId) == 0, + "Registered duplicate type id %d for type %s", kId, + type.name()); + PADDLE_ENFORCE(type_to_id->count(type) == 0, + "Registered duplicate type_index %s for id %d", type.name(), + kId); + id_to_type->emplace(kId, type); + type_to_id->emplace(type, kId); + VarIdToTypeIndexMapInitializerImpl::Init(id_to_type, + type_to_id); + } +}; + +template +struct VarIdToTypeIndexMapInitializerImpl { + template + static void Init(MapType1 *, MapType2 *) {} +}; + +// VarIdToTypeIndexMapInitializer is designed to initialize var_id -> +// std::type_index map and std::type_index -> var_id map +using VarIdToTypeIndexMapInitializer = + VarIdToTypeIndexMapInitializerImpl<0, VarTypeRegistry::kRegisteredTypeNum, + VarTypeRegistry::kRegisteredTypeNum == + 0>; + +struct VarIdToTypeIndexMapHolder { + DISABLE_COPY_AND_ASSIGN(VarIdToTypeIndexMapHolder); + + public: + static const std::type_index &ToTypeIndex(int var_id) { + auto it = Instance().id_to_type_map_.find(var_id); + PADDLE_ENFORCE(it != Instance().id_to_type_map_.end(), + "VarId %d is not registered.", var_id); + return it->second; + } + + static int ToTypeId(const std::type_index &type) { + auto it = Instance().type_to_id_map_.find(type); + PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(), + "VarType %s is not registered.", type.name()); + return it->second; + } + + private: + VarIdToTypeIndexMapHolder() { + VarIdToTypeIndexMapInitializer::Init(&id_to_type_map_, &type_to_id_map_); + } + + static const VarIdToTypeIndexMapHolder &Instance() { + static const VarIdToTypeIndexMapHolder instance; + return instance; + } + + std::unordered_map id_to_type_map_; + std::unordered_map type_to_id_map_; +}; + +} // namespace detail + +const std::type_index &ToTypeIndex(int var_id) { + return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id); +} + +const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); } + +int ToTypeId(const std::type_index &type) { + return detail::VarIdToTypeIndexMapHolder::ToTypeId(type); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h new file mode 100644 index 0000000000..cc68cf2ab8 --- /dev/null +++ b/paddle/fluid/framework/var_type_traits.h @@ -0,0 +1,195 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_CUDA +#include +#ifndef _WIN32 +#include +#endif +#endif + +// Users should add forward declarations here +namespace paddle { + +namespace platform { +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +class Communicator; +#endif +#endif +} // namespace platform + +namespace framework { +class Tensor; +class LoDTensor; +class SelectedRows; +class LoDRankTable; +class ReaderHolder; +class Scope; +} // namespace framework + +namespace operators { +template +class AlgorithmsCache; + +class CudnnRNNCache; + +namespace reader { +class LoDTensorBlockingQueueHolder; +} // namespace reader +} // namespace operators + +} // namespace paddle + +namespace paddle { +namespace framework { + +const char *ToTypeName(int var_id); +const std::type_index &ToTypeIndex(int var_id); +int ToTypeId(const std::type_index &type); + +namespace detail { + +template +struct TypePosFinderImpl { + static constexpr int kPos = + std::is_same::value + ? kStart + : TypePosFinderImpl::kPos; +}; + +template +struct TypePosFinderImpl { + static constexpr int kPos = std::is_same::value ? kStart : -1; +}; + +// TypePosFinder helps to find the position in which T is inside Args... +// If T is not inside Args..., kPos would be -1 +template +struct TypePosFinder { + static constexpr int kPos = + TypePosFinderImpl::kPos; +}; + +template +struct VarTypeRegistryImpl { + static constexpr size_t kRegisteredTypeNum = sizeof...(Args); + using ArgTuple = std::tuple; + + // TypePos() returns the position in which T is inside Args... + // If T is not inside Args..., return -1 + template + static constexpr int TypePos() { + return TypePosFinder::kPos; + } + + // IsRegistered() returns whether T is registered inside RegistryImpl + template + static constexpr bool IsRegistered() { + return TypePos() >= 0; + } +}; + +} // namespace detail + +#define REG_PROTO_VAR_TYPE_TRAIT(type, proto_id) \ + template <> \ + struct VarTypeTrait { \ + static_assert(VarTypeRegistry::IsRegistered(), \ + "Must be registered type"); \ + using Type = type; \ + static constexpr int kId = static_cast(proto_id); \ + } + +/** + * The following codes are designed to register variable types. + * Only registered types can be stored in Variable. + * This registry mechanism is designed to speed up Variable. + * + * Caution: If you want to add more var types, please consider carefully + * whether you really need to add it. + */ + +// Users should add other variable types below. +// Paddle would generate unique Ids for each registered variable types. +using VarTypeRegistry = detail::VarTypeRegistryImpl< + Tensor, LoDTensor, SelectedRows, std::vector, LoDRankTable, + LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *, + std::map, operators::reader::LoDTensorBlockingQueueHolder, +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 + ncclUniqueId, platform::Communicator, +#endif + operators::AlgorithmsCache, + operators::AlgorithmsCache, + operators::AlgorithmsCache, + operators::CudnnRNNCache, +#endif + int, float>; + +template +struct VarTypeTrait { + static_assert(VarTypeRegistry::IsRegistered(), "Must be registered type"); + using Type = T; + /** + * Unique VarType Id generation. + * + * The auto-generated id should not be the same as any protobuf id defined in + * framework.proto. Therefore, we generate id by adding the type pos and + * maximum protobuf id (i.e., proto::VarType::TUPLE). + * + * However, we may need more protobuf id in the future. + * To avoid changing this auto id generation algorithm frequently, we + * generate id by adding the type pos and twice of maximum protobuf id (i.e., + * proto::VarType::TUPLE). + */ + static constexpr int kId = VarTypeRegistry::TypePos() + + static_cast(proto::VarType::TUPLE) * 2; +}; + +// Users should set some of variable type ids to be what is defined in +// framework.proto below +REG_PROTO_VAR_TYPE_TRAIT(LoDTensor, proto::VarType::LOD_TENSOR); +REG_PROTO_VAR_TYPE_TRAIT(SelectedRows, proto::VarType::SELECTED_ROWS); +REG_PROTO_VAR_TYPE_TRAIT(std::vector, proto::VarType::STEP_SCOPES); +REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE); +REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY); +REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST); +REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER); +REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32); +REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32); + +/** End of variable type registration */ + +template +inline constexpr bool IsRegisteredVarType() { + return VarTypeRegistry::IsRegistered(); +} + +#undef REG_PROTO_VAR_TYPE_TRAIT +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc new file mode 100644 index 0000000000..00840d634d --- /dev/null +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#endif + +namespace paddle { +namespace framework { + +template +struct TypeIndexChecker { + template + static void Check(SetType1 *var_id_set, SetType2 *type_index_set) { + using Type = + typename std::tuple_element::type; + static_assert(std::is_same::Type, Type>::value, + "Type must be the same"); + constexpr auto kId = VarTypeTrait::kId; + std::type_index actual_type(typeid(Type)); + EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name())); + EXPECT_EQ(ToTypeIndex(kId), actual_type); + EXPECT_EQ(ToTypeId(actual_type), kId); + EXPECT_EQ(ToTypeIndex(ToTypeId(actual_type)), actual_type); + EXPECT_EQ(ToTypeId(ToTypeIndex(kId)), kId); + + EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT + EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT + var_id_set->insert(kId); + type_index_set->insert(std::type_index(typeid(Type))); + TypeIndexChecker::Check(var_id_set, + type_index_set); + } +}; + +template +struct TypeIndexChecker { + template + static void Check(SetType1 *, SetType2 *) {} +}; + +TEST(var_type_traits, check_no_duplicate_registry) { + constexpr size_t kRegisteredNum = VarTypeRegistry::kRegisteredTypeNum; + std::unordered_set var_id_set; + std::unordered_set type_index_set; + TypeIndexChecker<0, kRegisteredNum, kRegisteredNum == 0>::Check( + &var_id_set, &type_index_set); +} + +template +bool CheckVarId(int proto_id) { + static_assert(std::is_same::Type, T>::value, + "Type must be the same"); + return VarTypeTrait::kId == proto_id; +} + +TEST(var_type_traits, check_proto_type_id) { + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_TENSOR)); + ASSERT_TRUE(CheckVarId(proto::VarType::SELECTED_ROWS)); + ASSERT_TRUE(CheckVarId>(proto::VarType::STEP_SCOPES)); + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_RANK_TABLE)); + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_TENSOR_ARRAY)); + ASSERT_TRUE(CheckVarId(proto::VarType::PLACE_LIST)); + ASSERT_TRUE(CheckVarId(proto::VarType::READER)); + ASSERT_TRUE(CheckVarId(proto::VarType::INT32)); + ASSERT_TRUE(CheckVarId(proto::VarType::FP32)); + + ASSERT_EQ(proto::VarType_Type_LOD_TENSOR, proto::VarType::LOD_TENSOR); + ASSERT_EQ(proto::VarType_Type_SELECTED_ROWS, proto::VarType::SELECTED_ROWS); + ASSERT_EQ(proto::VarType_Type_STEP_SCOPES, proto::VarType::STEP_SCOPES); + ASSERT_EQ(proto::VarType_Type_LOD_RANK_TABLE, proto::VarType::LOD_RANK_TABLE); + ASSERT_EQ(proto::VarType_Type_LOD_TENSOR_ARRAY, + proto::VarType::LOD_TENSOR_ARRAY); + ASSERT_EQ(proto::VarType_Type_PLACE_LIST, proto::VarType::PLACE_LIST); + ASSERT_EQ(proto::VarType_Type_READER, proto::VarType::READER); + ASSERT_EQ(proto::VarType_Type_FEED_MINIBATCH, proto::VarType::FEED_MINIBATCH); + ASSERT_EQ(proto::VarType_Type_FETCH_LIST, proto::VarType::FETCH_LIST); + ASSERT_EQ(proto::VarType_Type_RAW, proto::VarType::RAW); + ASSERT_EQ(proto::VarType_Type_TUPLE, proto::VarType::TUPLE); + ASSERT_EQ(proto::VarType_Type_INT32, proto::VarType::INT32); + ASSERT_EQ(proto::VarType_Type_FP32, proto::VarType::FP32); +} + +TEST(var_type_traits, test_registry) { + using Registry = detail::VarTypeRegistryImpl; + ASSERT_TRUE(Registry::TypePos() == 0); + ASSERT_TRUE(Registry::TypePos() == 1); + ASSERT_TRUE(Registry::TypePos() == 2); + ASSERT_TRUE(Registry::TypePos() == 3); + ASSERT_TRUE(Registry::TypePos() == -1); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 873e1b20a5..b9d07da822 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -18,7 +18,7 @@ #include #include -#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/framework/var_type_traits.h" namespace paddle { namespace framework { @@ -27,10 +27,14 @@ class Variable { public: template const T& Get() const { + static_assert( + IsRegisteredVarType(), + "Not registered type. Please register T inside var_type_traits.h"); PADDLE_ENFORCE(holder_ != nullptr, "Variable must hold some thing"); - PADDLE_ENFORCE(IsType(), + PADDLE_ENFORCE(holder_->Type() == VarTypeTrait::kId, "Variable must be type %s, the holding type is %s", - typeid(T).name(), holder_->Type().name()); + ToTypeName(VarTypeTrait::kId), + ToTypeName(holder_->Type())); return *static_cast(holder_->Ptr()); } @@ -39,61 +43,61 @@ class Variable { template T* GetMutable() { if (!holder_) { - holder_.reset(new PlaceholderImpl(new T())); + holder_.reset(new PlaceholderImpl()); } else { - PADDLE_ENFORCE(IsType(), + PADDLE_ENFORCE(holder_->Type() == VarTypeTrait::kId, "Variable must be type %s, the holding type is %s", - typeid(T).name(), holder_->Type().name()); + ToTypeName(VarTypeTrait::kId), + ToTypeName(holder_->Type())); } return static_cast(holder_->Ptr()); } template bool IsType() const { - return holder_ != nullptr && - std::type_index(typeid(T)) == std::type_index(holder_->Type()); + return holder_ && holder_->Type() == VarTypeTrait::kId; } void Clear() { holder_.reset(); } - std::type_index Type() const { + int Type() const { PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory"); return holder_->Type(); } private: struct Placeholder { - virtual ~Placeholder() {} - virtual const std::type_info& Type() const = 0; - virtual void* Ptr() const = 0; + virtual ~Placeholder() = default; + + inline int Type() const { return type_; } + inline const void* Ptr() const { return ptr_; } + inline void* Ptr() { return ptr_; } + + protected: + inline void Init(void* p, int type) { + ptr_ = p; + type_ = type; + } + + void* ptr_; + int type_; }; // Placeholder hides type T, so it doesn't appear as a template // parameter of Variable. template struct PlaceholderImpl : public Placeholder { - explicit PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {} - - virtual const std::type_info& Type() const { return type_; } - virtual void* Ptr() const { return static_cast(ptr_.get()); } + static_assert( + IsRegisteredVarType(), + "Not registered type. Please register T inside var_type_traits.h"); + PlaceholderImpl() { this->Init(&obj_, VarTypeTrait::kId); } - std::unique_ptr ptr_; - const std::type_info& type_; + private: + T obj_; }; - std::unique_ptr - holder_; // pointers to a PlaceholderImpl object indeed. - - // name_ is only meaningful with a Scope and accessible by it. - // - // NOTE: Please don't expose name_ by adding methods like - // Variable::Name or Scope::VarName! A variable could have a human - // readable name or an auto-generated scope-unique name. In the - // former case, the caller knows the name and doesn't need to access - // the name; in the latter case, the variable should be identified - // by its address but not the unreadable name. - friend class Scope; - const std::string* name_; + // pointers to a PlaceholderImpl object indeed. + std::unique_ptr holder_; }; } // namespace framework diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index 003dcfd3df..511c9c5214 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -16,27 +16,28 @@ #include #include "gtest/gtest.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" -TEST(Variable, GetMutable) { - using paddle::framework::Variable; - - struct Tensor { - int content_; - }; +namespace paddle { +namespace framework { +TEST(Variable, GetMutable) { std::unique_ptr v(new Variable()); - Tensor* t = v->GetMutable(); - t->content_ = 1234; + auto* t = v->GetMutable(); + *t = "1234"; - const Tensor& tt = v->Get(); - EXPECT_EQ(1234, tt.content_); + const auto& tt = v->Get(); + EXPECT_EQ("1234", tt); try { - v->GetMutable(); + v->GetMutable(); } catch (std::exception& e) { return; } EXPECT_TRUE(false); } + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 342cb68ab2..26e7830265 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -42,13 +42,9 @@ void AddTo(Variable* src, Variable* dst) { class Autograd { public: - explicit Autograd(framework::Scope* scope) : scope_(scope) {} + Autograd() {} void RunBackward(VarBase* var) { - PADDLE_ENFORCE(var->pre_op_->op_desc_); - // TODO(panyx0718): Only create for vars that "require_grad" - (*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_; - std::deque ready; ready.push_back(var->pre_op_); @@ -57,18 +53,22 @@ class Autograd { while (!ready.empty()) { OpBase* ready_op = ready.front(); ready.pop_front(); - std::vector input_grads = ready_op->ApplyGrad(scope_); - - for (size_t i = 0; i < input_grads.size(); ++i) { - if (!input_grads[i]) continue; - OpBase* pre_op = ready_op->pre_ops_->at(i); - if (!pre_op) continue; - - dep_counts[pre_op] -= 1; - PADDLE_ENFORCE(dep_counts[pre_op] >= 0); - bool pre_op_ready = dep_counts[pre_op] == 0; - if (pre_op_ready) { - ready.push_back(pre_op); + std::map> input_grads = + ready_op->ApplyGrad(); + + for (auto it : input_grads) { + const std::vector& ingrads = it.second; + for (size_t i = 0; i < ingrads.size(); ++i) { + if (!ingrads[i]) continue; + OpBase* pre_op = ready_op->pre_ops_[it.first][i]; + if (!pre_op) continue; + + dep_counts[pre_op] -= 1; + PADDLE_ENFORCE(dep_counts[pre_op] >= 0); + bool pre_op_ready = dep_counts[pre_op] == 0; + if (pre_op_ready) { + ready.push_back(pre_op); + } } } } @@ -85,138 +85,84 @@ class Autograd { while (!queue.empty()) { OpBase* candidate = queue.front(); queue.pop_front(); - for (OpBase* pre_op : *(candidate->pre_ops_)) { - if (!pre_op) continue; - if (visited.find(pre_op) == visited.end()) { - visited.insert(pre_op); - queue.push_back(pre_op); + for (auto it : candidate->pre_ops_) { + for (OpBase* pre_op : it.second) { + if (!pre_op) continue; + if (visited.find(pre_op) == visited.end()) { + visited.insert(pre_op); + queue.push_back(pre_op); + } + ret[pre_op] += 1; } - ret[pre_op] += 1; } } - return ret; } - - framework::Scope* scope_; }; -framework::Variable* CreateVariable(const std::string& name, - const framework::DDim& dim, float val, - framework::Scope* scope, - bool random_name = true) { - std::string varname = name; - if (random_name) { - std::mt19937 rng; - rng.seed(std::random_device()()); - std::uniform_int_distribution dist6( - 1, std::numeric_limits::max()); - int id = dist6(rng); - varname = string::Sprintf("%s@%d", varname, id); - } - - VLOG(3) << "creating var " << varname; - framework::Variable* var = scope->Var(varname); - framework::LoDTensor* tensor = var->GetMutable(); - - float* data = tensor->mutable_data(dim, platform::CPUPlace()); - std::fill(data, data + tensor->numel(), val); - return var; -} - framework::LoDTensor& VarBase::Grad() { VLOG(3) << "get var grad " << var_desc_->Name(); return *grads_->GetMutable(); } -void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) { - VLOG(3) << "apply var grad " << var_desc_->Name() << " " - << grad->Get().data()[0]; - if (!grads_) { - grads_ = - CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()), - var_->Get().dims(), 0.0, scope); +std::map> OpBase::ApplyGrad() { + if (!grad_op_desc_) { + VLOG(3) << "op with no grad: " << op_desc_->Type(); + return {}; } - AddTo(grad, grads_); - VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " " - << grads_->Get().data()[0]; -} - -std::vector OpBase::ApplyGrad(framework::Scope* scope) { VLOG(3) << "op grad " << grad_op_desc_->Type(); - for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) { - if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) { - // grad op inputs can be forward inputs, so not in grad_to_var. - continue; - } - VLOG(3) << "op grad in var " << grad_invar; - block_->FindRecursiveOrCreateVar(grad_invar); - framework::Variable* var = scope->Var(grad_invar); - const std::string& invar = grad_to_var_->at(grad_invar); - for (VarBase* varbase : *output_vars_) { - // Use the accumulated grads_ by sharing the input with grads_. - if (varbase->var_desc_->Name() == invar) { - var->GetMutable()->ShareDataWith( - varbase->grads_->Get()); - break; - } + std::vector> tmp_vars; + std::map> grad_outputs; + for (auto it : grad_output_vars_) { + auto& outputs = grad_outputs[it.first]; + for (size_t i = 0; i < it.second.size(); ++i) { + tmp_vars.emplace_back(new framework::Variable()); + outputs.push_back(tmp_vars.back().get()); + outputs.back()->GetMutable(); } } - for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { - VLOG(3) << "grad outvar " << outvar; - block_->FindRecursiveOrCreateVar(outvar); - framework::Variable* var = scope->Var(outvar); - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block_->FindVar(outvar); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); - } else { - LOG(ERROR) << "tracer doesn't support yet"; - } - } - } - grad_op_desc_->InferShape(*block_); + framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); + + // No need to do static infer shape here. + // grad_op_desc_->InferShape(*block_); grad_op_desc_->InferVarType(block_); + std::unique_ptr opbase = framework::OpRegistry::CreateOp(*grad_op_desc_); - - opbase->Run(*scope, platform::CPUPlace()); - - // `ret` matches exactly with `input_vars_` of forward op. - std::vector ret; - for (size_t i = 0; i < input_vars_->size(); ++i) { - bool found = false; - VarBase* origin_var = (*input_vars_)[i]; - for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { - Variable* var = scope->FindVar(outvar); - std::string orig_var = grad_to_var_->at(outvar); - if (origin_var->var_desc_->Name() != orig_var) { - continue; - } - VLOG(3) << "apply grad " << outvar << " with origin " << orig_var; - origin_var->ApplyGrad(scope, var); - found = true; - ret.push_back(var); - // TODO(panyx0718): There might be another outvar with the same name. - // In that case, it doesn't matter the first one or the second one is - // used. - break; - } - if (!found) { - ret.push_back(nullptr); + framework::OperatorWithKernel* op_kernel = + dynamic_cast(opbase.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + + framework::Scope scope; + platform::CPUPlace place; + PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); + p.op.RuntimeInferShape(scope, place, ctx); + p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + + for (auto it : grad_output_vars_) { + auto& outputs = grad_outputs[it.first]; + auto& origin_outputs = it.second; + for (size_t i = 0; i < outputs.size(); ++i) { + framework::Variable* orig_grad = origin_outputs[i]; + AddTo(outputs[i], orig_grad); } } - return ret; + return input_vars_; } -void VarBase::RunBackward(framework::Scope* scope) { - grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()), - var_->Get().dims(), 1.0, scope, - false); +void VarBase::RunBackward() { if (!pre_op_) return; - Autograd(scope).RunBackward(this); + + auto grads_t = grads_->GetMutable(); + float* data = grads_t->mutable_data(platform::CPUPlace()); + std::fill(data, data + grads_t->numel(), 1.0); + + PADDLE_ENFORCE( + grads_ == + pre_op_->output_vars_[pre_op_out_name_][pre_op_out_idx_]->grads_); + Autograd().RunBackward(this); } } // namespace imperative diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 85a71ca83d..ae4e8e0f8a 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -14,17 +14,69 @@ #pragma once +#include #include #include #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { namespace imperative { +class PreparedOp { + public: + PreparedOp(const framework::OperatorBase& op, + const framework::RuntimeContext& ctx, + framework::OperatorWithKernel::OpKernelFunc func, + platform::DeviceContext* dev_ctx) + : op(op), ctx(ctx), func(func), dev_ctx(dev_ctx) {} + + static PreparedOp Prepare(const framework::RuntimeContext& ctx, + const framework::OperatorWithKernel& op, + const platform::Place& place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + + // check if op[type] has kernel registered. + auto& all_op_kernels = op.AllOpKernels(); + auto kernels_iter = all_op_kernels.find(op.Type()); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", + op.Type()); + } + + framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second; + + auto expected_kernel_key = op.GetExpectedKernelType( + framework::ExecutionContext(op, framework::Scope(), *dev_ctx, ctx)); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + + auto kernel_iter = kernels.find(expected_kernel_key); +#ifdef PADDLE_WITH_MKLDNN + // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set + if (kernel_iter == kernels.end() && + expected_kernel_key.library_type_ == framework::LibraryType::kMKLDNN) { + VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; + expected_kernel_key.library_type_ = framework::LibraryType::kPlain; + expected_kernel_key.data_layout_ = framework::DataLayout::kAnyLayout; + kernel_iter = kernels.find(expected_kernel_key); + } +#endif + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op %s does not have kernel for %s", op.Type(), + KernelTypeToString(expected_kernel_key)); + } + return PreparedOp(op, ctx, kernel_iter->second, dev_ctx); + } + + const framework::OperatorBase& op; + const framework::RuntimeContext& ctx; + framework::OperatorWithKernel::OpKernelFunc func; + platform::DeviceContext* dev_ctx; +}; class OpBase; class VarBase { @@ -33,18 +85,26 @@ class VarBase { : pre_op_(nullptr), pre_op_out_idx_(-1), var_desc_(nullptr), - var_(nullptr), - grads_(nullptr) {} - - virtual ~VarBase() {} - - void ApplyGrad(framework::Scope* scope, framework::Variable* grad); + var_(new framework::Variable()), + grads_(new framework::Variable()) {} + + virtual ~VarBase() { + if (var_) { + delete var_; + var_ = nullptr; + } + if (grads_) { + delete grads_; + grads_ = nullptr; + } + } - void RunBackward(framework::Scope* scope); + void RunBackward(); framework::LoDTensor& Grad(); OpBase* pre_op_; + std::string pre_op_out_name_; int pre_op_out_idx_; framework::VarDesc* var_desc_; @@ -54,35 +114,24 @@ class VarBase { class OpBase { public: - OpBase() - : input_vars_(new std::vector()), - output_vars_(new std::vector()), - pre_ops_(new std::vector()), - pre_ops_out_idx_(new std::vector()), - op_desc_(nullptr), - grad_op_desc_(nullptr) {} + OpBase() : op_desc_(nullptr), grad_op_desc_(nullptr) {} virtual ~OpBase() { - delete input_vars_; - delete output_vars_; - - delete pre_ops_; - delete pre_ops_out_idx_; - if (grad_op_desc_) delete grad_op_desc_; - if (grad_to_var_) delete grad_to_var_; } - std::vector ApplyGrad(framework::Scope* scope); + std::map> ApplyGrad(); - std::vector* input_vars_; - std::vector* output_vars_; - std::vector* pre_ops_; - std::vector* pre_ops_out_idx_; framework::OpDesc* op_desc_; - framework::OpDesc* grad_op_desc_; - std::unordered_map* grad_to_var_; + + std::map> input_vars_; + std::map> output_vars_; + std::map> pre_ops_; + std::map> pre_ops_out_idx_; + + std::map> grad_input_vars_; + std::map> grad_output_vars_; framework::BlockDesc* block_; }; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 97772dc110..c814da9853 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -20,7 +20,6 @@ #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/imperative/layer.h" @@ -41,22 +40,28 @@ void CreateGradOp(const framework::OpDesc& op_desc, *grad_op_desc = grad_op_descs[0].release(); } +void InitVar(framework::Variable* var, framework::Variable* grad_var) { + auto& var_t = var->Get(); + float* data = + grad_var->GetMutable()->mutable_data( + var_t.dims(), platform::CPUPlace()); + std::fill(data, data + var_t.numel(), 0.0); +} + class Tracer { public: explicit Tracer(framework::BlockDesc* root_block, framework::BlockDesc* startup_block) - : root_block_(root_block), startup_block_(startup_block) { - root_scope_ = new framework::Scope(); - scopes_[root_block_] = root_scope_; - scopes_[startup_block_] = root_scope_; - } + : root_block_(root_block), startup_block_(startup_block) {} - virtual ~Tracer() { delete root_scope_; } + virtual ~Tracer() {} - void Trace(OpBase* op, const std::vector& inputs, - const std::vector& outputs, + void Trace(OpBase* op, + const std::map>& inputs, + const std::map>& outputs, framework::BlockDesc* block) { - framework::Scope* scope = GetScope(block); + std::map vars; + framework::OpDesc* op_desc = op->op_desc_; VLOG(3) << "tracer tracing " << op_desc->Type(); op_desc->InferShape(*block); @@ -64,77 +69,113 @@ class Tracer { std::unique_ptr op_base = framework::OpRegistry::CreateOp(*op_desc); - *op->input_vars_ = inputs; - for (VarBase* input : inputs) { - const std::string vname = input->var_desc_->Name(); - framework::Variable* var = scope->Var(vname); - input->var_ = var; - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block->FindVar(vname); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); + framework::VariableValueMap invars_map; + framework::VariableValueMap outvars_map; + + op->input_vars_ = inputs; + for (auto it : op->input_vars_) { + auto& invars = invars_map[it.first]; + for (VarBase* inp : it.second) { + PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", + op->op_desc_->Type(), inp->var_desc_->Name()); + + invars.push_back(inp->var_); + vars[inp->var_desc_->Name()] = inp; + if (inp->pre_op_) { + op->pre_ops_[it.first].push_back(inp->pre_op_); + op->pre_ops_out_idx_[it.first].push_back(inp->pre_op_out_idx_); } else { - LOG(ERROR) << "tracer doesn't support yet"; + op->pre_ops_[it.first].push_back(nullptr); } + VLOG(3) << "input vname " << inp->var_desc_->Name() << " " + << inp->var_->IsInitialized(); } - if (input->pre_op_) { - op->pre_ops_->push_back(input->pre_op_); - op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_); - } else { - op->pre_ops_->push_back(nullptr); - } - VLOG(3) << "input vname " << vname << " " - << var->Get().dims().size(); } - *op->output_vars_ = outputs; - for (size_t i = 0; i < outputs.size(); ++i) { - const std::string vname = outputs[i]->var_desc_->Name(); - framework::Variable* var = scope->Var(vname); - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block->FindVar(vname); + op->output_vars_ = outputs; + for (auto it : op->output_vars_) { + auto& outvars = outvars_map[it.first]; + const std::vector& outputs = it.second; + for (size_t i = 0; i < outputs.size(); ++i) { + VarBase* out = outputs[i]; + outvars.push_back(out->var_); + vars[out->var_desc_->Name()] = out; + + framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name()); if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); + out->var_->GetMutable(); } else { LOG(ERROR) << "tracer doesn't support yet"; } + out->pre_op_ = op; + out->pre_op_out_name_ = it.first; + out->pre_op_out_idx_ = i; + + VLOG(3) << "output vname " << out->var_desc_->Name() << " " + << out->var_->IsInitialized(); } - outputs[i]->var_ = var; - outputs[i]->pre_op_ = op; - outputs[i]->pre_op_out_idx_ = i; } VLOG(3) << "tracer running " << op_desc->Type(); - op_base->Run(*scope, platform::CPUPlace()); + framework::RuntimeContext ctx(invars_map, outvars_map); + + // TODO(panyx0718): Cache p. + framework::OperatorWithKernel* op_kernel = + dynamic_cast(op_base.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + + framework::Scope scope; + platform::CPUPlace place; + PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); + p.op.RuntimeInferShape(scope, place, ctx); + p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + if (block == startup_block_) { op->grad_op_desc_ = nullptr; - op->grad_to_var_ = nullptr; } else { framework::OpDesc* grad_op_desc; auto grad_to_var = new std::unordered_map(); CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); op->grad_op_desc_ = grad_op_desc; - op->grad_to_var_ = grad_to_var; - } - op->block_ = block; - } - framework::Scope* GetScope(framework::BlockDesc* block) { - if (scopes_.find(block) != scopes_.end()) { - return scopes_.at(block); + for (auto it : grad_op_desc->Inputs()) { + auto& grad_in_vars = op->grad_input_vars_[it.first]; + for (const std::string& grad_invar : it.second) { + block->FindRecursiveOrCreateVar(grad_invar); + auto var_it = grad_to_var->find(grad_invar); + if (var_it == grad_to_var->end()) { + auto fwd_var_it = vars.find(grad_invar); + PADDLE_ENFORCE(fwd_var_it != vars.end()); + grad_in_vars.push_back(fwd_var_it->second->var_); + } else { + VarBase* var = vars[var_it->second]; + if (!var->grads_->IsInitialized()) { + InitVar(var->var_, var->grads_); + } + grad_in_vars.push_back(var->grads_); + } + } + } + for (auto it : grad_op_desc->Outputs()) { + auto& grad_out_vars = op->grad_output_vars_[it.first]; + for (const std::string& grad_outvar : it.second) { + block->FindRecursiveOrCreateVar(grad_outvar); + auto var_it = grad_to_var->find(grad_outvar); + PADDLE_ENFORCE(var_it != grad_to_var->end()); + VarBase* var = vars[var_it->second]; + if (!var->grads_->IsInitialized()) { + InitVar(var->var_, var->grads_); + } + grad_out_vars.push_back(var->grads_); + } + } } - framework::BlockDesc* parent_block = block->ParentBlock(); - PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end()); - framework::Scope* scope = &scopes_[parent_block]->NewScope(); - scopes_[block] = scope; - return scope; + op->block_ = block; } private: - std::map scopes_; framework::BlockDesc* root_block_; framework::BlockDesc* startup_block_; - framework::Scope* root_scope_; }; } // namespace imperative diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index cb88333d15..f84e1ab6b8 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -69,17 +69,17 @@ void TestWord2vecPrediction(const std::string& model_path) { std::vector outputs; CHECK(predictor->Run(slots, &outputs)); - PADDLE_ENFORCE(outputs.size(), 1UL); + PADDLE_ENFORCE_EQ(outputs.size(), 1UL); // Check the output buffer size and result of each tid. - PADDLE_ENFORCE(outputs.front().data.length(), 33168UL); + PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL); float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, 0.000932706}; const size_t num_elements = outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. for (size_t i = 0; i < std::min(static_cast(5UL), num_elements); i++) { - LOG(INFO) << "data: " - << static_cast(outputs.front().data.data())[i]; + LOG(INFO) << "data: " << static_cast(outputs.front().data.data())[i] + << " result: " << result[i]; PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], result[i]); } diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 83d411eecf..2db5705d09 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -127,6 +127,7 @@ struct Argument { std::function); DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); + DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); // The program transformed by IR analysis phase. DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram, diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 51bca8039d..b8c9426ed3 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -75,6 +75,8 @@ void IRPassManager::CreatePasses(Argument *argument, argument->tensorrt_node_teller_ptr()); pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); + pass->Set("min_subgraph_size", + new int(argument->tensorrt_min_subgraph_size())); } // graph_ = pass->Apply(std::move(graph_)); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 9c42b83e7a..ad10010e42 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" +#include #include #include + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" +#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" namespace paddle { namespace inference { @@ -36,7 +38,8 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( auto teller = Get("tensorrt_node_teller"); - SubGraphFuser fuser(graph.get(), teller, 2 /*min subgraph size*/); + SubGraphFuser fuser(graph.get(), teller, + Get("min_subgraph_size") /*min subgraph size*/); fuser(); for (auto *node : graph->Nodes()) { @@ -197,10 +200,26 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, std::vector ExtractParameters( const std::unordered_set &nodes) { + // We can judge whether a variable is a parameter by + // its presistable property, but sometimes the presistable + // of the feed op output is true, so we have to identify it. + std::vector feed_outputs; + for (const auto &node : nodes) { + if (!node->IsOp()) continue; + std::string op_type = node->Op()->Type(); + if (op_type == "feed") { + std::vector output_names = node->Op()->OutputArgumentNames(); + std::copy(output_names.begin(), output_names.end(), + std::back_inserter(feed_outputs)); + } + } + std::vector parameters; for (const auto &node : nodes) { if (!node->IsVar()) continue; - if (node->Var()->Persistable()) { + if (node->Var()->Persistable() && + std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) == + feed_outputs.end()) { parameters.push_back(node->Name()); } } @@ -215,4 +234,5 @@ REGISTER_PASS(tensorrt_subgraph_pass, paddle::inference::analysis::TensorRtSubgraphPass) .RequirePassAttr("tensorrt_node_teller") .RequirePassAttr("max_batch_size") - .RequirePassAttr("workspace_size"); + .RequirePassAttr("workspace_size") + .RequirePassAttr("min_subgraph_size"); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index dcefdd92f5..6d6e799fde 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -57,6 +57,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { use_tensorrt_ = other.use_tensorrt_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; tensorrt_workspace_size_ = other.tensorrt_workspace_size_; + tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_; model_from_memory_ = other.model_from_memory_; if (use_gpu) { @@ -89,6 +90,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { use_tensorrt_ = other.use_tensorrt_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; tensorrt_workspace_size_ = other.tensorrt_workspace_size_; + tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_; model_from_memory_ = other.model_from_memory_; pass_builder_ = std::move(other.pass_builder_); @@ -105,12 +107,14 @@ void contrib::AnalysisConfig::EnableMKLDNN() { } void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, - int max_batch_size) { + int max_batch_size, + int min_subgraph_size) { use_tensorrt_ = true; tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; - // Append after the infer_clean pass. - pass_builder()->InsertPass(1, "tensorrt_subgraph_pass"); + tensorrt_min_subgraph_size_ = min_subgraph_size; + // Append after the conv+affine_channel fuse pass. + pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); } void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3937884ce4..3f8feaaa1e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -328,6 +328,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetUseTensorRT(true); argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); + argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); } if (config_.use_mkldnn_) { diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc index 569a487328..03c2aa3fb8 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.cc +++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc @@ -25,7 +25,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) { // TODO(Superjomn) should avoid the case when a TensorArray is a // parameter. if (var_name == "feed" || var_name == "fetch") continue; - if (var->Type() == typeid(framework::LoDTensorArray)) { + if (var->IsType()) { VLOG(4) << "collect " << var_name; arrays_.push_back(var->GetMutable()); } diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h index 6a5ea64de6..213c6891d0 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.h +++ b/paddle/fluid/inference/api/details/reset_tensor_array.h @@ -27,8 +27,11 @@ namespace details { // training phase. struct TensorArrayBatchCleaner { TensorArrayBatchCleaner() { - valid_types_.insert(typeid(framework::Tensor)); - valid_types_.insert(typeid(framework::LoDTensor)); + constexpr auto kTensorId = framework::VarTypeTrait::kId; + constexpr auto kLoDTensorId = + framework::VarTypeTrait::kId; + valid_types_.insert(kTensorId); + valid_types_.insert(kLoDTensorId); } // Collect the variables that are not Tensor or LoDTensor, and reset them to a // bool(trick), because some of them are containers, and some operators just @@ -46,7 +49,7 @@ struct TensorArrayBatchCleaner { bool no_tensor_flag_{true}; std::vector arrays_; - std::unordered_set valid_types_; + std::unordered_set valid_types_; std::unordered_set no_tensor_vars_; }; diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 9a393a61c4..7830e85956 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -113,6 +113,16 @@ static void TensorAssignData(PaddleTensor *tensor, } } +template +static void TensorAssignData(PaddleTensor *tensor, + const std::vector> &data, + const std::vector &lod) { + int size = lod[lod.size() - 1]; + tensor->shape.assign({size, 1}); + tensor->lod.assign({lod}); + TensorAssignData(tensor, data); +} + template static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, const std::vector> &data) { diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index f05b9832da..e7ccea6587 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -49,7 +49,7 @@ struct AnalysisConfig : public NativeConfig { bool use_feed_fetch_ops{true}; void EnableTensorRtEngine(int workspace_size = 1 << 20, - int max_batch_size = 1); + int max_batch_size = 1, int min_subgraph_size = 3); bool use_tensorrt() const { return use_tensorrt_; } void EnableMKLDNN(); @@ -69,8 +69,19 @@ struct AnalysisConfig : public NativeConfig { bool use_tensorrt_{false}; bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; + // For workspace_size, refer it from here: + // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting int tensorrt_workspace_size_; + // While TensorRT allows an engine optimized for a given max batch size + // to run at any smaller size, the performance for those smaller + // sizes may not be as well-optimized. Therefore, Max batch is best + // equivalent to the runtime batch size. int tensorrt_max_batchsize_; + // We transform the Ops that can be converted into TRT layer in the model, + // and aggregate these Ops into subgraphs for TRT execution. + // We set this variable to control the minimum number of nodes in the + // subgraph, 3 as default value. + int tensorrt_min_subgraph_size_{3}; std::unique_ptr pass_builder_; bool model_from_memory_{false}; }; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 40ca0d287c..1062ac5f58 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -118,11 +118,13 @@ class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", // - "conv_bn_fuse_pass", // - "conv_elementwise_add_act_fuse_pass", // - "conv_elementwise_add2_act_fuse_pass", // - "conv_elementwise_add_fuse_pass", // + "infer_clean_graph_pass", // + "conv_affine_channel_fuse_pass", // + "conv_eltwiseadd_affine_channel_fuse_pass", // + "conv_bn_fuse_pass", // + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // }); } diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 46ce61b736..9aa9db031c 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -75,6 +75,11 @@ set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac") download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_tester.cc) +# MM DNN +set(MM_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mm_dnn") +download_model_and_data(${MM_DNN_INSTALL_DIR} "MM_DNN_model.tar.gz" "MM_DNN_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_mm_dnn ${MM_DNN_INSTALL_DIR} analyzer_mm_dnn_tester.cc) + # text_classification set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz") @@ -103,6 +108,10 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") +# seq_pool1 +inference_analysis_api_test_with_fake_data(test_analyzer_seq_pool1 +"${INFERENCE_DEMO_INSTALL_DIR}/seq_pool1" analyzer_seq_pool1_tester.cc "seq_pool1.tar.gz") + # mobilenet with depthwise_conv op inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz") diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 142801382b..2213971c17 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -98,10 +98,8 @@ void GetOneBatch(std::vector *input_slots, DataRecord *data, auto one_batch = data->NextBatch(); PaddleTensor input_tensor; input_tensor.name = "word"; - input_tensor.shape.assign({static_cast(one_batch.data.size()), 1}); - input_tensor.lod.assign({one_batch.lod}); input_tensor.dtype = PaddleDType::INT64; - TensorAssignData(&input_tensor, {one_batch.data}); + TensorAssignData(&input_tensor, {one_batch.data}, one_batch.lod); PADDLE_ENFORCE_EQ(batch_size, static_cast(one_batch.lod.size() - 1)); input_slots->assign({input_tensor}); } diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc new file mode 100644 index 0000000000..9d3c751943 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc @@ -0,0 +1,157 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +using contrib::AnalysisConfig; + +struct DataRecord { + std::vector> query, title; + std::vector lod1, lod2; + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. + if (batch_end <= query.size()) { + GetInputPerBatch(query, &data.query, &data.lod1, batch_iter, batch_end); + GetInputPerBatch(title, &data.title, &data.lod2, batch_iter, batch_end); + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, '\t', &data); + // load query data + std::vector query_data; + split_to_int64(data[0], ' ', &query_data); + // load title data + std::vector title_data; + split_to_int64(data[1], ' ', &title_data); + query.push_back(std::move(query_data)); + title.push_back(std::move(title_data)); + } + num_samples = num_lines; + } +}; + +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor lod_query_tensor, lod_title_tensor; + lod_query_tensor.name = "left"; + lod_title_tensor.name = "right"; + auto one_batch = data->NextBatch(); + // assign data + TensorAssignData(&lod_query_tensor, one_batch.query, one_batch.lod1); + TensorAssignData(&lod_title_tensor, one_batch.title, one_batch.lod2); + // Set inputs. + input_slots->assign({lod_query_tensor, lod_title_tensor}); + for (auto &tensor : *input_slots) { + tensor.dtype = PaddleDType::INT64; + } +} + +void SetConfig(contrib::AnalysisConfig *cfg) { + cfg->model_dir = FLAGS_infer_model; + cfg->use_gpu = false; + cfg->device = 0; + cfg->specify_input_name = true; + cfg->enable_ir_optim = true; +} + +void SetInput(std::vector> *inputs) { + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector input_slots; + int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1; + LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size; + for (int bid = 0; bid < epoch; ++bid) { + PrepareInputs(&input_slots, &data, FLAGS_batch_size); + (*inputs).emplace_back(input_slots); + } +} + +// Easy for profiling independently. +TEST(Analyzer_MM_DNN, profile) { + contrib::AnalysisConfig cfg; + SetConfig(&cfg); + std::vector outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); + + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { + PADDLE_ENFORCE_EQ(outputs.size(), 2UL); + for (auto &output : outputs) { + size_t size = GetSize(output); + PADDLE_ENFORCE_GT(size, 0); + float *result = static_cast(output.data.data()); + // output is probability, which is in (-1, 1). + for (size_t i = 0; i < size; i++) { + EXPECT_GT(result[i], -1); + EXPECT_LT(result[i], 1); + } + } + } +} + +// Check the fuse status +TEST(Analyzer_MM_DNN, fuse_statis) { + contrib::AnalysisConfig cfg; + SetConfig(&cfg); + + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); +} + +// Compare result of NativeConfig and AnalysisConfig +TEST(Analyzer_MM_DNN, compare) { + contrib::AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +// Compare Deterministic result +TEST(Analyzer_MM_DNN, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareDeterministic(reinterpret_cast(&cfg), + input_slots_all); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index f19a2ed59e..f8635968ce 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -19,11 +19,9 @@ namespace inference { using contrib::AnalysisConfig; struct DataRecord { - std::vector> word_data_all, mention_data_all; + std::vector> word, mention; std::vector lod; // two inputs have the same lod info. - size_t batch_iter{0}; - size_t batch_size{1}; - size_t num_samples; // total number of samples + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples DataRecord() = default; explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { @@ -33,20 +31,10 @@ struct DataRecord { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. - if (batch_end <= word_data_all.size()) { - data.word_data_all.assign(word_data_all.begin() + batch_iter, - word_data_all.begin() + batch_end); - data.mention_data_all.assign(mention_data_all.begin() + batch_iter, - mention_data_all.begin() + batch_end); - // Prepare LoDs - data.lod.push_back(0); - CHECK(!data.word_data_all.empty()); - CHECK(!data.mention_data_all.empty()); - CHECK_EQ(data.word_data_all.size(), data.mention_data_all.size()); - for (size_t j = 0; j < data.word_data_all.size(); j++) { - // calculate lod - data.lod.push_back(data.lod.back() + data.word_data_all[j].size()); - } + if (batch_end <= word.size()) { + GetInputPerBatch(word, &data.word, &data.lod, batch_iter, batch_end); + GetInputPerBatch(mention, &data.mention, &data.lod, batch_iter, + batch_end); } batch_iter += batch_size; return data; @@ -65,8 +53,8 @@ struct DataRecord { // load mention data std::vector mention_data; split_to_int64(data[3], ' ', &mention_data); - word_data_all.push_back(std::move(word_data)); - mention_data_all.push_back(std::move(mention_data)); + word.push_back(std::move(word_data)); + mention.push_back(std::move(mention_data)); } num_samples = num_lines; } @@ -78,14 +66,10 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, lod_word_tensor.name = "word"; lod_mention_tensor.name = "mention"; auto one_batch = data->NextBatch(); - int size = one_batch.lod[one_batch.lod.size() - 1]; // token batch size - lod_word_tensor.shape.assign({size, 1}); - lod_word_tensor.lod.assign({one_batch.lod}); - lod_mention_tensor.shape.assign({size, 1}); - lod_mention_tensor.lod.assign({one_batch.lod}); // assign data - TensorAssignData(&lod_word_tensor, one_batch.word_data_all); - TensorAssignData(&lod_mention_tensor, one_batch.mention_data_all); + TensorAssignData(&lod_word_tensor, one_batch.word, one_batch.lod); + TensorAssignData(&lod_mention_tensor, one_batch.mention, + one_batch.lod); // Set inputs. input_slots->assign({lod_word_tensor, lod_mention_tensor}); for (auto &tensor : *input_slots) { diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index f5082cd60f..e6d6cd2960 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -18,12 +18,9 @@ namespace paddle { namespace inference { struct DataRecord { - std::vector> title1_all, title2_all, title3_all, l1_all; std::vector> title1, title2, title3, l1; - std::vector title1_lod, title2_lod, title3_lod, l1_lod; - size_t batch_iter{0}; - size_t batch_size{1}; - size_t num_samples; // total number of samples + std::vector lod1, lod2, lod3, l1_lod; + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples DataRecord() = default; explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { @@ -33,41 +30,11 @@ struct DataRecord { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. - if (batch_end <= title1_all.size()) { - data.title1_all.assign(title1_all.begin() + batch_iter, - title1_all.begin() + batch_end); - data.title2_all.assign(title2_all.begin() + batch_iter, - title2_all.begin() + batch_end); - data.title3_all.assign(title3_all.begin() + batch_iter, - title3_all.begin() + batch_end); - data.l1_all.assign(l1_all.begin() + batch_iter, - l1_all.begin() + batch_end); - // Prepare LoDs - data.title1_lod.push_back(0); - data.title2_lod.push_back(0); - data.title3_lod.push_back(0); - data.l1_lod.push_back(0); - CHECK(!data.title1_all.empty()); - CHECK(!data.title2_all.empty()); - CHECK(!data.title3_all.empty()); - CHECK(!data.l1_all.empty()); - CHECK_EQ(data.title1_all.size(), data.title2_all.size()); - CHECK_EQ(data.title1_all.size(), data.title3_all.size()); - CHECK_EQ(data.title1_all.size(), data.l1_all.size()); - for (size_t j = 0; j < data.title1_all.size(); j++) { - data.title1.push_back(data.title1_all[j]); - data.title2.push_back(data.title2_all[j]); - data.title3.push_back(data.title3_all[j]); - data.l1.push_back(data.l1_all[j]); - // calculate lod - data.title1_lod.push_back(data.title1_lod.back() + - data.title1_all[j].size()); - data.title2_lod.push_back(data.title2_lod.back() + - data.title2_all[j].size()); - data.title3_lod.push_back(data.title3_lod.back() + - data.title3_all[j].size()); - data.l1_lod.push_back(data.l1_lod.back() + data.l1_all[j].size()); - } + if (batch_end <= title1.size()) { + GetInputPerBatch(title1, &data.title1, &data.lod1, batch_iter, batch_end); + GetInputPerBatch(title2, &data.title2, &data.lod2, batch_iter, batch_end); + GetInputPerBatch(title3, &data.title3, &data.lod3, batch_iter, batch_end); + GetInputPerBatch(l1, &data.l1, &data.l1_lod, batch_iter, batch_end); } batch_iter += batch_size; return data; @@ -92,10 +59,10 @@ struct DataRecord { // load l1 data std::vector l1_data; split_to_int64(data[3], ' ', &l1_data); - title1_all.push_back(std::move(title1_data)); - title2_all.push_back(std::move(title2_data)); - title3_all.push_back(std::move(title3_data)); - l1_all.push_back(std::move(l1_data)); + title1.push_back(std::move(title1_data)); + title2.push_back(std::move(title2_data)); + title3.push_back(std::move(title3_data)); + l1.push_back(std::move(l1_data)); } num_samples = num_lines; } @@ -109,24 +76,11 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, title3_tensor.name = "title3"; l1_tensor.name = "l1"; auto one_batch = data->NextBatch(); - int title1_size = one_batch.title1_lod[one_batch.title1_lod.size() - 1]; - title1_tensor.shape.assign({title1_size, 1}); - title1_tensor.lod.assign({one_batch.title1_lod}); - int title2_size = one_batch.title2_lod[one_batch.title2_lod.size() - 1]; - title2_tensor.shape.assign({title2_size, 1}); - title2_tensor.lod.assign({one_batch.title2_lod}); - int title3_size = one_batch.title3_lod[one_batch.title3_lod.size() - 1]; - title3_tensor.shape.assign({title3_size, 1}); - title3_tensor.lod.assign({one_batch.title3_lod}); - int l1_size = one_batch.l1_lod[one_batch.l1_lod.size() - 1]; - l1_tensor.shape.assign({l1_size, 1}); - l1_tensor.lod.assign({one_batch.l1_lod}); - // assign data - TensorAssignData(&title1_tensor, one_batch.title1); - TensorAssignData(&title2_tensor, one_batch.title2); - TensorAssignData(&title3_tensor, one_batch.title3); - TensorAssignData(&l1_tensor, one_batch.l1); + TensorAssignData(&title1_tensor, one_batch.title1, one_batch.lod1); + TensorAssignData(&title2_tensor, one_batch.title2, one_batch.lod2); + TensorAssignData(&title3_tensor, one_batch.title3, one_batch.lod3); + TensorAssignData(&l1_tensor, one_batch.l1, one_batch.l1_lod); // Set inputs. input_slots->assign({title1_tensor, title2_tensor, title3_tensor, l1_tensor}); for (auto &tensor : *input_slots) { diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc new file mode 100644 index 0000000000..2ae840fd11 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -0,0 +1,117 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void SetConfig(AnalysisConfig *cfg) { + cfg->param_file = FLAGS_infer_model + "/params"; + cfg->prog_file = FLAGS_infer_model + "/model"; + cfg->use_gpu = false; + cfg->device = 0; + cfg->enable_ir_optim = true; + cfg->specify_input_name = true; + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); +} + +void SetInput(std::vector> *inputs) { + std::vector feed_names = { + "slot10000_embed", "slot10001_embed", "slot10004_embed", + "slot10005_embed", "slot10008_embed", "slot10009_embed", + "slot10012_embed", "slot10013_embed", "slot10108_embed", + "slot13324_embed", "slot13325_embed", "slot13326_embed", + "slot13327_embed", "slot13328_embed", "slot13329_embed", + "slot13330_embed", "slot13331_embed", "slot15501_embed", + "slot15502_embed", "slot15503_embed", "slot15504_embed", + "slot15505_embed", "slot15506_embed", "slot15507_embed", + "slot15508_embed", "slot15516_embed", "slot15519_embed", + "slot15523_embed", "slot15531_embed", "slot15533_embed", + "slot15548_embed", "slot15564_embed", "slot15565_embed", + "slot15566_embed", "slot15570_embed", "slot15571_embed", + "slot15572_embed", "slot15573_embed", "slot15574_embed", + "slot15575_embed", "slot15576_embed", "slot15577_embed", + "slot15579_embed", "slot15581_embed", "slot15582_embed", + "slot15583_embed", "slot15584_embed", "slot5016_embed", + "slot5021_embed", "slot6002_embed", "slot6003_embed", + "slot6004_embed", "slot6005_embed", "slot6006_embed", + "slot6007_embed", "slot6008_embed", "slot6009_embed", + "slot6011_embed", "slot6014_embed", "slot6015_embed", + "slot6023_embed", "slot6024_embed", "slot6025_embed", + "slot6027_embed", "slot6029_embed", "slot6031_embed", + "slot6034_embed", "slot6035_embed", "slot6036_embed", + "slot6037_embed", "slot6039_embed", "slot6048_embed", + "slot6050_embed", "slot6058_embed", "slot6059_embed", + "slot6060_embed", "slot6066_embed", "slot6067_embed", + "slot6068_embed", "slot6069_embed", "slot6070_embed", + "slot6071_embed", "slot6072_embed", "slot6073_embed", + "slot6182_embed", "slot6183_embed", "slot6184_embed", + "slot6185_embed", "slot6186_embed", "slot6188_embed", + "slot6189_embed", "slot6190_embed", "slot6201_embed", + "slot6202_embed", "slot6203_embed", "slot6247_embed", + "slot6248_embed", "slot6250_embed", "slot6251_embed", + "slot6807_embed", "slot6808_embed", "slot6809_embed", + "slot6810_embed", "slot6811_embed", "slot6812_embed", + "slot6813_embed", "slot6814_embed", "slot6815_embed", + "slot6816_embed", "slot6817_embed", "slot6818_embed", + "slot6819_embed", "slot6820_embed", "slot6822_embed", + "slot6823_embed", "slot6826_embed", "slot7002_embed", + "slot7003_embed", "slot7004_embed", "slot7005_embed", + "slot7006_embed", "slot7008_embed", "slot7009_embed", + "slot7010_embed", "slot7011_embed", "slot7013_embed", + "slot7014_embed", "slot7015_embed", "slot7016_embed", + "slot7017_embed", "slot7019_embed", "slot7100_embed", + "slot7506_embed", "slot7507_embed", "slot7514_embed", + "slot7515_embed", "slot7516_embed"}; + SetFakeImageInput(inputs, FLAGS_infer_model, true, "model", "params", + &feed_names); +} + +// Easy for profiling independently. +void profile(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } + std::vector outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); +} + +TEST(Analyzer_seq_pool1, profile) { profile(); } + +// Check the fuse status +TEST(Analyzer_seq_pool1, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); + LOG(INFO) << "num_ops: " << num_ops; + EXPECT_EQ(num_ops, 314); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index b0c8f395ce..7eb44d9f4e 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -132,7 +132,8 @@ std::unordered_map GetFuseStatis(PaddlePredictor *predictor, void SetFakeImageInput(std::vector> *inputs, const std::string &dirname, bool is_combined = true, std::string model_filename = "model", - std::string params_filename = "params") { + std::string params_filename = "params", + const std::vector *feed_names = nullptr) { // Set fake_image_data PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); std::vector> feed_target_shapes = GetFeedTargetShapes( @@ -146,29 +147,47 @@ void SetFakeImageInput(std::vector> *inputs, os << "}\n"; } LOG(INFO) << os.str(); - - int dim1 = feed_target_shapes[0][1]; - int dim2 = feed_target_shapes[0][2]; - int dim3 = feed_target_shapes[0][3]; - - PaddleTensor input; - std::vector shape({FLAGS_batch_size, dim1, dim2, dim3}); - input.shape = shape; - input.dtype = PaddleDType::FLOAT32; - - // fill input data, for profile easily, do not use random data here. - size_t size = FLAGS_batch_size * dim1 * dim2 * dim3; - input.data.Resize(size * sizeof(float)); - float *input_data = static_cast(input.data.data()); - for (size_t i = 0; i < size; i++) { - *(input_data + i) = static_cast(i) / size; + if (feed_names) { + PADDLE_ENFORCE_EQ(feed_names->size(), feed_target_shapes.size()); + } + std::vector input_slots(feed_target_shapes.size()); + for (size_t i = 0; i < feed_target_shapes.size(); ++i) { + const auto &feed_shape = feed_target_shapes[i]; + auto &input = input_slots[i]; + std::vector shape({FLAGS_batch_size}); + for (size_t s = 1; s < feed_shape.size(); ++s) { + shape.push_back(static_cast(feed_shape[s])); + } + if (feed_names) { + input.name = (*feed_names)[i]; + } + input.shape = shape; + input.dtype = PaddleDType::FLOAT32; + size_t len = std::accumulate(shape.begin(), shape.end(), 1, + [](int a, int b) { return a * b; }); + input.data.Resize(len * sizeof(float)); + input.lod.assign({{0, static_cast(FLAGS_batch_size)}}); + float *input_data = static_cast(input.data.data()); + // fill input data, for profile easily, do not use random data here. + for (size_t j = 0; j < len; ++j) { + *(input_data + j) = static_cast(j) / len; + } } - - std::vector input_slots; - input_slots.assign({input}); (*inputs).emplace_back(input_slots); } +void GetInputPerBatch(const std::vector> &in, + std::vector> *out, + std::vector *lod, size_t batch_iter, + size_t batch_end) { + lod->clear(); + lod->push_back(0); + for (auto it = in.begin() + batch_iter; it < in.begin() + batch_end; it++) { + out->push_back(*it); + lod->push_back(lod->back() + (*it).size()); // calculate lod + } +} + void TestOneThreadPrediction( const PaddlePredictor::Config *config, const std::vector> &inputs, diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index ab3a30ce6b..29f0f034a2 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -3,14 +3,16 @@ set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING "A path setting inference demo download directories.") function (inference_download install_dir url filename) message(STATUS "Download inference test stuff from ${url}/${filename}") - execute_process(COMMAND bash -c "mkdir -p ${install_dir}") - execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}") + file(DOWNLOAD "${url}/${filename}" "${install_dir}/${filename}") message(STATUS "finish downloading ${filename}") endfunction() function (inference_download_and_uncompress install_dir url filename) inference_download(${install_dir} ${url} ${filename}) - execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") + execute_process( + COMMAND ${CMAKE_COMMAND} -E tar xzf ${install_dir}/${filename} + WORKING_DIRECTORY ${install_dir} + ) endfunction() set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 4a14eb941c..ee15420775 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -46,7 +46,7 @@ endif() register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) # warpctc_op needs cudnn 7 above -if (WITH_GPU AND NOT WIN32) +if (WITH_GPU) if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) else() diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 855c4d7067..49e734ce96 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -64,7 +64,7 @@ class ClipByNormKernel : public framework::OpKernel { output->mutable_data(context.GetPlace()); } else { PADDLE_THROW("Unexpected branch, input variable type is %s", - in_var->Type().name()); + framework::ToTypeName(in_var->Type())); } PADDLE_ENFORCE_NOT_NULL(input); diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 48800947fd..0360cf5273 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -175,14 +175,13 @@ class WhileGradOp : public framework::OperatorBase { auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name), "Cannot find inside gradient %s", inside_og_name); - if (framework::IsType(og_outside.Type())) { + if (og_outside.IsType()) { auto &outside_tensor = og_outside.Get(); auto &inside_tensor = detail::Ref(og_inside.GetMutable()); inside_tensor.set_lod(outside_tensor.lod()); inside_tensor.ShareDataWith(outside_tensor); - } else if (framework::IsType( - og_outside.Type())) { + } else if (og_outside.IsType()) { auto &outside_array = og_outside.Get(); auto &inside_array = detail::Ref(og_inside.GetMutable()); @@ -256,7 +255,7 @@ class WhileGradOp : public framework::OperatorBase { var->IsType(), "Currently the type of var only can be LoDTensorArray, " "or LoDTensor, but the received var[%s] is %s.", - inside_grad_name, var->Type().name()); + inside_grad_name, framework::ToTypeName(var->Type())); if (var->IsType()) { auto &inside_tensor = var->Get(); diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index 3235ad52b9..e73762f5fb 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -22,7 +22,7 @@ DECLARE_bool(cudnn_exhaustive_search); namespace paddle { namespace operators { -#if CUDNN_VERSION >= 7001 +#if CUDNN_VERSION >= 7100 using Tensor = framework::Tensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; @@ -161,9 +161,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - if ((activation == "identity") && - (algo != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) && - (!residual)) { + if ((activation == "identity") && (!residual)) { // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. // But test in some case, the speed is slower, change to use @@ -204,7 +202,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -#if CUDNN_VERSION >= 7001 +#if CUDNN_VERSION >= 7100 namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel, ops::CUDNNConvFusionOpKernel); diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 4a7b31c7d4..2519f5e7ac 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -18,11 +18,11 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" -#include "paddle/fluid/platform/create_tensor_with_allocationptr.h" namespace paddle { namespace operators { @@ -161,10 +161,7 @@ class GemmConvKernel : public framework::OpKernel { auto tmp_allocation_ptr = platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( framework::product(col_shape) * sizeof(T)); - Tensor tep_tensor = - platform::GetTensor(std::move(tmp_allocation_ptr), col_shape); - - col.ShareDataWith(tep_tensor); + col = framework::GetTensor(std::move(tmp_allocation_ptr), col_shape); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } @@ -299,10 +296,7 @@ class GemmConvGradKernel : public framework::OpKernel { auto tmp_allocation_ptr = platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( framework::product(col_shape) * sizeof(T)); - Tensor tep_tensor = - platform::GetTensor(std::move(tmp_allocation_ptr), col_shape); - - col.ShareDataWith(tep_tensor); + col = framework::GetTensor(std::move(tmp_allocation_ptr), col_shape); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index f2ba75485c..fae0925149 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cudnn_helper.h" namespace paddle { namespace operators { @@ -22,239 +22,6 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; -struct CudnnRNNCache { - CudnnRNNCache() { - x_desc_ = NULL; - y_desc_ = NULL; - dx_desc_ = NULL; - dy_desc_ = NULL; - } - ~CudnnRNNCache() { release(); } - - cudnnRNNDescriptor_t rnn_desc_; - cudnnTensorDescriptor_t *x_desc_; - cudnnTensorDescriptor_t *y_desc_; - cudnnTensorDescriptor_t *dx_desc_; - cudnnTensorDescriptor_t *dy_desc_; - - cudnnTensorDescriptor_t hx_desc_; - cudnnTensorDescriptor_t cx_desc_; - cudnnTensorDescriptor_t hy_desc_; - cudnnTensorDescriptor_t cy_desc_; - - cudnnTensorDescriptor_t dhx_desc_; - cudnnTensorDescriptor_t dcx_desc_; - cudnnTensorDescriptor_t dhy_desc_; - cudnnTensorDescriptor_t dcy_desc_; - - cudnnTensorDescriptor_t output_x_desc_; - cudnnTensorDescriptor_t output_y_desc_; - - cudnnDropoutDescriptor_t dropout_desc_; - - size_t weights_size_; - cudnnFilterDescriptor_t w_desc_; - cudnnFilterDescriptor_t dw_desc_; - - size_t workspace_size_; - size_t reserve_size_; - Tensor reserve_data_; - Tensor workspace_data_; - - Tensor dropout_state_; - - size_t max_length_; - - float dropout_prob_; - bool is_bidirec_; - - int batch_size_; - int input_size_; - int hidden_size_; - int num_layers_; - int seed_; - - void init(cudnnHandle_t handle, const framework::ExecutionContext &ctx, - size_t max_len, int batch_size, int input_size, int hidden_size, - int num_layers, float dropout_prob, bool is_bidirec, int seed, - int weight_numel) { - max_length_ = max_len; - batch_size_ = batch_size; - input_size_ = input_size; - hidden_size_ = hidden_size; - num_layers_ = num_layers; - dropout_prob_ = dropout_prob; - is_bidirec_ = is_bidirec; - seed_ = seed; - - x_desc_ = new cudnnTensorDescriptor_t[max_length_]; - y_desc_ = new cudnnTensorDescriptor_t[max_length_]; - dx_desc_ = new cudnnTensorDescriptor_t[max_length_]; - dy_desc_ = new cudnnTensorDescriptor_t[max_length_]; - int dim_a[3]; - int stride_a[3]; - - for (size_t i = 0; i < max_length_; ++i) { - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i])); - dim_a[0] = batch_size_; - dim_a[1] = input_size_; - dim_a[2] = 1; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - - dim_a[0] = batch_size_; - dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_; - dim_a[2] = 1; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - } - - dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1); - dim_a[1] = batch_size_; - dim_a[2] = hidden_size_; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - - CUDNN_ENFORCE( - platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); - - size_t state_size; - CUDNN_ENFORCE( - platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size); - dropout_state_.Resize({static_cast(state_size)})); - auto *dropout_state_data = - dropout_state_.mutable_data(ctx.GetPlace()); - CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor( - dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, - seed_)); - - CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); - -#if CUDNN_VERSION >= 6000 - CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( - handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, - CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); -#else - CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor( - rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - CUDNN_DATA_FLOAT)); -#endif - - CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize( - handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT)); - - PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel, - "cudnn lstm weight size should be SAME"); - int dim_w[3]; - dim_w[0] = weights_size_ / sizeof(float); - dim_w[1] = 1; - dim_w[2] = 1; - CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( - w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); - CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( - dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); - - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize( - handle, rnn_desc_, max_length_, x_desc_, &workspace_size_)); - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize( - handle, rnn_desc_, max_length_, x_desc_, &reserve_size_)); - - reserve_data_.Resize({static_cast(reserve_size_)}); - reserve_data_.mutable_data(ctx.GetPlace()); - - workspace_data_.Resize({static_cast(workspace_size_)}); - workspace_data_.mutable_data(ctx.GetPlace()); - } - - void release() { - for (size_t i = 0; i < max_length_; ++i) { - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i])); - } - - delete[] x_desc_; - delete[] y_desc_; - delete[] dx_desc_; - delete[] dy_desc_; - - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); - - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); - } -}; - template class CudnnLSTMGPUKernel : public framework::OpKernel { public: @@ -315,9 +82,9 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { auto input_w_numel = w->numel(); auto batch_size = x->dims()[1]; - cudnn_rnn_cache->init(handle, ctx, max_len, batch_size, input_size, - hidden_size, num_layers, dropout_prob, is_bidirec, - seed, input_w_numel); + cudnn_rnn_cache->init(handle, ctx.GetPlace(), max_len, batch_size, + input_size, hidden_size, num_layers, dropout_prob, + is_bidirec, seed, input_w_numel); } auto run_seq_len = x->dims()[0]; diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h new file mode 100644 index 0000000000..7f18b83927 --- /dev/null +++ b/paddle/fluid/operators/cudnn_rnn_cache.h @@ -0,0 +1,255 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +struct CudnnRNNCache { + CudnnRNNCache() { + x_desc_ = NULL; + y_desc_ = NULL; + dx_desc_ = NULL; + dy_desc_ = NULL; + } + ~CudnnRNNCache() { release(); } + + cudnnRNNDescriptor_t rnn_desc_; + cudnnTensorDescriptor_t *x_desc_; + cudnnTensorDescriptor_t *y_desc_; + cudnnTensorDescriptor_t *dx_desc_; + cudnnTensorDescriptor_t *dy_desc_; + + cudnnTensorDescriptor_t hx_desc_; + cudnnTensorDescriptor_t cx_desc_; + cudnnTensorDescriptor_t hy_desc_; + cudnnTensorDescriptor_t cy_desc_; + + cudnnTensorDescriptor_t dhx_desc_; + cudnnTensorDescriptor_t dcx_desc_; + cudnnTensorDescriptor_t dhy_desc_; + cudnnTensorDescriptor_t dcy_desc_; + + cudnnTensorDescriptor_t output_x_desc_; + cudnnTensorDescriptor_t output_y_desc_; + + cudnnDropoutDescriptor_t dropout_desc_; + + size_t weights_size_; + cudnnFilterDescriptor_t w_desc_; + cudnnFilterDescriptor_t dw_desc_; + + size_t workspace_size_; + size_t reserve_size_; + framework::Tensor reserve_data_; + framework::Tensor workspace_data_; + + framework::Tensor dropout_state_; + + size_t max_length_; + + float dropout_prob_; + bool is_bidirec_; + + int batch_size_; + int input_size_; + int hidden_size_; + int num_layers_; + int seed_; + + void init(cudnnHandle_t handle, const platform::Place &place, size_t max_len, + int batch_size, int input_size, int hidden_size, int num_layers, + float dropout_prob, bool is_bidirec, int seed, int weight_numel) { + max_length_ = max_len; + batch_size_ = batch_size; + input_size_ = input_size; + hidden_size_ = hidden_size; + num_layers_ = num_layers; + dropout_prob_ = dropout_prob; + is_bidirec_ = is_bidirec; + seed_ = seed; + + x_desc_ = new cudnnTensorDescriptor_t[max_length_]; + y_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dx_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dy_desc_ = new cudnnTensorDescriptor_t[max_length_]; + int dim_a[3]; + int stride_a[3]; + + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i])); + dim_a[0] = batch_size_; + dim_a[1] = input_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + dim_a[0] = batch_size_; + dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + } + + dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1); + dim_a[1] = batch_size_; + dim_a[2] = hidden_size_; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + CUDNN_ENFORCE( + platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); + + size_t state_size; + CUDNN_ENFORCE( + platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size); + dropout_state_.Resize({static_cast(state_size)})); + auto *dropout_state_data = dropout_state_.mutable_data(place); + CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor( + dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, + seed_)); + + CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); + +#if CUDNN_VERSION >= 6000 + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( + handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, + CUDNN_LINEAR_INPUT, + is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); +#else + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor( + rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, + is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_DATA_FLOAT)); +#endif + + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize( + handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT)); + + PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel, + "cudnn lstm weight size should be SAME"); + int dim_w[3]; + dim_w[0] = weights_size_ / sizeof(float); + dim_w[1] = 1; + dim_w[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize( + handle, rnn_desc_, max_length_, x_desc_, &workspace_size_)); + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize( + handle, rnn_desc_, max_length_, x_desc_, &reserve_size_)); + + reserve_data_.Resize({static_cast(reserve_size_)}); + reserve_data_.mutable_data(place); + + workspace_data_.Resize({static_cast(workspace_size_)}); + workspace_data_.mutable_data(place); + } + + void release() { + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i])); + } + + delete[] x_desc_; + delete[] y_desc_; + delete[] dx_desc_; + delete[] dy_desc_; + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); + + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h index 999fdcff90..7c0fda4169 100644 --- a/paddle/fluid/operators/cum_op.h +++ b/paddle/fluid/operators/cum_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/detail/safe_ref.h b/paddle/fluid/operators/detail/safe_ref.h index a800d5df0a..8660bc219c 100644 --- a/paddle/fluid/operators/detail/safe_ref.h +++ b/paddle/fluid/operators/detail/safe_ref.h @@ -25,7 +25,7 @@ namespace detail { */ template inline T& Ref(T* ptr, ARGS&&... args) { - PADDLE_ENFORCE(ptr != nullptr, args...); + PADDLE_ENFORCE(ptr != nullptr, ::paddle::string::Sprintf(args...)); return *ptr; } diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h index d2b0eb6ca6..27ca1f4edc 100644 --- a/paddle/fluid/operators/distributed/proto_encoder_helper.h +++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h @@ -84,7 +84,9 @@ class ProtoEncodeHelper { ~ProtoEncodeHelper() { #define REPLACE_ENFORCE_GLOG 1 // Make sure callers didn't do operations that went over max_size promised - paddle::platform::throw_on_error(p_ <= limit_); + if (paddle::platform::is_error(p_ <= limit_)) { + paddle::platform::throw_on_error(p_ <= limit_); + } #undef REPLACE_ENFORCE_GLOG } diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt index 3c0b7ff24f..a8bb597cbd 100644 --- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt +++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt @@ -33,7 +33,7 @@ register_operators(EXCLUDES gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS}) if(WITH_GPU AND NOT WIN32) set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} nccl_common) - op_library(gen_nccl_id_op ${DISTRIBUTE_DEPS} nccl_common) + op_library(gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS} nccl_common) endif() set(OPERATOR_DEPS ${OPERATOR_DEPS} ${DISTRIBUTE_DEPS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/distributed_ops/split_ids_op.h b/paddle/fluid/operators/distributed_ops/split_ids_op.h index acc9b1e622..6676ecd1c8 100644 --- a/paddle/fluid/operators/distributed_ops/split_ids_op.h +++ b/paddle/fluid/operators/distributed_ops/split_ids_op.h @@ -116,7 +116,7 @@ class SplitIdsOpKernel : public framework::OpKernel { } else { PADDLE_THROW( "% should be LoDTensor or SelectedRows, but the received type is %s", - ctx.Inputs("Ids")[0], ids_var->Type().name()); + ctx.Inputs("Ids")[0], framework::ToTypeName(ids_var->Type())); } } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index 1a149298fd..ae669f5525 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_div_op.h" +#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( elementwise_div, ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel); REGISTER_OP_CUDA_KERNEL( elementwise_div_grad, ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); + elementwise_mul, ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index a8b8a67a11..7a7a3989c0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -83,7 +83,7 @@ class ElementwiseMulKernel : public framework::OpKernel { z = ctx.Output("Out"); } else { PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", - x_var->Type().name()); + framework::ToTypeName(x_var->Type())); } z->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 38cb33e790..c86430524e 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -12,68 +12,40 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/fill_constant_op.h" namespace paddle { namespace operators { -class FillConstantInferShape : public framework::InferShapeBase { +class FillConstantOp : public framework::OperatorWithKernel { public: - void operator()(framework::InferShapeContext *ctx) const override { + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of FillConstantOp should not be null."); - auto &shape = ctx->Attrs().Get>("shape"); + auto& shape = ctx->Attrs().Get>("shape"); ctx->SetOutputDim("Out", framework::make_ddim(shape)); } -}; - -class FillConstantOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - auto data_type = - static_cast(Attr("dtype")); - auto value = Attr("value"); - auto force_cpu = Attr("force_cpu"); - - framework::Tensor *tensor = nullptr; - auto &out_var = *scope.FindVar(Output("Out")); - - if (out_var.IsType()) { - tensor = out_var.GetMutable(); - tensor->Resize(framework::make_ddim(Attr>("shape"))); - } else if (out_var.IsType()) { - tensor = out_var.GetMutable()->mutable_value(); - tensor->Resize(framework::make_ddim(Attr>("shape"))); - } else { - PADDLE_THROW( - "fill constant op's output only" - "supports SelectedRows and LoDTensor"); - } - - if (force_cpu) { - auto cpu = platform::CPUPlace(); - tensor->mutable_data(cpu, data_type); - } else { - tensor->mutable_data(dev_place, data_type); - } - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(dev_place); - math::set_constant(dev_ctx, tensor, value); + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::proto::VarType::Type(ctx.Attr("dtype")), + ctx.GetPlace()); } }; class FillConstantOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override {} + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto data_type = static_cast( + boost::get(op_desc.GetAttr("dtype"))); + auto& out_var_name = op_desc.Output("Out").front(); + block->Var(out_var_name)->SetDataType(data_type); + } }; class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { @@ -107,7 +79,13 @@ Fill up a variable with specified constant value. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, - ops::FillConstantInferShape, ops::FillConstantOpMaker, - paddle::framework::EmptyGradOpMaker, - ops::FillConstantOpVarTypeInference); + +REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker, + ops::FillConstantOpVarTypeInference, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel); diff --git a/paddle/fluid/operators/fill_constant_op.cu.cc b/paddle/fluid/operators/fill_constant_op.cu.cc new file mode 100644 index 0000000000..77027b5a87 --- /dev/null +++ b/paddle/fluid/operators/fill_constant_op.cu.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fill_constant_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(fill_constant, ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel); diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h new file mode 100644 index 0000000000..417c5b4da6 --- /dev/null +++ b/paddle/fluid/operators/fill_constant_op.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +template +class FillConstantKernel : public framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext &ctx) const override { + auto data_type = + static_cast(ctx.Attr("dtype")); + auto value = ctx.Attr("value"); + auto force_cpu = ctx.Attr("force_cpu"); + + framework::Tensor *tensor = nullptr; + + framework::Variable *out_var = ctx.OutputVar("Out"); + + if (out_var->IsType()) { + tensor = out_var->GetMutable(); + tensor->Resize( + framework::make_ddim(ctx.Attr>("shape"))); + } else if (out_var->IsType()) { + tensor = out_var->GetMutable()->mutable_value(); + tensor->Resize( + framework::make_ddim(ctx.Attr>("shape"))); + } else { + PADDLE_THROW( + "fill constant op's output only" + "supports SelectedRows and LoDTensor"); + } + + if (force_cpu) { + tensor->mutable_data(platform::CPUPlace(), data_type); + } else { + tensor->mutable_data(ctx.GetPlace(), data_type); + } + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(ctx.GetPlace()); + math::set_constant(dev_ctx, tensor, value); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc index 9538177460..e80a703c30 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fill_zeros_like_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( @@ -22,4 +23,6 @@ REGISTER_OP_CUDA_KERNEL( ops::FillZerosLikeKernel, ops::FillZerosLikeKernel, ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, ops::FillZerosLikeKernel); diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index 0a18882e81..4e4f977fcc 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -50,8 +50,8 @@ template class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(std::is_same::value, - "MKLDNN LRN must use float data."); + const bool is_float_type = std::is_same::value; + PADDLE_ENFORCE(is_float_type, "MKLDNN LRN must use float data."); PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "MKLDNN LRN must use CPUPlace."); @@ -132,8 +132,8 @@ template class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(std::is_same::value, - "MKLDNN LRN must use float data."); + const bool is_float_type = std::is_same::value; + PADDLE_ENFORCE(is_float_type, "MKLDNN LRN must use float data."); PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "MKLDNN LRN must use CPUPlace."); PADDLE_ENFORCE( diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index b10a19b658..e925e7bb59 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -131,8 +131,9 @@ class ConcatFunctor { int in_col = input[0].numel() / in_row; int out_row = in_row, out_col = 0; - std::vector inputs_data(in_num); + std::vector inputs_data; std::vector inputs_col(in_num + 1); + inputs_data.reserve(in_num); inputs_col[0] = 0; bool sameShape = true; @@ -143,7 +144,7 @@ class ConcatFunctor { } out_col += t_cols; inputs_col[i + 1] = out_col; - inputs_data[i] = const_cast(input[i].data()); + inputs_data.emplace_back(input[i].data()); } // computation diff --git a/paddle/fluid/operators/metrics/accuracy_op.cu b/paddle/fluid/operators/metrics/accuracy_op.cu index b255d2a7c4..4682940f7e 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cu +++ b/paddle/fluid/operators/metrics/accuracy_op.cu @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/metrics/accuracy_op.h" #include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/gpu_info.h" namespace paddle { @@ -94,6 +95,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { // FIXME(typhoonzero): types of T is for inference data. // label data is always int64 -REGISTER_OP_CUDA_KERNEL(accuracy, - paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + accuracy, paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/fluid/operators/optimizers/adadelta_op.h b/paddle/fluid/operators/optimizers/adadelta_op.h index 6c616aa03d..3f51bb0b3d 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.h +++ b/paddle/fluid/operators/optimizers/adadelta_op.h @@ -27,12 +27,14 @@ class AdadeltaOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto avg_squared_grad_out_tensor = diff --git a/paddle/fluid/operators/optimizers/adagrad_op.h b/paddle/fluid/operators/optimizers/adagrad_op.h index 9f6ef39169..13455fc42c 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.h +++ b/paddle/fluid/operators/optimizers/adagrad_op.h @@ -50,7 +50,8 @@ class AdagradOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); auto *param_out_tensor = ctx.Output("ParamOut"); auto *moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 1138bb7400..5c559484ec 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -347,7 +347,8 @@ class AdamOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); using paddle::framework::LoDTensor; using paddle::operators::detail::Ref; diff --git a/paddle/fluid/operators/optimizers/adamax_op.h b/paddle/fluid/operators/optimizers/adamax_op.h index 7137fbd965..55d25ecbdd 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.h +++ b/paddle/fluid/operators/optimizers/adamax_op.h @@ -27,12 +27,14 @@ class AdamaxOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h index 5df43d33ef..4abd436927 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h @@ -27,12 +27,14 @@ class DecayedAdagradOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h index 8f812c9a03..bbf34d8316 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.h +++ b/paddle/fluid/operators/optimizers/ftrl_op.h @@ -32,12 +32,14 @@ class FTRLOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto* param_out = ctx.Output("ParamOut"); auto* sq_accum_out = ctx.Output("SquaredAccumOut"); diff --git a/paddle/fluid/operators/optimizers/momentum_op.cu b/paddle/fluid/operators/optimizers/momentum_op.cu index 8ce739de8d..7f9e724640 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.cu +++ b/paddle/fluid/operators/optimizers/momentum_op.cu @@ -14,8 +14,11 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/optimizers/momentum_op.h" +#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( momentum, ops::MomentumOpKernel, - ops::MomentumOpKernel); + ops::MomentumOpKernel, + ops::MomentumOpKernel); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index 71f079e4d9..3ed1bff5ff 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -237,7 +237,8 @@ class SparseMomentumFunctor { inline HOSTDEVICE void operator()(size_t i) { auto row_idx = math::BinarySearch(rows_, row_height_, i / row_numel_); - T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0; + T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] + : static_cast(0); // put memory access in register const T p = p_[i]; const T lr = lr_[0]; @@ -282,7 +283,8 @@ class SparseMomentumFunctor { inline HOSTDEVICE void operator()(size_t i) { auto row_idx = math::BinarySearch(rows_, row_height_, i / row_numel_); - T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0; + T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] + : static_cast(0); // put memory access in register const T p = p_[i]; const T lr = lr_[0]; @@ -393,7 +395,7 @@ class MomentumOpKernel : public framework::OpKernel { PADDLE_THROW( string::Sprintf("MomentumOp only supports LoDTensor or SelectedRows " "gradient, but the received Variable Type is %s", - grad_var->Type().name())); + framework::ToTypeName(grad_var->Type()))); } } }; diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index a9d303d55d..975e4b8e72 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -60,7 +60,8 @@ class SGDOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); auto* param = ctx.Input("Param"); auto* param_out = ctx.Output("ParamOut"); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h index 8fceed3558..57d6f4b3ea 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h @@ -52,7 +52,7 @@ class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker { "The maximum length of the sequence. If maxlen < 0, maxlen " "= max(Input(X)).") .SetDefault(-1) - .AddCustomChecker([](int &v) { + .AddCustomChecker([](const int &v) { PADDLE_ENFORCE(v < 0 || v >= 1, "Attr(maxlen) must be less than 0 or larger than 1"); }); diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index 767449cde9..5ede972c71 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -63,7 +63,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { } auto *mask_data = cpu_mask->data(); - std::vector> copy_ranges(mask_dim[0]); + std::vector> copy_ranges(2); // set out_true/out_false lod for (size_t t = 0; t < 2; t++) { diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/sum_mkldnn_op.cc index f9a16ef35e..c39f94637a 100644 --- a/paddle/fluid/operators/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/sum_mkldnn_op.cc @@ -245,7 +245,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { } } else { PADDLE_THROW("Unexpected branch, output variable type is %s", - out_var->Type().name()); + framework::ToTypeName(out_var->Type())); } } }; diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 4f717a4355..01996e6bf9 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -126,7 +126,7 @@ class SumOp : public framework::OperatorWithKernel { PADDLE_THROW("Cannot find the input data type by all input data"); } PADDLE_THROW("Unexpected branch. Input type is %s", - x_vars[0]->Type().name()); + framework::ToTypeName(x_vars[0]->Type())); } }; diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 76cc796a9b..a8b2df186d 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -163,7 +163,7 @@ class SumKernel : public framework::OpKernel { } } else { PADDLE_THROW("Unexpected branch, output variable type is %s", - out_var->Type().name()); + framework::ToTypeName(out_var->Type())); } } }; diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index c17d1afc30..9e77f7252d 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -21,7 +21,7 @@ class TopkOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { + void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of TopkOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), @@ -44,12 +44,25 @@ class TopkOp : public framework::OperatorWithKernel { ctx->ShareLoD("X", "Out"); ctx->ShareLoD("X", "Indices"); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; + framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context(), layout_, library_); + } }; class TopkOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", "(Tensor) The input of Topk op"); + AddInput("K", + "(Tensor) Number of top elements to look for along " + "the last dimension (along each row for matrices).") + .AsDispensable(); AddOutput("Out", "(Tensor) The output tensor of Topk op"); AddOutput("Indices", "(Tensor) The indices of Topk elements of input"); AddComment(R"DOC( diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 0cad224ca8..c27039dd0a 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { @@ -150,7 +151,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, if (k < MaxLength - (*beam)) { topk[k] = topk[k + *beam]; } else { - topk[k].set(-INFINITY, -1); + topk[k].set(-static_cast(INFINITY), -1); } } if (!(*is_empty)) { @@ -160,7 +161,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, } *max = topk[MaxLength - 1]; - if ((*max).v == -1) *is_empty = true; + if ((*max).v == -static_cast(1)) *is_empty = true; *beam = 0; } } @@ -181,7 +182,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, if (k < MaxLength - *beam) { topk[k] = topk[k + *beam]; } else { - topk[k].set(-INFINITY, -1); + topk[k].set(-static_cast(INFINITY), -1); } } if (!(*is_empty)) { @@ -278,7 +279,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, bool firststep = true; for (int j = 0; j < MaxLength; j++) { - topk[j].set(-INFINITY, -1); + topk[j].set(-static_cast(INFINITY), -1); } while (top_num) { ThreadGetTopK( @@ -326,6 +327,17 @@ class TopkOpCUDAKernel : public framework::OpKernel { auto* indices = ctx.Output("Indices"); size_t k = static_cast(ctx.Attr("k")); + auto* k_t = ctx.Input("K"); + if (k_t) { + Tensor k_host; + framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host); + k = k_host.data()[0]; + framework::DDim output_dims = output->dims(); + output_dims[output_dims.size() - 1] = k; + output->Resize(output_dims); + indices->Resize(output_dims); + } + const T* input_data = input->data(); T* output_data = output->mutable_data(ctx.GetPlace()); // FIXME(typhoonzero): data is always converted to type T? @@ -362,5 +374,7 @@ class TopkOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel, - paddle::operators::TopkOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + top_k, paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel); diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h index 76ece57b39..f7bac67300 100644 --- a/paddle/fluid/operators/top_k_op.h +++ b/paddle/fluid/operators/top_k_op.h @@ -37,8 +37,16 @@ class TopkKernel : public framework::OpKernel { auto* input = ctx.Input("X"); auto* output = ctx.Output("Out"); auto* indices = ctx.Output("Indices"); - // k is determined by Attr - const size_t k = static_cast(ctx.Attr("k")); + + size_t k = static_cast(ctx.Attr("k")); + auto* k_t = ctx.Input("K"); + if (k_t) { + k = k_t->data()[0]; + framework::DDim output_dims = output->dims(); + output_dims[output_dims.size() - 1] = k; + output->Resize(output_dims); + indices->Resize(output_dims); + } T* output_data = output->mutable_data(ctx.GetPlace()); int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/platform/create_tensor_with_allocationptr.h b/paddle/fluid/platform/create_tensor_with_allocationptr.h deleted file mode 100644 index 00fcc5f862..0000000000 --- a/paddle/fluid/platform/create_tensor_with_allocationptr.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/temporary_allocator.h" -namespace paddle { -namespace platform { - -template -paddle::framework::Tensor GetTensor( - memory::allocation::AllocationPtr temp_allocation_ptr, - const framework::DDim &dim) { - auto &deleter = temp_allocation_ptr.get_deleter(); - auto *allocation_ptr = temp_allocation_ptr.release(); - auto shared_allocation = - std::shared_ptr(allocation_ptr, deleter); - - PADDLE_ENFORCE(dynamic_cast(allocation_ptr) != nullptr, - "The AllocationPtr must be TemporaryAllocation."); - PADDLE_ENFORCE_EQ(allocation_ptr->size(), - framework::product(dim) * sizeof(T)); - - paddle::framework::Tensor temp_tensor(std::type_index(typeid(T))); - temp_tensor.Resize(dim); - temp_tensor.ResetHolder(std::move(shared_allocation)); - return temp_tensor; -} - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 81c443d758..022afb686b 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -256,10 +256,11 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " << place_.device << ", CUDA Capability: " << compute_capability_ - << ", Driver Version: " << driver_version_ / 1000 + << ", Driver API Version: " << driver_version_ / 1000 << "." << (driver_version_ % 100) / 10 - << ", Runtime Version: " << runtime_version_ / 1000 - << "." << (runtime_version_ % 100) / 10; + << ", Runtime API Version: " + << runtime_version_ / 1000 << "." + << (runtime_version_ % 100) / 10; size_t cudnn_dso_ver = dynload::cudnnGetVersion(); LOG_FIRST_N(WARNING, 1) << "device: " << place_.device << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index af9744dcb8..7e87580189 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -41,7 +41,28 @@ limitations under the License. */ namespace paddle { namespace platform { -/*! \brief device temporary allocator singleton */ +/*! \brief device temporary allocator singleton. + * + * Some operator needs temporary memory during computation, for example, + * conv_gemm, which needs use col to store the result of im2col. If we + * create a stack memory which is used by CUDA Kernel, before the + * Computation(...) returns, we should add ctx->Wait(), because the + * execution of CUDA is async, if there doesn't have ctx->Wait(), + * the temporary memory will be released before the CUDA Kernel uses + * it. + * + * DeviceTemporaryAllocator is a singleton, which contains a + * `TemporaryAllocator` for each . And the TemporaryAllocator + * contains a temp_allocation_queue which is used to store the temporary + * allocations. The allocation, which is allocated by TemporaryAllocator, + * is a unique_ptr, and when it is not held by any variable, it will be + * pushed into the temp_allocation_queue. There are two opportunities to free + * the allocations of temp_allocation_queue: + * - when the Stream calls cudaStreamSynchronize; + * - when the allocation size of opportunities exceeds a certain threshold + * (defined by FLAGS_limit_of_temporary_allocation). + * + * */ class DeviceTemporaryAllocator { public: static DeviceTemporaryAllocator& Instance() { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 01ee67fd07..0668053950 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -140,68 +140,72 @@ struct EOFException : public std::exception { #define LIKELY(condition) (condition) #endif +inline bool is_error(bool stat) { return !stat; } + template inline typename std::enable_if::type throw_on_error( bool stat, const Args&... args) { - if (UNLIKELY(!(stat))) { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(string::Sprintf(args...)); + throw std::runtime_error(string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } } #ifdef PADDLE_WITH_CUDA +inline bool is_error(cudaError_t e) { return UNLIKELY(e); } + template inline typename std::enable_if::type throw_on_error( cudaError_t e, const Args&... args) { - if (UNLIKELY(e)) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(e, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(e, thrust::cuda_category(), + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(curandStatus_t stat) { + return stat != CURAND_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( curandStatus_t stat, const Args&... args) { - if (stat != CURAND_STATUS_SUCCESS) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(cudnnStatus_t stat) { + return stat != CUDNN_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( cudnnStatus_t stat, const Args&... args) { - if (stat == CUDNN_STATUS_SUCCESS) { - return; - } else { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + - string::Sprintf(args...)); + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(cublasStatus_t stat) { + return stat != CUBLAS_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( cublasStatus_t stat, const Args&... args) { std::string err; - if (stat == CUBLAS_STATUS_SUCCESS) { - return; - } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { + if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { err = "CUBLAS: not initialized, "; } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { err = "CUBLAS: alloc failed, "; @@ -254,21 +258,49 @@ inline void throw_on_error(T e) { #define PADDLE_THROW(...) \ throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) +#define __PADDLE_THROW_ERROR_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_; + +#define __THROW_ON_ERROR_ONE_ARG(COND, ARG) \ + ::paddle::platform::throw_on_error(COND, ::paddle::string::Sprintf(ARG)); + +#define __PADDLE_THROW_ON_ERROR(COND, ...) \ + __PADDLE_THROW_ERROR_I( \ + __VA_ARGS__, ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + __THROW_ON_ERROR_ONE_ARG(COND, __VA_ARGS__)) + +#define __PADDLE_UNARY_COMPARE(COND, ...) \ + do { \ + auto __cond = COND; \ + if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ + __PADDLE_THROW_ON_ERROR(__cond, __VA_ARGS__); \ + } \ + } while (0) + #ifndef REPLACE_ENFORCE_GLOG -#define PADDLE_ENFORCE(...) \ +#define __PADDLE_ENFORCE_I(COND, ...) \ do { \ try { \ - ::paddle::platform::throw_on_error(__VA_ARGS__); \ + __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); \ } catch (...) { \ throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ __FILE__, __LINE__); \ } \ - } while (false) + } while (0) #else -#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__); +#define __PADDLE_ENFORCE_I(COND, ...) __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG +#define __PADDLE_ENFORCE(__args) __PADDLE_ENFORCE_I __args +#define PADDLE_ENFORCE(...) __PADDLE_ENFORCE((__VA_ARGS__)) + #define PADDLE_THROW_EOF() \ do { \ throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index d521829655..1091badae5 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -37,6 +37,25 @@ TEST(ENFORCE, FAILED) { HasPrefix(StringPiece(error.what()), "Enforce is not ok 123 at all")); } EXPECT_TRUE(caught_exception); + + caught_exception = false; + try { + PADDLE_ENFORCE(false, "Enforce is not ok at all"); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "Enforce is not ok at all")); + } + EXPECT_TRUE(caught_exception); + + caught_exception = false; + try { + PADDLE_ENFORCE(false); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_NE(std::string(error.what()).find(" at "), 0); + } + EXPECT_TRUE(caught_exception); } TEST(ENFORCE, NO_ARG_OK) { diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc index 27e930e6e0..3a937dfaec 100644 --- a/paddle/fluid/platform/float16_test.cc +++ b/paddle/fluid/platform/float16_test.cc @@ -12,6 +12,7 @@ limitations under the License. */ #include +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/init.h" diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index e2b7ca9b03..b1b51d804e 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -11,6 +11,7 @@ limitations under the License. */ #include "paddle/fluid/platform/float16.h" +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include #include #include diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index cbb090adef..6ce4bf8f13 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -23,6 +23,7 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" #define NCCL_ID_VARNAME "NCCLID" @@ -38,6 +39,8 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { return ncclInt; } else if (type == framework::proto::VarType::INT64) { return ncclInt64; + } else if (type == framework::proto::VarType::FP16) { + return ncclFloat16; } else { PADDLE_THROW("Not supported"); } diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h index 4e32d2d695..812c4a3331 100644 --- a/paddle/fluid/platform/temporary_allocator.h +++ b/paddle/fluid/platform/temporary_allocator.h @@ -29,6 +29,19 @@ class TemporaryAllocation : public memory::allocation::Allocation { memory::allocation::AllocationPtr underlying_allocation_; }; +/*! \brief the TemporaryAllocator is used to alloc the temporary allocation + * which used by CUDA's async operation. + * + * The TemporaryAllocator contains a temp_allocation_queue which + * is used to store the temporary allocations. The allocation, which is + * allocated by TemporaryAllocator, is a unique_ptr, and when it is not held + * by any variable, it will be pushed into the temp_allocation_queue. + * + * There is one opportunity to free the allocations of temp_allocation_queue: + * - when the allocation size of opportunities exceeds a certain threshold + * (defined by FLAGS_limit_of_temporary_allocation). + * + * */ class TemporaryAllocator : public memory::allocation::Allocator { public: explicit TemporaryAllocator(platform::Place place); diff --git a/paddle/fluid/platform/temporary_allocator_test.cc b/paddle/fluid/platform/temporary_allocator_test.cc index 3b940b0e82..e4e5be5b89 100644 --- a/paddle/fluid/platform/temporary_allocator_test.cc +++ b/paddle/fluid/platform/temporary_allocator_test.cc @@ -14,8 +14,7 @@ #include "paddle/fluid/platform/temporary_allocator.h" #include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/create_tensor_with_allocationptr.h" +#include "paddle/fluid/framework/tensor_util.h" DECLARE_double(limit_of_temporary_allocation); namespace paddle { @@ -47,6 +46,7 @@ TEST(temporary_allocator, temporary_allocator) { TEST(temporary_allocator, add_callback) { #ifdef PADDLE_WITH_CUDA + const double limit = FLAGS_limit_of_temporary_allocation; FLAGS_limit_of_temporary_allocation = 10; platform::CUDAPlace gpu_place(0); TemporaryAllocator gpu_alloc(gpu_place); @@ -63,7 +63,7 @@ TEST(temporary_allocator, add_callback) { }); { gpu_alloc.Allocate(100); } PADDLE_ENFORCE(deleted); - FLAGS_limit_of_temporary_allocation = -1; + FLAGS_limit_of_temporary_allocation = limit; #endif } @@ -75,8 +75,8 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) { auto allocation = cpu_alloc.Allocate(memory_size); void* address = allocation->ptr(); int numel = memory_size / sizeof(float); - framework::Tensor tensor = - GetTensor(std::move(allocation), framework::make_ddim({numel})); + framework::Tensor tensor = framework::GetTensor( + std::move(allocation), framework::make_ddim({numel})); PADDLE_ENFORCE_EQ(address, tensor.data()); PADDLE_ENFORCE_EQ(tensor.numel(), numel); } @@ -90,8 +90,8 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) { auto allocation = gpu_alloc.Allocate(memory_size); void* address = allocation->ptr(); int numel = memory_size / sizeof(float); - framework::Tensor tensor = - GetTensor(std::move(allocation), framework::make_ddim({numel})); + framework::Tensor tensor = framework::GetTensor( + std::move(allocation), framework::make_ddim({numel})); PADDLE_ENFORCE_EQ(address, tensor.data()); PADDLE_ENFORCE_EQ(tensor.numel(), numel); } @@ -116,7 +116,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) { { auto allocation = cpu_alloc.Allocate(memory_size); address = allocation->ptr(); - framework::Tensor tensor = GetTensor( + framework::Tensor tensor = framework::GetTensor( std::move(allocation), framework::make_ddim({numel})); PADDLE_ENFORCE_EQ(address, tensor.data()); PADDLE_ENFORCE_EQ(tensor.numel(), numel); @@ -138,7 +138,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) { { auto allocation = gpu_alloc.Allocate(memory_size); address = allocation->ptr(); - framework::Tensor tensor = GetTensor( + framework::Tensor tensor = framework::GetTensor( std::move(allocation), framework::make_ddim({numel})); PADDLE_ENFORCE_EQ(address, tensor.data()); PADDLE_ENFORCE_EQ(tensor.numel(), numel); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index fb8bcb190b..72b0f216d3 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,5 +1,5 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer) +set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer scope_pool) if(WITH_PYTHON) list(APPEND PYBIND_DEPS py_func_op) endif() diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index be63fb8778..819943508b 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/imperative/tracer.h" namespace paddle { @@ -28,9 +27,7 @@ void BindTracer(pybind11::module *m) { framework::BlockDesc *startup_block) { new (&self) imperative::Tracer(root_block, startup_block); }) - .def("trace", &imperative::Tracer::Trace) - .def("get_scope", &imperative::Tracer::GetScope, - pybind11::return_value_policy::reference); + .def("trace", &imperative::Tracer::Trace); } } // namespace pybind diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 88a2a5276a..2ffdc90d84 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -32,6 +32,7 @@ limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope_pool.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/version.h" #include "paddle/fluid/imperative/layer.h" @@ -117,12 +118,13 @@ PYBIND11_MODULE(core, m) { return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj); }); + m.add_object("_cleanup", + py::capsule([]() { ScopePool::Instance().Clear(); })); + py::class_(m, "VarBase", R"DOC()DOC") .def(py::init<>()) .def("_run_backward", - [](imperative::VarBase &self, framework::Scope *scope) { - self.RunBackward(scope); - }) + [](imperative::VarBase &self) { self.RunBackward(); }) .def("_grad", &imperative::VarBase::Grad) .def_property( "desc", @@ -130,7 +132,13 @@ PYBIND11_MODULE(core, m) { [](imperative::VarBase &self, framework::VarDesc *var_desc) { self.var_desc_ = var_desc; }, - py::return_value_policy::reference); + py::return_value_policy::reference) + .def_property("var", + [](const imperative::VarBase &self) { return self.var_; }, + [](imperative::VarBase &self, framework::Variable *var) { + self.var_ = var; + }, + py::return_value_policy::reference); py::class_(m, "OpBase", R"DOC()DOC") .def(py::init<>()) @@ -454,7 +462,7 @@ All parameter, weight, gradient are variables in Paddle. }, py::return_value_policy::copy); - py::class_(m, "Scope", R"DOC( + py::class_(m, "_Scope", R"DOC( Scope is an association of a name to Variable. All variables belong to Scope. Variables in a parent scope can be retrieved from local scope. @@ -474,17 +482,26 @@ All parameter, weight, gradient are variables in Paddle. param.set(param_array, place) )DOC") + .def("_remove_from_pool", + [](Scope &self) { ScopePool::Instance().Remove(&self); }) .def("var", [](Scope &self, const std::string &name) -> Variable * { return self.Var(name); }, py::return_value_policy::reference) .def("find_var", &Scope::FindVar, py::return_value_policy::reference) - .def(py::init<>()) .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); }, py::return_value_policy::reference) .def("drop_kids", &Scope::DropKids); + m.def("Scope", + []() -> Scope * { + auto *s = new Scope(); + ScopePool::Instance().Insert(std::unique_ptr(s)); + return s; + }, + py::return_value_policy::reference); + //! @note: Be careful! PyBind will return std::string as an unicode, not //! Python str. If you want a str object, you should cast them in Python. m.def("get_all_op_protos", []() -> std::vector { diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index a2eec6e3c4..0b94b60018 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -87,7 +87,7 @@ void Fprintf(std::ostream& out, const char* fmt, const Args&... args) { template std::string Sprintf(const Args&... args) { std::ostringstream oss; - Fprintf(oss, ""); + Fprintf(oss, "%s", args...); return oss.str(); } diff --git a/paddle/scripts/installation_validate.py b/paddle/scripts/installation_validate.py new file mode 100644 index 0000000000..f84e2f4b17 --- /dev/null +++ b/paddle/scripts/installation_validate.py @@ -0,0 +1,18 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +import paddle as pd + +print(pd.__version__) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 2e6b40148d..1220f80100 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - #================================================= # Utils #================================================= @@ -79,6 +78,7 @@ function cmake_gen() { PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/bin/python2.7 -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/2.7/include/python2.7 -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/lib/libpython2.7.dylib" + pip install --user -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 fi @@ -91,6 +91,7 @@ function cmake_gen() { -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/ -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib" WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + pip3.5 install --user -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 fi @@ -103,6 +104,7 @@ function cmake_gen() { -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/ -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib" WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + pip3.6 install --user -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 fi @@ -115,6 +117,7 @@ function cmake_gen() { -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/ -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib" WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + pip3.7 install --user -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 fi @@ -414,13 +417,6 @@ EOF else ctest --output-on-failure fi - - # make install should also be test when unittest - make install -j `nproc` - pip install ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl - if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then - paddle version - fi fi } @@ -441,7 +437,9 @@ EOF # make install should also be test when unittest make install -j 8 if [ "$1" == "cp27-cp27m" ]; then + set -e pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl + python ${PADDLE_ROOT}/paddle/scripts/installation_validate.py elif [ "$1" == "cp35-cp35m" ]; then pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl elif [ "$1" == "cp36-cp36m" ]; then @@ -916,6 +914,7 @@ function main() { ;; assert_api) assert_api_not_changed ${PYTHON_ABI:-""} + assert_api_spec_approvals ;; test_inference) gen_capi_package @@ -940,6 +939,15 @@ function main() { run_test assert_api_not_changed ${PYTHON_ABI:-""} ;; + cmake_gen) + cmake_gen ${PYTHON_ABI:-""} + ;; + gen_fluid_lib) + gen_fluid_lib + ;; + test_fluid_lib) + test_fluid_lib + ;; *) print_usage exit 0 diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 8f3660ca38..e0078e5314 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -46,7 +46,7 @@ from . import transpiler from . import distribute_lookup_table from .param_attr import ParamAttr, WeightNormParamAttr from .data_feeder import DataFeeder -from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope +from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope from .transpiler import DistributeTranspiler, \ memory_optimize, release_memory, DistributeTranspilerConfig from .lod_tensor import create_lod_tensor, create_random_int_lodtensor diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 13d2893fd1..af02721eb7 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -44,6 +44,8 @@ class DataToLoDTensorConverter(object): self.dtype = 'int64' elif dtype == core.VarDesc.VarType.FP64: self.dtype = 'float64' + elif dtype == core.VarDesc.VarType.FP16: + self.dtype = 'float16' elif dtype == core.VarDesc.VarType.INT32: self.dtype = 'int32' elif dtype == core.VarDesc.VarType.UINT8: diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index f2886090d7..5a9e908b61 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -191,7 +191,7 @@ def _fetch_var(name, scope=None, return_numpy=True): assert isinstance(name, str) if scope is None: scope = global_scope() - assert isinstance(scope, core.Scope) + assert isinstance(scope, core._Scope) var = scope.find_var(name) assert var is not None, ( diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index de30ed2fc5..823b6d80be 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -15,6 +15,7 @@ from __future__ import print_function import collections +from collections import defaultdict import contextlib import os import re @@ -369,13 +370,11 @@ class Variable(object): self._ivar.desc = self.desc def _numpy(self): - scope = _imperative_tracer().get_scope(self.block.desc) - tensor = core.get_variable_tensor(scope, self.desc.name()) + tensor = self._ivar.var.get_tensor() return np.array(tensor) def _backward(self): - scope = _imperative_tracer().get_scope(self.block.desc) - self._ivar._run_backward(scope) + self._ivar._run_backward() def _gradient(self): return np.array(self._ivar._grad()) @@ -692,20 +691,20 @@ class Operator(object): if _in_imperative_mode(): self.iop = core.OpBase() self.iop.desc = self.desc - self.inputs = [] + self.inputs = defaultdict(list) if inputs is not None: - for inp in inputs.values(): - if isinstance(inp, Variable): - self.inputs.append(inp) - elif isinstance(inp, list) or isinstance(inp, tuple): - self.inputs.extend(inp[:]) - self.outputs = [] + for k, v in six.iteritems(inputs): + if isinstance(v, Variable): + self.inputs[k].append(v._ivar) + elif isinstance(v, list) or isinstance(v, tuple): + self.inputs[k].extend([var._ivar for var in v]) + self.outputs = defaultdict(list) if outputs is not None: - for out in outputs.values(): - if isinstance(out, Variable): - self.outputs.append(out) - elif isinstance(out, list) or isinstance(out, tuple): - self.outputs.extend(out[:]) + for k, v in six.iteritems(outputs): + if isinstance(v, Variable): + self.outputs[k].append(v._ivar) + elif isinstance(v, list) or isinstance(v, tuple): + self.outputs[k].extend([var._ivar for var in v]) def _has_kernel(self, op_type): return op_type not in self.OP_WITHOUT_KERNEL_SET @@ -1273,8 +1272,7 @@ class Block(object): op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) if _in_imperative_mode(): - _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], - [v._ivar for v in op.outputs], self.desc) + _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc) self.ops.append(op) return op @@ -1325,8 +1323,7 @@ class Block(object): op_desc = self.desc._prepend_op() op = Operator(self, op_desc, *args, **kwargs) if _in_imperative_mode(): - _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], - [v._ivar for v in op.outputs], self.desc) + _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc) self.ops.insert(0, op) return op diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py index aa48ef71aa..61e243e288 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/imperative/base.py @@ -46,8 +46,7 @@ def to_variable(value, block=None): name=None, shape=value.shape, dtype=value.dtype) - scope = framework._imperative_tracer().get_scope(block.desc) - var = scope.var(py_var.name) + var = py_var._ivar.var tensor = var.get_tensor() tensor.set(value, core.CPUPlace()) return py_var diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index b37ebbe517..26d1f8f4d2 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -18,6 +18,7 @@ from . import framework import numpy as np import contextlib from .core import VarDesc +from . import unique_name __all__ = [ 'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear', @@ -207,16 +208,39 @@ class UniformInitializer(Initializer): # Initialization Ops should be prepended and not appended if self._seed == 0: self._seed = block.program.random_seed + + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join(['gaussian_random', 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + op = block._prepend_op( type="uniform_random", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ "shape": var.shape, - "dtype": int(var.dtype), + "dtype": out_dtype, "min": self._low, "max": self._high, "seed": self._seed }) + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) + var.op = op return op @@ -261,17 +285,39 @@ class NormalInitializer(Initializer): # Initialization Ops should be prepended and not appended if self._seed == 0: self._seed = block.program.random_seed + + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join(['gaussian_random', 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + op = block._prepend_op( type="gaussian_random", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ "shape": var.shape, - "dtype": int(var.dtype), + "dtype": out_dtype, "mean": self._mean, "std": self._std_dev, "seed": self._seed, "use_mkldnn": False }) + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) var.op = op return op diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 74b4a977db..8543cb847d 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -20,7 +20,7 @@ import six import sys import numpy as np -from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating +from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating, _in_imperative_mode from . import unique_name from paddle.fluid.initializer import Constant, Xavier from paddle.fluid.imperative import base @@ -313,11 +313,22 @@ class LayerHelper(object): param = self._create_weight_normalize(attr, shape, dtype) WeightNormParamAttr.params_with_weight_norm.append(param) return param - - self.startup_program.global_block().create_parameter( - dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True)) - return self.main_program.global_block().create_parameter( - dtype=dtype, shape=shape, **attr._to_kwargs()) + if _in_imperative_mode(): + self.main_program.global_block().create_parameter( + dtype=dtype, shape=shape, **attr._to_kwargs()) + # In imperative mode, we want the returned parameter to be + # initialized so that it can be used imperatively. + return self.startup_program.global_block().create_parameter( + dtype=dtype, + shape=shape, + **attr._to_kwargs(with_initializer=True)) + else: + self.startup_program.global_block().create_parameter( + dtype=dtype, + shape=shape, + **attr._to_kwargs(with_initializer=True)) + return self.main_program.global_block().create_parameter( + dtype=dtype, shape=shape, **attr._to_kwargs()) def get_parameter(self, name): param = self.main_program.global_block().var(name) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index bdfcc8c4e2..cc1fdbd285 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2801,6 +2801,10 @@ def batch_norm(input, helper = LayerHelper('batch_norm', **locals()) dtype = helper.input_dtype() + # use fp32 for bn parameter + if dtype == core.VarDesc.VarType.FP16: + dtype = core.VarDesc.VarType.FP32 + input_shape = input.shape if data_layout == 'NCHW': channel_num = input_shape[1] @@ -2835,7 +2839,7 @@ def batch_norm(input, trainable=False, do_model_average=do_model_average_for_mean_and_var), shape=param_shape, - dtype=input.dtype) + dtype=dtype) mean.stop_gradient = True variance = helper.create_parameter( @@ -2845,7 +2849,7 @@ def batch_norm(input, trainable=False, do_model_average=do_model_average_for_mean_and_var), shape=param_shape, - dtype=input.dtype) + dtype=dtype) variance.stop_gradient = True # create output @@ -4526,7 +4530,7 @@ def topk(input, k, name=None): Args: input(Variable): The input variable which can be a vector or Tensor with higher rank. - k(int): The number of top elements to look for along the last dimension + k(int | Variable): The number of top elements to look for along the last dimension of input. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -4549,12 +4553,18 @@ def topk(input, k, name=None): helper = LayerHelper("top_k", **locals()) values = helper.create_variable_for_type_inference(dtype=input.dtype) indices = helper.create_variable_for_type_inference(dtype="int64") + inputs = {"X": [input]} + attrs = None + if isinstance(k, Variable): + inputs['K'] = k + else: + attrs = {'k': k} helper.append_op( type="top_k", - inputs={"X": [input]}, + inputs=inputs, outputs={"Out": [values], "Indices": [indices]}, - attrs={"k": k}) + attrs=attrs) values.stop_gradient = True indices.stop_gradient = True return values, indices @@ -7943,7 +7953,7 @@ def unstack(x, axis=0, num=None): num = x.shape[axis] outs = [] - for _ in num: + for _ in range(num): outs.append(helper.create_variable_for_type_inference(x.dtype)) helper.append_op( diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 74cf76da95..c97a93ec36 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -148,7 +148,7 @@ class ParallelExecutor(object): trainers_endpoints), "num_trainers == len(end_points)" build_strategy.trainers_endpoints = trainers_endpoints - # step5: get persistable_vars, parameter_vars, places. persistable_vars + # step6: get persistable_vars, places. persistable_vars # need be broadcast to other local_scope. persistable_vars = set([ cpt.to_text(v.name) for v in [ @@ -164,7 +164,7 @@ class ParallelExecutor(object): places = list(map(place_obj, self._places)) - # step6: init ParallelExecutor + # step7: init ParallelExecutor self.executor = core.ParallelExecutor( places, persistable_vars, main.desc, cpt.to_text(loss_name) diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py index d744a00242..e87c1d58c8 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py @@ -185,8 +185,10 @@ def main(use_cuda, parallel): if __name__ == '__main__': - for use_cuda in (False, True): - for parallel in (False, True): - if use_cuda and not core.is_compiled_with_cuda(): - continue - main(use_cuda=use_cuda, parallel=parallel) + on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0'))) + if not on_ci: + for use_cuda in (False, True): + for parallel in (False, True): + if use_cuda and not core.is_compiled_with_cuda(): + continue + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 76a707efdc..0fe836683b 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -368,6 +368,8 @@ class OpTest(unittest.TestCase): place = core.CUDAPlace(0) if core.is_float16_supported(place): return [place] + else: + return [] else: return [] places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_accuracy_op.py b/python/paddle/fluid/tests/unittests/test_accuracy_op.py index 1b2b53f2d4..5257b0be6f 100644 --- a/python/paddle/fluid/tests/unittests/test_accuracy_op.py +++ b/python/paddle/fluid/tests/unittests/test_accuracy_op.py @@ -22,8 +22,10 @@ from op_test import OpTest class TestAccuracyOp(OpTest): def setUp(self): self.op_type = "accuracy" + self.dtype = np.float32 + self.init_dtype() n = 8192 - infer = np.random.random((n, 1)).astype("float32") + infer = np.random.random((n, 1)).astype(self.dtype) indices = np.random.randint(0, 2, (n, 1)) label = np.random.randint(0, 2, (n, 1)) self.inputs = {'Out': infer, 'Indices': indices, "Label": label} @@ -34,14 +36,25 @@ class TestAccuracyOp(OpTest): num_correct += 1 break self.outputs = { - 'Accuracy': np.array([num_correct / float(n)]).astype("float32"), + 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype), 'Correct': np.array([num_correct]).astype("int32"), 'Total': np.array([n]).astype("int32") } + def init_dtype(self): + pass + def test_check_output(self): self.check_output() +class TestAccuracyOpFp16(TestAccuracyOp): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-3) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index c2a4e5ca0c..28602d3251 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -15,6 +15,18 @@ from __future__ import print_function import unittest from test_dist_base import TestDistBase +import os + + +def skip_ci(func): + on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0'))) + + def __func__(*args, **kwargs): + if on_ci: + return + return func(*args, **kwargs) + + return __func__ class TestDistSeResneXt2x2(TestDistBase): @@ -22,6 +34,7 @@ class TestDistSeResneXt2x2(TestDistBase): self._sync_mode = True self._use_reader_alloc = False + @skip_ci def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=1e-7) @@ -32,6 +45,7 @@ class TestDistseResnXt2x2WithMemopt(TestDistBase): self._mem_opt = True self._use_reader_alloc = False + @skip_ci def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=1e-7) @@ -41,6 +55,7 @@ class TestDistSeResneXt2x2Async(TestDistBase): self._sync_mode = False self._use_reader_alloc = False + @skip_ci def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=100) diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index cadaf1df53..15d4db590e 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -21,14 +21,16 @@ from op_test import OpTest class ElementwiseDivOp(OpTest): def setUp(self): self.op_type = "elementwise_div" + self.dtype = np.float32 + self.init_dtype() """ Warning CPU gradient check error! 'X': np.random.random((32,84)).astype("float32"), 'Y': np.random.random((32,84)).astype("float32") """ self.inputs = { - 'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"), - 'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32") + 'X': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype), + 'Y': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) } self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} @@ -46,6 +48,9 @@ class ElementwiseDivOp(OpTest): self.check_grad( ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y')) + def init_dtype(self): + pass + class TestElementwiseDivOp_scalar(ElementwiseDivOp): def setUp(self): @@ -126,5 +131,21 @@ class TestElementwiseDivOp_broadcast_3(ElementwiseDivOp): } +class TestElementwiseDivOpFp16(ElementwiseDivOp): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=1) + + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=1, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=1, no_grad_set=set('Y')) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py index 57ba34f833..0484099188 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py @@ -135,5 +135,10 @@ class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp): } +class TestElementwiseMulOpFp16(ElementwiseMulOp): + def init_dtype(self): + self.dtype = np.float16 + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py index eec73d0beb..20f1a110c3 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py @@ -22,12 +22,22 @@ from op_test import OpTest class TestFillZerosLikeOp(OpTest): def setUp(self): self.op_type = "fill_zeros_like" - self.inputs = {'X': np.random.random((219, 232)).astype("float32")} + self.dtype = np.float32 + self.init_dtype() + self.inputs = {'X': np.random.random((219, 232)).astype(self.dtype)} self.outputs = {'Out': np.zeros_like(self.inputs["X"])} + def init_dtype(self): + pass + def test_check_output(self): self.check_output() +class TestFillZerosLikeOpFp16(TestFillZerosLikeOp): + def init_dtype(self): + self.dtype = np.float16 + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 0fe69d1bd4..6b6ab227de 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -38,7 +38,9 @@ class MyLayer(fluid.imperative.PyLayer): def forward(self, inputs): x = fluid.layers.relu(inputs[0]) self._x_for_debug = x - return [fluid.layers.elementwise_mul(x, x)] + x = fluid.layers.elementwise_mul(x, x) + x = fluid.layers.reduce_sum(x) + return [x] class MLP(fluid.imperative.PyLayer): diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index cf4346cf2e..77ec6f9b6b 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py @@ -24,11 +24,13 @@ from op_test import OpTest class TestMomentumOp1(OpTest): def setUp(self): self.op_type = "momentum" + self.dtype = np.float32 + self.init_dtype() - param = np.random.random((123, 321)).astype("float32") - grad = np.random.random((123, 321)).astype("float32") - velocity = np.zeros((123, 321)).astype("float32") - learning_rate = np.array([0.001]).astype("float32") + param = np.random.random((123, 321)).astype(self.dtype) + grad = np.random.random((123, 321)).astype(self.dtype) + velocity = np.zeros((123, 321)).astype(self.dtype) + learning_rate = np.array([0.001]).astype(self.dtype) mu = 0.0001 use_nesterov = False @@ -50,10 +52,21 @@ class TestMomentumOp1(OpTest): self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out} + def init_dtype(self): + pass + def test_check_output(self): self.check_output() +class TestMomentumOpFp16(TestMomentumOp1): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-3) + + class TestMomentumOp2(OpTest): '''Test Momentum with default values for attributes ''' diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py index 943ad3ed22..655378f7f8 100644 --- a/python/paddle/fluid/tests/unittests/test_py_func_op.py +++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py @@ -26,7 +26,7 @@ os.environ['CPU_NUM'] = str(dev_cnt) def dummy_func_with_no_input(): - return float(1.0) + return np.array([0], dtype='float32') def dummy_func_with_no_output(x): @@ -105,7 +105,7 @@ def simple_fc_net(img, label, use_py_func_op): name='test_tmp_var', dtype='float32', shape=[1]) fluid.layers.py_func( func=dummy_func_with_no_input, x=None, out=dummy_var) - + loss += dummy_var fluid.layers.py_func(func=dummy_func_with_no_output, x=loss, out=None) loss = fluid.layers.mean(loss) @@ -174,7 +174,7 @@ class TestPyFuncOpUseExecutor(unittest.TestCase): self.assertAlmostEqual(max_diff, 0, delta=1e-3) -class TestPyFuncOpUseParallelExecutor(unittest.TestCase): +class TestPyFuncOpUseParallelExecutor(TestPyFuncOpUseExecutor): def setUp(self): self.use_parallel_executor = True diff --git a/python/paddle/fluid/tests/unittests/test_top_k_op.py b/python/paddle/fluid/tests/unittests/test_top_k_op.py index 69b29db83a..9fbf59ed66 100644 --- a/python/paddle/fluid/tests/unittests/test_top_k_op.py +++ b/python/paddle/fluid/tests/unittests/test_top_k_op.py @@ -21,15 +21,22 @@ from op_test import OpTest class TestTopkOp(OpTest): def setUp(self): + self.variable_k = False self.set_args() self.op_type = "top_k" + self.dtype = np.float32 + self.init_dtype() + k = self.top_k - input = np.random.random((self.row, k)).astype("float32") + input = np.random.random((self.row, k)).astype(self.dtype) output = np.ndarray((self.row, k)) indices = np.ndarray((self.row, k)).astype("int64") - self.inputs = {'X': input} - self.attrs = {'k': k} + + if self.variable_k: + self.inputs['K'] = np.array([k]).astype("int32") + else: + self.attrs = {'k': k} for rowid in range(self.row): row = input[rowid] @@ -38,6 +45,9 @@ class TestTopkOp(OpTest): self.outputs = {'Out': output, 'Indices': indices} + def init_dtype(self): + pass + def set_args(self): self.row = 32 self.top_k = 1 @@ -46,6 +56,11 @@ class TestTopkOp(OpTest): self.check_output() +class TestTopkOpFp16(TestTopkOp): + def init_dtype(self): + self.dtype = np.float16 + + class TestTopkOp3d(OpTest): def setUp(self): self.op_type = "top_k" @@ -107,5 +122,12 @@ class TestTopkOp4(TestTopkOp): self.top_k = 1 +class TestTopkOp5(TestTopkOp): + def set_args(self): + self.row = 40000 + self.top_k = 3 + self.variable_k = True + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_weight_decay.py b/python/paddle/fluid/tests/unittests/test_weight_decay.py new file mode 100644 index 0000000000..f37d2bfb2e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_weight_decay.py @@ -0,0 +1,188 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import contextlib + +import unittest +from functools import partial +import numpy as np +import paddle +import paddle.fluid.core as core + +import paddle.fluid as fluid + + +def get_places(): + places = [] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + +@contextlib.contextmanager +def prog_scope_guard(main_prog, startup_prog): + scope = fluid.core.Scope() + with fluid.unique_name.guard(): + with fluid.scope_guard(scope): + with fluid.program_guard(main_prog, startup_prog): + yield + + +def bow_net(data, + label, + dict_dim, + is_sparse=False, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + BOW net + This model is from https://github.com/PaddlePaddle/models: + fluid/PaddleNLP/text_classification/nets.py + """ + emb = fluid.layers.embedding( + input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bow_tanh = fluid.layers.tanh(bow) + fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + return avg_cost + + +class TestWeightDecay(unittest.TestCase): + def setUp(self): + self.word_dict = paddle.dataset.imdb.word_dict() + reader = paddle.batch( + paddle.dataset.imdb.train(self.word_dict), batch_size=4)() + self.train_data = [next(reader) for _ in range(5)] + self.learning_rate = .5 + + def run_executor(self, place, feed_list, loss): + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=feed_list, place=place) + exe.run(fluid.default_startup_program()) + main_prog = fluid.default_main_program() + loss_set = [] + for data in self.train_data: + out = exe.run(main_prog, + feed=feeder.feed(data), + fetch_list=[loss.name]) + + print("loss %s" % (np.average(out))) + loss_set.append(np.average(out)) + + return loss_set + + def run_parallel_exe(self, + place, + feed_list, + loss, + use_cuda=True, + use_reduce=False, + use_fast_executor=False, + use_ir_memory_optimize=False): + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=feed_list, place=place) + exe.run(fluid.default_startup_program()) + + exec_strategy = fluid.ExecutionStrategy() + if use_fast_executor: + exec_strategy.use_experimental_executor = True + + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ + if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce + build_strategy.memory_optimize = use_ir_memory_optimize + + parallel_exe = fluid.ParallelExecutor( + use_cuda, + loss_name=loss.name, + exec_strategy=exec_strategy, + build_strategy=build_strategy) + + loss_set = [] + for data in self.train_data: + out = parallel_exe.run(feed=feeder.feed(data), + fetch_list=[loss.name]) + print("loss %s" % (np.average(out))) + loss_set.append(np.average(out)) + + return loss_set + + def check_weight_decay(self, + place, + model, + use_parallel_exe=False, + use_reduce=False): + main_prog = fluid.framework.Program() + startup_prog = fluid.framework.Program() + startup_prog.random_seed = 1 + with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog): + + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost = model(data, label, len(self.word_dict)) + + param_list = [(var, var * self.learning_rate) + for var in main_prog.block(0).all_parameters()] + + optimizer = fluid.optimizer.Adagrad( + learning_rate=self.learning_rate) + + optimizer.minimize(avg_cost) + + for params in param_list: + updated_p = fluid.layers.elementwise_sub( + x=params[0], y=params[1]) + fluid.layers.assign(input=updated_p, output=params[0]) + + if use_parallel_exe: + loss = self.run_parallel_exe( + place, [data, label], + loss=avg_cost, + use_cuda=True, + use_reduce=use_reduce) + else: + loss = self.run_executor(place, [data, label], loss=avg_cost) + + return loss + + def test_weight_decay(self): + model = partial(bow_net, is_sparse=False) + for place in get_places(): + loss = self.check_weight_decay(place, model, use_parallel_exe=False) + + loss2 = self.check_weight_decay( + place, model, use_parallel_exe=True, use_reduce=False) + + for i in range(len(loss)): + assert np.isclose(a=loss[i], b=loss2[i], rtol=5e-5) + + loss3 = self.check_weight_decay( + place, model, use_parallel_exe=True, use_reduce=True) + + for i in range(len(loss)): + assert np.isclose(a=loss[i], b=loss3[i], rtol=5e-5) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index ccf7af334d..cc7f5ec90c 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -57,7 +57,7 @@ class InferenceTranspiler(object): raise TypeError("place should be as CPUPlace/CUDAPlace type") if scope is None: scope = global_scope() - if not isinstance(scope, core.Scope): + if not isinstance(scope, core._Scope): raise TypeError("scope should be as Scope type or None") use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))