Merge remote-tracking branch 'ups/develop' into pool-int8

recover_files
xiaoli.liu@intel.com 6 years ago
commit 086f892225

@ -94,52 +94,52 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
# specify sphinx version as 1.5.6 and remove -U option for [pip install -U # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
# version(1.7.1 for now), which causes building documentation failed. # version(1.7.1 for now), which causes building documentation failed.
RUN pip3 install -U wheel && \ RUN pip3 --no-cache-dir install -U wheel && \
pip3 install -U docopt PyYAML sphinx==1.5.6 && \ pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
pip3.6 install -U wheel && \ pip3.6 --no-cache-dir install -U wheel && \
pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \ pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \ pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
pip3.7 install -U wheel && \ pip3.7 --no-cache-dir install -U wheel && \
pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \ pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \ pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
easy_install -U pip && \ easy_install -U pip && \
pip install -U pip setuptools wheel && \ pip --no-cache-dir install -U pip setuptools wheel && \
pip install -U docopt PyYAML sphinx==1.5.6 && \ pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
pip install sphinx-rtd-theme==0.1.9 recommonmark pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark
RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3 install opencv-python && \ pip3 --no-cache-dir install opencv-python && \
pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3.6 install opencv-python && \ pip3.6 --no-cache-dir install opencv-python && \
pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3.7 install opencv-python && \ pip3.7 --no-cache-dir install opencv-python && \
pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip install opencv-python pip --no-cache-dir install opencv-python
#For docstring checker #For docstring checker
RUN pip3 install pylint pytest astroid isort RUN pip3 --no-cache-dir install pylint pytest astroid isort
RUN pip3.6 install pylint pytest astroid isort RUN pip3.6 --no-cache-dir install pylint pytest astroid isort
RUN pip3.7 install pylint pytest astroid isort RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
RUN pip install pylint pytest astroid isort LinkChecker RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker
COPY ./python/requirements.txt /root/ COPY ./python/requirements.txt /root/
RUN pip3 install -r /root/requirements.txt RUN pip3 --no-cache-dir install -r /root/requirements.txt
RUN pip3.6 install -r /root/requirements.txt RUN pip3.6 --no-cache-dir install -r /root/requirements.txt
RUN pip3.7 install -r /root/requirements.txt RUN pip3.7 --no-cache-dir install -r /root/requirements.txt
RUN pip install -r /root/requirements.txt RUN pip --no-cache-dir install -r /root/requirements.txt
# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
RUN apt-get install -y libssl-dev libffi-dev RUN apt-get install -y libssl-dev libffi-dev && apt-get clean -y
RUN pip3 install certifi urllib3[secure] RUN pip3 --no-cache-dir install certifi urllib3[secure]
RUN pip3.6 install certifi urllib3[secure] RUN pip3.6 --no-cache-dir install certifi urllib3[secure]
RUN pip3.7 install certifi urllib3[secure] RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
RUN pip install certifi urllib3[secure] RUN pip --no-cache-dir install certifi urllib3[secure]
# Install woboq_codebrowser to /woboq # Install woboq_codebrowser to /woboq

@ -106,10 +106,10 @@ else(WIN32)
SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB} ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
DEPENDS mkldnn) DEPENDS mkldnn shared_mkldnn)
endif(WIN32) endif(WIN32)
ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB}) ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})
ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn)
IF(WITH_C_API) IF(WITH_C_API)
INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib) INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib)
ENDIF() ENDIF()

@ -136,7 +136,7 @@ if (WITH_MKLDNN)
copy(mkldnn_lib copy(mkldnn_lib
SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib
DEPS mkldnn DEPS mkldnn_shared_lib
) )
endif () endif ()

@ -57,46 +57,43 @@ int main()
return 0; return 0;
}" SSE3_FOUND) }" SSE3_FOUND)
# disable AVX by default on windows # Check AVX
if(NOT WIN32) set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
# Check AVX set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) CHECK_CXX_SOURCE_RUNS("
set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) #include <immintrin.h>
CHECK_CXX_SOURCE_RUNS(" int main()
#include <immintrin.h> {
int main() __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
{ __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
__m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); __m256 result = _mm256_add_ps (a, b);
__m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); return 0;
__m256 result = _mm256_add_ps (a, b); }" AVX_FOUND)
return 0;
}" AVX_FOUND)
# Check AVX 2 # Check AVX 2
set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS(" CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h> #include <immintrin.h>
int main() int main()
{ {
__m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
__m256i result = _mm256_abs_epi32 (a); __m256i result = _mm256_abs_epi32 (a);
return 0; return 0;
}" AVX2_FOUND) }" AVX2_FOUND)
# Check AVX512F # Check AVX512F
set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS(" CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h> #include <immintrin.h>
int main() int main()
{ {
__m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
13, -5, 6, -7, 9, 2, -6, 3); 13, -5, 6, -7, 9, 2, -6, 3);
__m512i result = _mm512_abs_epi32 (a); __m512i result = _mm512_abs_epi32 (a);
return 0; return 0;
}" AVX512F_FOUND) }" AVX512F_FOUND)
endif(NOT WIN32)
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)

@ -60,7 +60,7 @@ class Float16Transpiler:
raise TypeError("place should be as CPUPlace/CUDAPlace type") raise TypeError("place should be as CPUPlace/CUDAPlace type")
if scope is None: if scope is None:
scope = global_scope() scope = global_scope()
if not isinstance(scope, core.Scope): if not isinstance(scope, core._Scope):
raise TypeError("scope should be as Scope type or None") raise TypeError("scope should be as Scope type or None")
self.scope = scope self.scope = scope

@ -464,11 +464,7 @@ paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, ke
paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
paddle.fluid.Scope.__init__ __init__(self: paddle.fluid.core.Scope) -> None paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope
paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None
paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope
paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None) paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None)
paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None) paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None)
paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None) paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None)

@ -7,27 +7,17 @@ function(windows_symbolic TARGET)
cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH}) set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
foreach(src ${windows_symbolic_SRCS}) foreach(src ${windows_symbolic_SRCS})
get_filename_component(src ${src} NAME_WE) get_filename_component(src ${src} NAME_WE)
if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu) if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu)
message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.")
endif() endif()
#only copy the xx.cu to.xx.cu when the content are modified file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc)
set(copy_flag 1)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu) add_custom_command(OUTPUT ${final_path}/.${src}.cu
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR) COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu"
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu TARGET_STR) COMMENT "create hidden file of ${src}.cu")
if (SOURCE_STR STREQUAL TARGET_STR) add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
set(copy_flag 0)
endif()
endif()
if (copy_flag)
add_custom_command(OUTPUT .${src}.cu
COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu"
COMMENT "create hidden file of ${src}.cu")
endif(copy_flag)
add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
endforeach() endforeach()
endfunction() endfunction()
@ -48,10 +38,10 @@ if(WITH_GPU)
nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context) nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
add_dependencies(tensor tensor_util) add_dependencies(tensor tensor_util)
else() else()
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context) nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context )
endif(WIN32) endif(WIN32)
else() else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context) cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context )
endif() endif()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
@ -78,17 +68,23 @@ cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memor
cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
cc_test(reader_test SRCS reader_test.cc DEPS reader) cc_test(reader_test SRCS reader_test.cc DEPS reader)
cc_test(variable_test SRCS variable_test.cc)
cc_library(threadpool SRCS threadpool.cc DEPS enforce) cc_library(threadpool SRCS threadpool.cc DEPS enforce)
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
cc_library(scope SRCS scope.cc DEPS glog threadpool) cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto)
if (WITH_GPU)
target_link_libraries(var_type_traits dynload_cuda)
endif()
cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits)
cc_library(scope SRCS scope.cc DEPS glog threadpool var_type_traits)
cc_library(scope_pool SRCS scope_pool.cc DEPS scope)
cc_test(scope_test SRCS scope_test.cc DEPS scope) cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits)
cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
nv_test(data_device_transform_test SRCS data_device_transform_test.cu nv_test(data_device_transform_test SRCS data_device_transform_test.cu
DEPS operator op_registry device_context math_function) DEPS operator op_registry device_context math_function scope)
if(WITH_GPU) if(WITH_GPU)
if (WIN32) if (WIN32)

@ -165,7 +165,7 @@ template <typename T>
class GreaterThanChecker { class GreaterThanChecker {
public: public:
explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
void operator()(T& value) const { void operator()(const T& value) const {
PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails."); PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails.");
} }
@ -177,7 +177,7 @@ template <typename T>
class EqualGreaterThanChecker { class EqualGreaterThanChecker {
public: public:
explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
void operator()(T& value) const { void operator()(const T& value) const {
PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails."); PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails.");
} }
@ -193,7 +193,7 @@ class DefaultValueSetter {
public: public:
explicit DefaultValueSetter(T default_value) explicit DefaultValueSetter(T default_value)
: default_value_(default_value) {} : default_value_(default_value) {}
void operator()(T& value) const { value = default_value_; } // NOLINT void operator()(T* value) const { *value = default_value_; }
private: private:
T default_value_; T default_value_;
@ -203,7 +203,7 @@ template <typename T>
class EnumInContainer { class EnumInContainer {
public: public:
explicit EnumInContainer(const std::unordered_set<T>& c) : container_(c) {} explicit EnumInContainer(const std::unordered_set<T>& c) : container_(c) {}
void operator()(T& val) const { void operator()(const T& val) const {
PADDLE_ENFORCE(container_.find(val) != container_.end(), PADDLE_ENFORCE(container_.find(val) != container_.end(),
"Value %s is not in enum container %s", val, "Value %s is not in enum container %s", val,
ContainerDebugString()); ContainerDebugString());
@ -232,7 +232,8 @@ class EnumInContainer {
// an attribute can have more than one limits // an attribute can have more than one limits
template <typename T> template <typename T>
class TypedAttrChecker { class TypedAttrChecker {
typedef std::function<void(T&)> ValueChecker; typedef std::function<void(T*)> DefaultValueChecker;
typedef std::function<void(const T&)> ValueChecker;
public: public:
explicit TypedAttrChecker(const std::string& attr_name) explicit TypedAttrChecker(const std::string& attr_name)
@ -268,17 +269,17 @@ class TypedAttrChecker {
return *this; return *this;
} }
void operator()(AttributeMap& attr_map) const { // NOLINT void operator()(AttributeMap* attr_map) const {
if (!attr_map.count(attr_name_)) { if (!attr_map->count(attr_name_)) {
// user do not set this attr // user do not set this attr
PADDLE_ENFORCE(!default_value_setter_.empty(), PADDLE_ENFORCE(!default_value_setter_.empty(),
"Attribute '%s' is required!", attr_name_); "Attribute '%s' is required!", attr_name_);
// default_value_setter_ has no more than one element // default_value_setter_ has no more than one element
T val; T val;
(default_value_setter_[0])(val); (default_value_setter_[0])(&val);
attr_map[attr_name_] = val; (*attr_map)[attr_name_] = val;
} }
Attribute& attr = attr_map.at(attr_name_); Attribute& attr = attr_map->at(attr_name_);
ExtractAttribute<T> extract_attr(attr_name_); ExtractAttribute<T> extract_attr(attr_name_);
T* attr_value = extract_attr(attr); T* attr_value = extract_attr(attr);
for (const auto& checker : value_checkers_) { for (const auto& checker : value_checkers_) {
@ -289,12 +290,12 @@ class TypedAttrChecker {
private: private:
std::string attr_name_; std::string attr_name_;
std::vector<ValueChecker> value_checkers_; std::vector<ValueChecker> value_checkers_;
std::vector<ValueChecker> default_value_setter_; std::vector<DefaultValueChecker> default_value_setter_;
}; };
// check whether op's all attributes fit their own limits // check whether op's all attributes fit their own limits
class OpAttrChecker { class OpAttrChecker {
typedef std::function<void(AttributeMap&)> AttrChecker; typedef std::function<void(AttributeMap*)> AttrChecker;
public: public:
template <typename T> template <typename T>
@ -304,7 +305,7 @@ class OpAttrChecker {
return *(checker.target<TypedAttrChecker<T>>()); return *(checker.target<TypedAttrChecker<T>>());
} }
void Check(AttributeMap& attr_map) const { // NOLINT void Check(AttributeMap* attr_map) const {
for (const auto& checker : attr_checkers_) { for (const auto& checker : attr_checkers_) {
checker(attr_map); checker(attr_map);
} }

@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"

@ -88,7 +88,7 @@ void EagerDeletionOpHandle::RunImpl() {
} }
} else { } else {
PADDLE_THROW("Type %s of %s is not supported eager deletion", PADDLE_THROW("Type %s of %s is not supported eager deletion",
var->Type().name(), name); framework::ToTypeName(var->Type()), name);
} }
} }

@ -120,6 +120,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
ClearFetchOp(graph_.get(), &fetch_ops); ClearFetchOp(graph_.get(), &fetch_ops);
return fetches; return fetches;
} }
void FastThreadedSSAGraphExecutor::RunOpAsync( void FastThreadedSSAGraphExecutor::RunOpAsync(
std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps, std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
OpHandleBase *op, OpHandleBase *op,

File diff suppressed because it is too large Load Diff

@ -45,7 +45,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
#endif #endif
int GetVarDeviceID( int GetVarDeviceID(
const ir::Graph &graph, const std::string &varname, const std::string &varname,
const std::unordered_map<std::string, int> &sharded_var_device) const; const std::unordered_map<std::string, int> &sharded_var_device) const;
bool IsScaleLossOp(ir::Node *node) const; bool IsScaleLossOp(ir::Node *node) const;
@ -57,18 +57,13 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
ir::Graph *result, ir::Node *node, ir::Graph *result, ir::Node *node,
std::unordered_map<std::string, int> *sharded_var_device) const; std::unordered_map<std::string, int> *sharded_var_device) const;
std::vector<std::string> FindDistTrainSendVars(
const std::vector<ir::Node *> &nodes) const;
std::vector<std::string> FindDistTrainRecvVars(
const std::vector<ir::Node *> &nodes) const;
void CreateComputationalOps(ir::Graph *result, ir::Node *node, void CreateComputationalOps(ir::Graph *result, ir::Node *node,
size_t num_places) const; size_t num_places) const;
void CreateScaleLossGradOp(ir::Graph *result, void CreateScaleLossGradOp(ir::Graph *result,
const std::string &loss_grad_name, const std::string &loss_grad_name,
ir::Node *out_var_node) const; ir::Node *out_var_node,
proto::VarType::Type dtype) const;
VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
int dst_dev_id) const; int dst_dev_id) const;
@ -76,7 +71,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
int dev_id) const; int dev_id) const;
int GetOpDeviceID( int GetOpDeviceID(
const ir::Graph &graph, ir::Node *node, ir::Node *node,
const std::unordered_map<std::string, int> &sharded_var_device) const; const std::unordered_map<std::string, int> &sharded_var_device) const;
void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
@ -99,6 +94,15 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
void SetCommunicationContext(OpHandleBase *op_handle, void SetCommunicationContext(OpHandleBase *op_handle,
const platform::Place &p) const; const platform::Place &p) const;
std::vector<ir::Node *> SortForReduceMode(
const std::vector<ir::Node *> &) const;
int GetOpDeviceID(
ir::Node *node,
const std::unordered_map<std::string, int> &shared_var_device,
std::unordered_map<std::string, std::vector<ir::Node *>> *delay_ops)
const;
mutable std::string loss_var_name_; mutable std::string loss_var_name_;
mutable std::vector<platform::Place> places_; mutable std::vector<platform::Place> places_;
mutable std::vector<Scope *> local_scopes_; mutable std::vector<Scope *> local_scopes_;

@ -22,39 +22,66 @@ namespace details {
ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
Scope *scope, Scope *scope,
platform::Place place, platform::Place place,
platform::DeviceContext *dev_ctx) platform::DeviceContext *dev_ctx,
proto::VarType::Type dtype)
: OpHandleBase(node), : OpHandleBase(node),
coeff_(static_cast<float>(1.0 / num_dev)), coeff_(static_cast<float>(1.0 / num_dev)),
scope_(scope), scope_(scope),
place_(place) { place_(place),
out_dtype_(dtype) {
this->SetDeviceContext(place_, dev_ctx); this->SetDeviceContext(place_, dev_ctx);
} }
ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {} ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
struct ScaleLossGradFunctor {
float coeff_;
Tensor *out_;
platform::Place place_;
OpHandleBase *op_handle_;
proto::VarType::Type out_dtype_;
platform::DeviceContext *ctx_;
ScaleLossGradFunctor(float coeff, Tensor *out, platform::Place place,
OpHandleBase *op_handle, proto::VarType::Type dtype,
platform::DeviceContext *ctx)
: coeff_(coeff), out_(out), place_(place), out_dtype_(dtype), ctx_(ctx) {}
template <typename OutT>
void apply() const {
auto *out_data = out_->mutable_data<OutT>(place_);
if (platform::is_cpu_place(place_)) {
*out_data = static_cast<OutT>(coeff_);
} else {
#ifdef PADDLE_WITH_CUDA
OutT cast_coeff = static_cast<OutT>(coeff_);
auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
memory::Copy(boost::get<platform::CUDAPlace>(place_), out_data,
platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_),
stream);
VLOG(10) << place_ << "RUN Scale loss grad op";
#endif
}
}
};
void ScaleLossGradOpHandle::RunImpl() { void ScaleLossGradOpHandle::RunImpl() {
// Doesn't wait any event // Doesn't wait any event
std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_; std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(); auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
float *tmp = local_scope.FindVar(var_name) auto *tensor = local_scope.FindVar(var_name)->GetMutable<LoDTensor>();
->GetMutable<LoDTensor>() tensor->Resize(make_ddim({1}));
->mutable_data<float>(make_ddim({1}), place_);
if (platform::is_cpu_place(place_)) {
*tmp = coeff_;
} else {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
this->RunAndRecordEvent([&] { ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_,
auto stream = static_cast<platform::CUDADeviceContext *>( this->dev_ctxes_.at(place_));
this->dev_ctxes_.at(place_)) this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
->stream(); #else
memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp, ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, nullptr);
platform::CPUPlace(), &coeff_, sizeof(float), stream); framework::VisitDataType(out_dtype_, func);
VLOG(10) << place_ << "RUN Scale loss grad op";
});
#endif #endif
}
} }
std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; } std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; }

@ -26,8 +26,8 @@ namespace details {
struct ScaleLossGradOpHandle : public OpHandleBase { struct ScaleLossGradOpHandle : public OpHandleBase {
ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope, ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope,
platform::Place place, platform::Place place, platform::DeviceContext *context,
platform::DeviceContext *context); proto::VarType::Type dtype);
~ScaleLossGradOpHandle() final; ~ScaleLossGradOpHandle() final;
@ -40,6 +40,7 @@ struct ScaleLossGradOpHandle : public OpHandleBase {
float coeff_; float coeff_;
Scope *scope_; Scope *scope_;
platform::Place place_; platform::Place place_;
proto::VarType::Type out_dtype_;
}; };
} // namespace details } // namespace details

@ -24,7 +24,7 @@ static void VisitVariable(Variable* var, Func* func) {
} else if (var->IsType<SelectedRows>()) { } else if (var->IsType<SelectedRows>()) {
(*func)(var->GetMutable<SelectedRows>()); (*func)(var->GetMutable<SelectedRows>());
} else { } else {
PADDLE_THROW("Not supported type %s", var->Type().name()); PADDLE_THROW("Not supported type %s", ToTypeName(var->Type()));
} }
} }
@ -35,7 +35,7 @@ static void VisitVariable(const Variable& var, Func* func) {
} else if (var.IsType<SelectedRows>()) { } else if (var.IsType<SelectedRows>()) {
(*func)(var.Get<SelectedRows>()); (*func)(var.Get<SelectedRows>());
} else { } else {
PADDLE_THROW("Not supported type %s", var.Type().name()); PADDLE_THROW("Not supported type %s", ToTypeName(var.Type()));
} }
} }

@ -119,7 +119,7 @@ static void DeleteUnusedTensors(
} }
} else { } else {
PADDLE_THROW("Type %s of %s is not supported eager deletion", PADDLE_THROW("Type %s of %s is not supported eager deletion",
var->Type().name(), name); framework::ToTypeName(var->Type()), name);
} }
} }
} }

@ -45,6 +45,7 @@ pass_library(is_test_pass base)
pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add_act_fuse_pass inference)
pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference)
pass_library(conv_elementwise_add_fuse_pass inference) pass_library(conv_elementwise_add_fuse_pass inference)
pass_library(conv_affine_channel_fuse_pass inference)
if(WITH_MKLDNN) if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base) pass_library(mkldnn_placement_pass base)
pass_library(depthwise_conv_mkldnn_pass base) pass_library(depthwise_conv_mkldnn_pass base)

@ -0,0 +1,222 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h"
#include <functional>
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
#define GET_CONV_BN_NODES(pattern_name) \
/* OPERATORS */ \
GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \
/* CONV inputs */ \
GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \
/* CONV outputs */ \
GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \
/* Affine Channel inputs */ \
GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \
/* Affine channel outputs */ \
GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */
void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight,
const ir::Node& ac_scale,
const LoDTensor& ac_bias_tensor,
LoDTensor* eltwise_y_in_tensor) {
using EigenVectorArrayMap =
Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
using EigenMatrixArrayMap = Eigen::Map<
Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
// Re-compute bias of conv2d from AffineChannel
PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), ac_bias_tensor.dims());
auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable<LoDTensor>();
ConstEigenVectorArrayMap scale_array(scale_tensor->data<float>(),
scale_tensor->numel(), 1);
ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data<float>(),
ac_bias_tensor.numel(), 1);
EigenVectorArrayMap eltwise_y_in_array(
eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
eltwise_y_in_tensor->numel(), 1);
eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array;
// Re-compute weight of conv2d from AffineChannel
auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
auto weights_shape = weights->dims();
auto weights_shape_2d = flatten_to_2d(weights_shape, 1);
EigenMatrixArrayMap weights_array_2d(
weights->mutable_data<float>(platform::CPUPlace()), weights_shape_2d[0],
weights_shape_2d[1]);
weights_array_2d.colwise() *= scale_array;
}
std::unique_ptr<ir::Graph> ConvAffineChannelFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init(name_scope_, graph.get());
auto* scope = param_scope();
PADDLE_ENFORCE(scope);
GraphPatternDetector gpd;
auto* conv_input =
gpd.mutable_pattern()
->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
->AsInput()
->assert_is_op_input("conv2d", "Input");
patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(),
name_scope_);
conv_ac_pattern(conv_input, false /*with_eltwise_add*/);
int found_conv_ac_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle ConvAffineChannel fuse";
GET_CONV_BN_NODES(conv_ac_pattern);
// check if fuse can be done and if MKL-DNN should be used
FuseOptions fuse_option = FindFuseOption(*conv, *affine_channel);
if (fuse_option == DO_NOT_FUSE) {
VLOG(3) << "do not perform conv+affinechannel fuse";
return;
}
// Create eltwise_y (conv bias) variable
VarDesc eltwise_y_in_desc(
patterns::PDNodeName(name_scope_, "eltwise_y_in"));
eltwise_y_in_desc.SetPersistable(true);
auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc);
auto* eltwise_y_in_tensor =
scope->Var(eltwise_y_in_node->Name())->GetMutable<LoDTensor>();
// Get affine_channel bias
auto* ac_bias_tensor =
scope->FindVar(ac_bias->Name())->GetMutable<LoDTensor>();
// Initialize eltwise_y
eltwise_y_in_tensor->Resize(ac_bias_tensor->dims());
std::fill_n(eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
eltwise_y_in_tensor->numel(), 0.0f);
// update weights and biases
recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor,
eltwise_y_in_tensor);
// create an elementwise add node.
OpDesc desc;
desc.SetInput("X", std::vector<std::string>({conv_out->Name()}));
desc.SetInput("Y", std::vector<std::string>({eltwise_y_in_node->Name()}));
desc.SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
desc.SetType("elementwise_add");
desc.SetAttr("axis", 1);
auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied.
GraphSafeRemoveNodes(graph.get(), {ac_scale, ac_bias, affine_channel});
IR_NODE_LINK_TO(conv_out, eltwise_op);
IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op);
IR_NODE_LINK_TO(eltwise_op, ac_out);
found_conv_ac_count++;
};
gpd(graph.get(), handler);
AddStatis(found_conv_ac_count);
return graph;
}
std::unique_ptr<ir::Graph> ConvEltwiseAddAffineChannelFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init(name_scope_, graph.get());
auto* scope = param_scope();
PADDLE_ENFORCE(scope);
GraphPatternDetector gpd;
auto* conv_input =
gpd.mutable_pattern()
->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
->AsInput()
->assert_is_op_input("conv2d", "Input");
patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(),
name_scope_);
conv_ac_pattern(conv_input, true /*with_eltwise_add*/);
int found_conv_ac_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle ConvBN fuse";
GET_CONV_BN_NODES(conv_ac_pattern);
// OPERATORS
GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern);
// BIAS inputs
GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern);
// BIAS outputs
GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern);
// Get eltwise_y (conv bias) variable
auto* eltwise_y_in_tensor =
scope->FindVar(eltwise_y_in->Name())->GetMutable<LoDTensor>();
// Get batch norm bias
auto* ac_bias_tensor =
scope->FindVar(ac_bias->Name())->GetMutable<LoDTensor>();
recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor,
eltwise_y_in_tensor);
// Update the elementwise_add node
eltwise->Op()->SetAttr("axis", 1);
eltwise->Op()->SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
GraphSafeRemoveNodes(graph.get(),
{ac_scale, ac_bias, affine_channel, eltwise_out});
IR_NODE_LINK_TO(eltwise, ac_out);
found_conv_ac_count++;
};
gpd(graph.get(), handler);
AddStatis(found_conv_ac_count);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(conv_affine_channel_fuse_pass,
paddle::framework::ir::ConvAffineChannelFusePass);
REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass,
paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass);

@ -0,0 +1,49 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
/*
* Fuse the Conv and ConvAffineChannel.
*/
class ConvAffineChannelFusePass : public FusePassBase {
public:
virtual ~ConvAffineChannelFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"conv_affine_channel_fuse"};
};
class ConvEltwiseAddAffineChannelFusePass : public FusePassBase {
public:
virtual ~ConvEltwiseAddAffineChannelFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"};
};
} // namespace ir
} // namespace framework
} // namespace paddle

@ -40,18 +40,20 @@ framework::proto::OpDesc PrepareOpDesc(
const std::string& output) { const std::string& output) {
auto proto = base_desc; auto proto = base_desc;
framework::OpDesc desc(proto, nullptr); framework::OpDesc desc(proto, nullptr);
desc.SetType("conv2d_fusion");
desc.SetInput("Bias", {bias}); desc.SetInput("Bias", {bias});
desc.SetInput("ResidualData", {bias1}); desc.SetInput("ResidualData", {bias1});
desc.SetAttr("activation", activation); desc.SetAttr("activation", activation);
desc.SetOutput("Output", {output}); desc.SetOutput("Output", {output});
desc.SetAttr("is_test", true); desc.SetAttr("is_test", true);
desc.SetAttr("use_cudnn", false);
desc.Flush();
return *desc.Proto(); return *desc.Proto();
} }
std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl( std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const { std::unique_ptr<ir::Graph> graph) const {
const std::string pattern_name = "conv_elementwise_add_act_fuse"; const std::string pattern_name = "conv_elementwise_add2_act_fuse";
FusePassBase::Init(pattern_name, graph.get()); FusePassBase::Init(pattern_name, graph.get());
GraphPatternDetector gpd; GraphPatternDetector gpd;
@ -76,22 +78,23 @@ std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl(
framework::OpDesc new_op_desc(new_op_proto, nullptr); framework::OpDesc new_op_desc(new_op_proto, nullptr);
// Create a new node for the fused op. // Create a new node for the fused op.
graph->CreateOpNode(&new_op_desc); auto* new_conv_op = graph->CreateOpNode(&new_op_desc);
// Link inputs and outputs. // Link inputs and outputs.
PADDLE_ENFORCE(subgraph.count(x)); PADDLE_ENFORCE(subgraph.count(x));
auto* conv_in_node = subgraph.at(x); auto* conv_in_node = subgraph.at(x);
IR_NODE_LINK_TO(conv_in_node, conv_op); // Input IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input
IR_NODE_LINK_TO(conv_filter, conv_op); // Filter IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter
IR_NODE_LINK_TO(conv_op, conv_out); // Output IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias
IR_NODE_LINK_TO(elementwise_add_in_y, conv_op); // Bias IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op); // Bias
IR_NODE_LINK_TO(elementwise_add_in_y_1, conv_op); // Bias IR_NODE_LINK_TO(new_conv_op, act_out); // Output
// Delete the unneeded nodes. // Delete the unneeded nodes.
GraphSafeRemoveNodes(graph.get(), GraphSafeRemoveNodes(
{conv_op, elementwise_add_op, elementwise_add_op_1, graph.get(),
elementwise_add_out}); {conv_op, conv_out, elementwise_add_op, elementwise_add_op_1,
elementwise_add_out, elementwise_add_out_1, act_op});
}; };
gpd(graph.get(), handler); gpd(graph.get(), handler);
return graph; return graph;

@ -23,66 +23,8 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
namespace {
void CheckProgram(const ProgramDesc &program) {
#define _INT(role) static_cast<int>(role)
std::map<int, bool> visit;
for (OpDesc *op : program.Block(0).AllOps()) {
// For backward compatibility, some program doesn't have role added.
if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue;
int role_id =
boost::get<int>(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
visit[role_id] = true;
switch (role_id) {
case _INT(OpRole::kForward):
if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
LOG(ERROR) << "Cannot add backward operator before forward operator "
<< op->Type();
}
break;
case _INT(OpRole::kBackward):
case _INT(OpRole::kBackward) | _INT(OpRole::kLoss):
PADDLE_ENFORCE(
visit.find(_INT(OpRole::kOptimize)) == visit.end(),
"Cannot add backward operator %s after optimize operator.",
op->Type());
break;
case _INT(OpRole::kForward) | _INT(OpRole::kLoss):
PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) |
_INT(OpRole::kLoss)) == visit.end(),
"Cannot add backward|loss operator before "
"forward|loss operator %s.",
op->Type());
PADDLE_ENFORCE(
visit.find(_INT(OpRole::kOptimize)) == visit.end(),
"Cannot add forward|loss operator %s after optimize operator.",
op->Type());
break;
case _INT(OpRole::kOptimize):
case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched):
PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(),
"Optimize operators %s must follow backward operator.",
op->Type());
break;
case _INT(OpRole::kLRSched):
case _INT(OpRole::kDist):
case _INT(OpRole::kRPC):
case _INT(OpRole::kNotSpecified):
break;
default:
LOG(FATAL) << "Unknown operator role. Don't add new role because "
"you don't know what you are doing.";
}
}
#undef _INT
}
} // namespace
Graph::Graph(const ProgramDesc &program) : program_(program) { Graph::Graph(const ProgramDesc &program) : program_(program) {
CheckProgram(program_);
auto var_nodes = InitFromProgram(program_); auto var_nodes = InitFromProgram(program_);
ResolveHazard(var_nodes); ResolveHazard(var_nodes);
} }

@ -1101,9 +1101,7 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
return out_var; return out_var;
} }
std::unordered_set<std::string> conv_act_set({"identity", "sigmoid", "relu", std::unordered_set<std::string> conv_act_set({"identity", "relu"});
"relu6", "relux", "tanh",
"band_pass"});
PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) { PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) {
conv_in->AsInput(); conv_in->AsInput();
@ -1169,13 +1167,13 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
->AsInput(); ->AsInput();
auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr())
->assert_is_op_output("elementwise_add") ->assert_is_op_output("elementwise_add")
->assert_is_op_input("elementwise_add", "X") ->assert_is_op_input("elementwise_add", "Y")
->AsIntermediate(); ->AsIntermediate();
auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr()) auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr())
->assert_is_op("elementwise_add"); ->assert_is_op("elementwise_add");
auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr()) auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr())
->assert_is_op_input("elementwise_add", "Y") ->assert_is_op_input("elementwise_add", "X")
->AsInput(); ->AsInput();
auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr()) auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr())
->assert_is_op_output("elementwise_add") ->assert_is_op_output("elementwise_add")
@ -1203,8 +1201,8 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out}); conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out});
elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y})
.LinksTo({elementwise_add_out}); .LinksTo({elementwise_add_out});
elementwise_add_op_1->LinksFrom( elementwise_add_op_1->LinksFrom({elementwise_add_out, elementwise_add_in_y_1})
{elementwise_add_out, elementwise_add_in_y_1}); .LinksTo({elementwise_add_out_1});
act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out}); act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out});
return act_out; return act_out;
} }
@ -1236,6 +1234,78 @@ PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) {
return elementwise_add_out; return elementwise_add_out;
} }
PDNode *patterns::ConvAffineChannel::operator()(
paddle::framework::ir::PDNode *conv_input, bool with_eltwise_add) {
// Create Operators
conv_input->assert_is_op_input("conv2d", "Input");
auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
PDNode *eltwise_op = nullptr;
if (with_eltwise_add) {
eltwise_op =
pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add");
}
auto *affine_channel_op =
pattern->NewNode(affine_channel_repr())->assert_is_op("affine_channel");
// Create variables
// Conv Filter
auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("conv2d", "Filter");
auto *conv_out_var = pattern->NewNode(conv_out_repr())
->AsIntermediate()
->assert_is_only_output_of_op("conv2d");
PDNode *eltwise_y_in_var = nullptr;
PDNode *eltwise_out_var = nullptr;
if (with_eltwise_add) {
// Conv output as Bias input
conv_out_var->assert_is_op_input("elementwise_add", "X");
// Bias
eltwise_y_in_var = pattern->NewNode(eltwise_y_in_repr())
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
eltwise_out_var = pattern->NewNode(eltwise_out_repr())
->AsIntermediate()
->assert_is_only_output_of_op("elementwise_add");
} else {
// Conv output as AffineChannel input
conv_out_var->assert_is_op_input("affine_channel", "X");
}
// AC Scale
auto *ac_scale_var = pattern->NewNode(ac_scale_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("affine_channel", "Scale");
// AC Bias
auto *ac_bias_var = pattern->NewNode(ac_bias_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("affine_channel", "Bias");
// AC output
auto *ac_out_var = pattern->NewNode(ac_out_repr())
->AsOutput()
->assert_is_op_output("affine_channel");
conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
if (with_eltwise_add) {
eltwise_op->LinksFrom({conv_out_var, eltwise_y_in_var})
.LinksTo({eltwise_out_var});
affine_channel_op->LinksFrom({eltwise_out_var, ac_scale_var, ac_bias_var})
.LinksTo({ac_out_var});
} else {
affine_channel_op->LinksFrom({conv_out_var, ac_scale_var, ac_bias_var})
.LinksTo({ac_out_var});
}
return ac_out_var;
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle

@ -734,6 +734,38 @@ struct ConvElementwiseadd : public PatternBase {
PATTERN_DECL_NODE(elementwise_add_out); PATTERN_DECL_NODE(elementwise_add_out);
}; };
// Conv with affine_channel
// op: conv + (elementwise_add +) affine_channel
// named nodes:
// conv_weight, conv_out, conv,
// ac_x, ac_scale, ac_bias
// affine_channel, ac_out
struct ConvAffineChannel : public PatternBase {
ConvAffineChannel(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "conv_affine_channel") {}
PDNode* operator()(PDNode* conv_input, bool with_eltwise_add);
// declare operator node's name
PATTERN_DECL_NODE(conv);
PATTERN_DECL_NODE(affine_channel);
PATTERN_DECL_NODE(eltwise); // ELEMENTWISE_ADD
// CONV inputs
PATTERN_DECL_NODE(conv_weight); // Filter
// CONV outputs
PATTERN_DECL_NODE(conv_out); // tmp
// ELTWISE inputs
PATTERN_DECL_NODE(eltwise_y_in);
// ELTWISE outputs
PATTERN_DECL_NODE(eltwise_out); // tmp
// AC(Affine_Channel) inputs
PATTERN_DECL_NODE(ac_scale);
PATTERN_DECL_NODE(ac_bias);
// AC outputs
PATTERN_DECL_NODE(ac_out); // Out
};
} // namespace patterns } // namespace patterns
// Link two ir::Nodes from each other. // Link two ir::Nodes from each other.

@ -75,6 +75,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
std::vector<Node*> optimize_ops; std::vector<Node*> optimize_ops;
std::vector<Node*> lr_ops; // ops other than forward/backward/optimize std::vector<Node*> lr_ops; // ops other than forward/backward/optimize
std::unordered_set<std::string> grad_names; std::unordered_set<std::string> grad_names;
std::unordered_map<std::string, std::string> gradname2paramname;
std::vector<ir::Node*> nodes = TopologySortOperations(*graph); std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
auto origin_nodes = graph->ReleaseNodes(); auto origin_nodes = graph->ReleaseNodes();
@ -99,6 +100,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
auto op_role_vars = boost::get<std::vector<std::string>>(op_role_var); auto op_role_vars = boost::get<std::vector<std::string>>(op_role_var);
for (size_t i = 0; i < op_role_vars.size(); i += 2) { for (size_t i = 0; i < op_role_vars.size(); i += 2) {
grad_names.insert(op_role_vars[i + 1]); grad_names.insert(op_role_vars[i + 1]);
gradname2paramname[op_role_vars[i + 1]] = op_role_vars[i];
} }
} else if (op_role & static_cast<int>(framework::OpRole::kLRSched)) { } else if (op_role & static_cast<int>(framework::OpRole::kLRSched)) {
lr_ops.push_back(node); lr_ops.push_back(node);
@ -109,7 +111,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
// 2. copy forward backward // 2. copy forward backward
ir::Node* prev_repeat_last_op_node = nullptr; ir::Node* prev_repeat_last_op_node = nullptr;
// record origin_grad -> repeated grad list map. // record origin_grad -> repeated_grad_list map.
std::map<ir::Node*, std::vector<ir::Node*>> grad_repeated_map; std::map<ir::Node*, std::vector<ir::Node*>> grad_repeated_map;
std::map<std::string, std::vector<ir::Node*>> created; std::map<std::string, std::vector<ir::Node*>> created;
std::unordered_set<std::string> bn_vars_need_rename; std::unordered_set<std::string> bn_vars_need_rename;
@ -124,10 +126,16 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
if (grad_names.find(outname) != grad_names.end()) { if (grad_names.find(outname) != grad_names.end()) {
std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i); std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i);
repeated_op.RenameOutput(outname, new_gname); repeated_op.RenameOutput(outname, new_gname);
// remove op_role_var for backward ops that outputs grad for a
// parameter.
repeated_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(),
std::vector<std::string>());
} }
} }
// 3.5 let batch_norm ops use independent vars, note batch_norm_grad do // 3.5 let batch_norm ops use independent vars, note batch_norm_grad do
// not need this update // not need this update, because only moving mean and variance should be
// differ, trainable parameter scale and bias is the same as other
// parameters.
if (node->Name() == "batch_norm") { if (node->Name() == "batch_norm") {
// NOTE: assume bn op created by layers use save var as output mean and // NOTE: assume bn op created by layers use save var as output mean and
// variance // variance
@ -224,16 +232,25 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
var->inputs.push_back(repeated_node); var->inputs.push_back(repeated_node);
} }
} }
} } // end copy forward backward
// 5. create GRAD merge op node // 5. create GRAD merge op node: sum(repeat.0...repeat.n) ->
// scale(1/num_repeats)
for (auto kv : grad_repeated_map) { for (auto kv : grad_repeated_map) {
OpDesc sum_op; OpDesc sum_op;
sum_op.SetType("sum"); sum_op.SetType("sum");
std::vector<std::string> repeated_grad_names; std::vector<std::string> repeated_grad_names;
std::vector<std::string> param_grad_op_role_var;
for (auto r : kv.second) { for (auto r : kv.second) {
repeated_grad_names.push_back(r->Var()->Name()); repeated_grad_names.push_back(r->Var()->Name());
} }
// NOTE: use op_role_var to control allreduce op appending in
// multi_devices_graph_pass, we want to append op_role_var
// only once for the merged gradient, so break after first call.
param_grad_op_role_var.push_back(
gradname2paramname.at(kv.first->Var()->Name())); // param
param_grad_op_role_var.push_back(kv.first->Var()->Name()); // grad
sum_op.SetInput("X", repeated_grad_names); sum_op.SetInput("X", repeated_grad_names);
sum_op.SetOutput("Out", {kv.first->Var()->Name()}); sum_op.SetOutput("Out", {kv.first->Var()->Name()});
sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
@ -256,6 +273,10 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
scale_op.SetAttr("scale", static_cast<float>(1.0f / num_repeats)); scale_op.SetAttr("scale", static_cast<float>(1.0f / num_repeats));
scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
static_cast<int>(OpRole::kBackward)); static_cast<int>(OpRole::kBackward));
scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(),
param_grad_op_role_var);
auto scale_op_node = result.CreateOpNode(&scale_op); auto scale_op_node = result.CreateOpNode(&scale_op);
scale_op_node->inputs.push_back(sum_out_var_node); scale_op_node->inputs.push_back(sum_out_var_node);
sum_out_var_node->outputs.push_back(scale_op_node); sum_out_var_node->outputs.push_back(scale_op_node);

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save