Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into change_requirements

test=develop
panyx0718-patch-1
minqiyang 6 years ago
commit 1b372d9b9d

3
.gitignore vendored

@ -1,9 +1,11 @@
python/paddle/fluid/tests/unittests/reader_reset_test.recordio
paddle/operators/check_t.save
paddle/operators/check_tensor.ls
paddle/operators/tensor.save
python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
paddle/fluid/operators/distributed/send_recv.proto
*.DS_Store
*.vs
build/
@ -28,4 +30,5 @@ third_party/
build_*
# clion workspace.
cmake-build-*
paddle/fluid/operators/distributed/send_recv.proto
model_test

@ -42,6 +42,7 @@
| QiJune | Jun Qi |
| qingqing01 | Qing-Qing Dang |
| reyoung | Yang Yu |
| Sand3r- | Michal Gallus |
| Superjom | Chun-Wei Yan |
| tensor-tang | Jian Tang |
| tianbingsz | Tian-Bing Xu |

@ -302,6 +302,14 @@ set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
if (ON_INFER)
message(STATUS "On inference mode, will take place some specific optimization.")
add_definitions(-DPADDLE_ON_INFERENCE)
else()
#TODO(luotao), combine this warning with `make inference_lib_dist` command.
message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
endif()
add_subdirectory(paddle)
if(WITH_PYTHON)
add_subdirectory(python)
@ -312,11 +320,3 @@ if(WITH_DOC)
find_python_module(recommonmark REQUIRED)
add_subdirectory(doc)
endif()
if (ON_INFER)
message(STATUS "On inference mode, will take place some specific optimization.")
add_definitions(-DPADDLE_ON_INFERENCE)
else()
#TODO(luotao), combine this warning with `make inference_lib_dist` command.
message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
endif()

@ -218,3 +218,7 @@ endif(WITH_GRPC)
if(WITH_BRPC_RDMA)
add_definitions(-DPADDLE_WITH_BRPC_RDMA)
endif(WITH_BRPC_RDMA)
if(ON_INFER)
add_definitions(-DPADDLE_ON_INFERENCE)
endif(ON_INFER)

@ -166,8 +166,8 @@ copy(framework_lib DEPS ${framework_lib_deps}
set(module "memory")
copy(memory_lib
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h ${src_dir}/${module}/allocation/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail ${dst_dir}/${module}/allocation
)
set(inference_deps paddle_fluid_shared paddle_fluid)

@ -0,0 +1,219 @@
set(PART_CUDA_KERNEL_FILES)
function(op_library TARGET)
# op_library is a function to create op library. The interface is same as
# cc_library. But it handle split GPU/CPU code and link some common library
# for ops.
set(cc_srcs)
set(cu_srcs)
set(hip_cu_srcs)
set(miopen_hip_cc_srcs)
set(cu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(CUDNN_FILE)
set(mkldnn_cc_srcs)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
set(pybind_flag 0)
cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
list(LENGTH op_library_SRCS op_library_SRCS_len)
if (${op_library_SRCS_len} EQUAL 0)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
list(APPEND cc_srcs ${TARGET}.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
list(APPEND cu_cc_srcs ${TARGET}.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
endif()
string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
endif()
if(WITH_AMD_GPU)
string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc)
list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc)
endif()
endif()
if(WITH_MKLDNN)
string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.hip.cu$")
list(APPEND hip_cu_srcs ${src})
elseif (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
list(APPEND cudnn_cu_cc_srcs ${src})
elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$")
list(APPEND miopen_hip_cc_srcs ${src})
elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
endif()
endforeach()
endif()
list(LENGTH cc_srcs cc_srcs_len)
if (${cc_srcs_len} EQUAL 0)
message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
endif()
if (WIN32)
# remove windows unsupported op, because windows has no nccl, no warpctc such ops.
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
"crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
"fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
return()
endif()
endforeach()
endif(WIN32)
set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} CACHE INTERNAL "op libs")
list(LENGTH op_library_DEPS op_library_DEPS_len)
if (${op_library_DEPS_len} GREATER 0)
set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
endif()
if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
elseif (WITH_AMD_GPU)
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
# Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
endforeach()
# The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
# Note that it's enough to just adding one operator to pybind in a *_op.cc file.
# And for detail pybind information, please see generated paddle/pybind/pybind.h.
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}")
if (one_register STREQUAL "")
string(REPLACE "_op" "" TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}")
string(REPLACE "," "" TARGET "${TARGET}")
endif()
# pybind USE_NO_KERNEL_OP
# HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel
string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
string(REPLACE "_op" "" TARGET "${TARGET}")
if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
# pybind USE_CPU_ONLY_OP
list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
list(LENGTH hip_cu_srcs hip_cu_srcs_len)
list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
# pybind USE_OP_DEVICE_KERNEL for CUDNN
list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len)
if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MIOPEN
if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
endif()
# pybind USE_OP
if (${pybind_flag} EQUAL 0)
# NOTE(*): activation use macro to regist the kernels, set use_op manually.
if(${TARGET} STREQUAL "activation")
file(APPEND ${pybind_file} "USE_OP(relu);\n")
elseif(${TARGET} STREQUAL "fake_dequantize")
file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
elseif(${TARGET} STREQUAL "fake_quantize")
file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n")
elseif(${TARGET} STREQUAL "tensorrt_engine_op")
message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
elseif(${TARGET} STREQUAL "fc")
# HACK: fc only have mkldnn and cpu, which would mismatch the cpu only condition
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
else()
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
endif()
endif()
endfunction()
function(register_operators)
set(options "")
set(oneValueArgs "")
set(multiValueArgs EXCLUDES DEPS)
cmake_parse_arguments(register_operators "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" OPS "${OPS}")
string(REPLACE ".cc" "" OPS "${OPS}")
list(REMOVE_DUPLICATES OPS)
list(LENGTH register_operators_DEPS register_operators_DEPS_len)
foreach(src ${OPS})
list(FIND register_operators_EXCLUDES ${src} _index)
if (${_index} EQUAL -1)
if (${register_operators_DEPS_len} GREATER 0)
op_library(${src} DEPS ${register_operators_DEPS})
else()
op_library(${src})
endif()
endif()
endforeach()
endfunction()

@ -93,11 +93,11 @@ paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized',
paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None))
paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None))
paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times'], varargs=None, keywords=None, defaults=(0, False))
paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False))
paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None))
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
@ -128,6 +128,7 @@ paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates',
paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.selu ArgSpec(args=['x', 'scale', 'alpha', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))

@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise_op_function.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"

@ -30,6 +30,8 @@ class ExceptionHolder {
Catch(exp);
} catch (platform::EnforceNotMet exp) {
Catch(exp);
} catch (std::exception& ex) {
LOG(FATAL) << "std::exception caught, " << ex.what();
} catch (...) {
LOG(FATAL) << "Unknown exception caught";
}

@ -418,11 +418,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
&(ctx->cur_ref_cnts_));
}
if (FLAGS_benchmark) {
VLOG(20) << "Memory used after operator " + op->Type() + " running: "
<< memory::memory_usage(place_);
}
}
if (gc != nullptr) {
@ -444,13 +439,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
scope->DropKids();
}
}
if (FLAGS_benchmark) {
VLOG(20) << "-------------------------------------------------------";
VLOG(20) << "Memory used after deleting local scope: "
<< memory::memory_usage(place_);
VLOG(20) << "-------------------------------------------------------";
}
}
void Executor::RunPreparedContext(

@ -41,6 +41,7 @@ pass_library(seq_concat_fc_fuse_pass inference)
pass_library(multi_batch_merge_pass base)
pass_library(conv_bn_fuse_pass inference)
pass_library(seqconv_eltadd_relu_fuse_pass inference)
pass_library(is_test_pass base)
if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base)
pass_library(depthwise_conv_mkldnn_pass base)
@ -62,6 +63,7 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
if (WITH_MKLDNN)
cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)

@ -15,24 +15,119 @@
#pragma once
#include <string>
#include <tuple>
#include <utility>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include <boost/optional.hpp>
namespace paddle {
namespace framework {
namespace ir {
class ConvElementwiseAddMKLDNNFusePass : public FusePassBase {
using graph_ptr = std::unique_ptr<ir::Graph>;
using GraphWithStats = std::pair<ir::Graph*, int>;
void CorrectGraphEdges(Graph* graph, Node* from, Node* to);
bool IsReachable(ir::Graph* graph, Node* from, Node* to);
boost::optional<Node*> HasBias(const Node& op, const std::string& bias_name);
class ResidualConnectionMKLDNNFusePass : public FusePassBase {
private:
GraphWithStats FuseConvAsX(const std::string& name_scope,
const GraphWithStats& graph_with_stats) const;
GraphWithStats FuseConvAsY(const std::string& name_scope,
const GraphWithStats& graph_with_stats) const;
GraphWithStats FuseProjectionConv(
const std::string& name_scope,
const GraphWithStats& graph_with_stats) const;
template <typename RetType>
using GetNodeFunc =
std::function<RetType(const GraphPatternDetector::subgraph_t& subgraph)>;
using IdentityConvFunc = GetNodeFunc<std::tuple<Node*, Node*, Node*, Node*>>;
using IdentityElementwiseAddFunc =
GetNodeFunc<std::tuple<Node*, Node*, Node*>>;
using ProjectionConvFunc = IdentityConvFunc;
using ProjectionElementwiseAddFunc = GetNodeFunc<std::tuple<Node*, Node*>>;
using CanFuseFunc = std::function<bool(Node*, Node*)>;
std::tuple<Node*, Node*, Node*, Node*> GetNodesFromConv(
const patterns::Conv& conv_pattern,
const GraphPatternDetector::subgraph_t& subgraph) const;
std::tuple<Node*, Node*, Node*, Node*> GetNodesFromProjectionConv(
const patterns::Conv& conv_pattern,
const GraphPatternDetector::subgraph_t& subgraph) const;
template <typename HandleType, typename... OpFuncs>
GraphWithStats ExecuteHandleOnGraph(GraphPatternDetector* gpd,
const GraphWithStats& graph_with_stats,
OpFuncs&&... op_funcs) const {
ir::Graph* graph;
int stats;
std::tie(graph, stats) = graph_with_stats;
auto can_fuse = [this](Node* op1, Node* op2) -> bool {
return this->FindFuseOption(*op1, *op2) == FUSE_MKLDNN;
};
auto fuse_handle = HandleType{can_fuse, std::forward<OpFuncs>(op_funcs)...};
(*gpd)(graph, fuse_handle);
return std::make_pair(graph, stats + fuse_handle.get_stats());
}
struct IdentityFuseHandle {
IdentityFuseHandle(
const CanFuseFunc& can_fuse_func,
const IdentityConvFunc& get_node_from_conv_op,
const IdentityElementwiseAddFunc& get_node_from_elementwise_add_op);
void operator()(const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph);
int get_stats() const { return *fusion_stats; }
private:
std::shared_ptr<int> fusion_stats;
CanFuseFunc can_fuse_func;
IdentityConvFunc get_node_from_conv_op;
IdentityElementwiseAddFunc get_node_from_elementwise_add_op;
};
struct ProjectionFuseHandle {
ProjectionFuseHandle(
const CanFuseFunc& can_fuse_func,
const ProjectionConvFunc& get_node_from_conv_x_op,
const ProjectionConvFunc& get_node_from_conv_y_op,
const ProjectionElementwiseAddFunc& get_node_from_elementwise_add_op);
void operator()(const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph);
int get_stats() const { return *fusion_stats; }
private:
std::shared_ptr<int> fusion_stats;
CanFuseFunc can_fuse_func;
ProjectionConvFunc get_node_from_conv_x_op;
ProjectionConvFunc get_node_from_conv_y_op;
ProjectionElementwiseAddFunc get_node_from_elementwise_add_op;
};
public:
virtual ~ConvElementwiseAddMKLDNNFusePass() {}
virtual ~ResidualConnectionMKLDNNFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(graph_ptr graph) const;
const std::string name_scope_{"residual_connections_fuse_pass"};
const std::string name_scope_{"residual_connection_fuse_pass"};
};
} // namespace ir
} // namespace framework
} // namespace paddle

@ -40,7 +40,7 @@ void SetOp(ProgramDesc* prog, const std::string& type,
op->SetOutput(output.first, {output.second});
}
struct IsReachable {
struct TestIsReachable {
using func = std::function<bool(const std::string&, const std::string&)>;
auto operator()(const std::unique_ptr<ir::Graph>& graph) -> func {
@ -89,7 +89,9 @@ struct IsReachable {
}
};
void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph) {
void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph,
int expected_conv_count,
int expected_elementwise_add_count = 0) {
int conv_count = 0;
int elementwise_add_count = 0;
@ -101,8 +103,8 @@ void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph) {
++elementwise_add_count;
}
}
EXPECT_EQ(conv_count, 1);
EXPECT_EQ(elementwise_add_count, 0);
EXPECT_EQ(conv_count, expected_conv_count);
EXPECT_EQ(elementwise_add_count, expected_elementwise_add_count);
}
ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars,
@ -127,22 +129,13 @@ ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars,
return prog;
}
} // namespace
TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionWithElementwiseAddRelu) {
auto prog =
BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"});
SetOp(&prog, "conv2d",
{{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}},
{"Output", "b"});
SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
void RunPassAndAssert(ProgramDesc* prog, const std::string& from,
const std::string& to, int expected_conv_num) {
std::unique_ptr<ir::Graph> graph(new ir::Graph(*prog));
IsReachable is_reachable;
EXPECT_TRUE(is_reachable(graph)("a", "relu"));
TestIsReachable is_reachable;
EXPECT_TRUE(is_reachable(graph)(from, to));
auto pass =
PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
@ -150,82 +143,87 @@ TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionWithElementwiseAddRelu) {
graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size();
EXPECT_TRUE(is_reachable(graph)("a", "relu"));
EXPECT_TRUE(is_reachable(graph)(from, to));
EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
current_nodes_num);
AssertOpsCount(graph);
AssertOpsCount(graph, expected_conv_num);
}
} // namespace
TEST(ConvElementwiseAddMKLDNNFusePass,
ConvolutionWithElementwiseAddReluNoBias) {
auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
SetOp(&prog, "conv2d", {{"Input", "a"}, {"Filter", "weights"}},
{"Output", "b"});
SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsYWithElementwiseAddRelu) {
auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"});
IsReachable is_reachable;
SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
SetOp(&prog, "conv2d",
{{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
{"Output", "c"});
EXPECT_TRUE(is_reachable(graph)("a", "relu"));
SetOp(&prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
auto pass =
PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
int original_nodes_num = graph->Nodes().size();
graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size();
RunPassAndAssert(&prog, "a", "relu", 1);
}
EXPECT_TRUE(is_reachable(graph)("a", "relu"));
TEST(ConvElementwiseAddMKLDNNFusePass,
ConvolutionAsYWithElementwiseAddReluNoBias) {
auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
current_nodes_num);
SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
{"Output", "c"});
SetOp(&prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
AssertOpsCount(graph);
RunPassAndAssert(&prog, "a", "relu", 1);
}
TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionElementwiseAdd) {
auto prog = BuildProgramDesc({"a", "b", "c", "d"}, {"bias", "weights"});
TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsXWithElementwiseAddRelu) {
auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"});
SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
SetOp(&prog, "conv2d",
{{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}},
{"Output", "b"});
SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
{{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
{"Output", "c"});
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
IsReachable is_reachable;
EXPECT_TRUE(is_reachable(graph)("a", "d"));
RunPassAndAssert(&prog, "a", "relu", 1);
}
auto pass =
PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
int original_nodes_num = graph->Nodes().size();
graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size();
TEST(ConvElementwiseAddMKLDNNFusePass,
ConvolutionAsXWithElementwiseAddReluNoBias) {
auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
EXPECT_FALSE(is_reachable(graph)("a", "d"));
SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
{"Output", "c"});
SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
current_nodes_num);
AssertOpsCount(graph);
RunPassAndAssert(&prog, "a", "relu", 1);
}
TEST(ConvElementwiseAddMKLDNNFusePass, SigmoidConvolutionAddElementwiseRelu) {
TEST(ConvElementwiseAddMKLDNNFusePass, NoFusion) {
auto prog =
BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"});
BuildProgramDesc({"a", "b", "c", "d", "e", "f", "g"}, {"weights"});
SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
SetOp(&prog, "conv2d",
{{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
{"Output", "c"});
SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "d"}}, {"Out", "e"});
SetOp(&prog, "relu", {{"X", "e"}}, {"Out", "f"});
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
SetOp(&prog, "conv2d", {{"Input", "d"}, {"Filter", "weights"}},
{"Output", "e"});
IsReachable is_reachable;
SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "e"}}, {"Out", "f"});
SetOp(&prog, "relu", {{"X", "f"}}, {"Out", "g"});
EXPECT_TRUE(is_reachable(graph)("a", "f"));
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
TestIsReachable is_reachable;
EXPECT_TRUE(is_reachable(graph)("a", "g"));
auto pass =
PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
@ -233,11 +231,10 @@ TEST(ConvElementwiseAddMKLDNNFusePass, SigmoidConvolutionAddElementwiseRelu) {
graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size();
EXPECT_TRUE(is_reachable(graph)("a", "f"));
EXPECT_TRUE(is_reachable(graph)("a", "g"));
EXPECT_EQ(original_nodes_num, current_nodes_num);
EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
current_nodes_num);
AssertOpsCount(graph);
AssertOpsCount(graph, 2, 1);
}
} // namespace ir

@ -15,8 +15,15 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph_helper.h"
#include <algorithm>
#include <deque>
#include <fstream>
#include <iosfwd>
#include <ostream>
#include <unordered_set>
DEFINE_string(print_sub_graph_dir, "",
"FLAGS_print_sub_graph_dir is used "
"to print the nodes of sub_graphs.");
namespace paddle {
namespace framework {
namespace ir {
@ -164,12 +171,15 @@ size_t GraphNum(const Graph &graph) {
graph_nodes.emplace_back(g_nodes);
}
if (VLOG_IS_ON(100)) {
VLOG(100) << "graph_num: " << graph_nodes.size();
for (auto &g_n : graph_nodes) {
VLOG(100) << "graph_nodes: " << g_n.size();
if (g_n.size() < 10) {
std::stringstream out;
if (FLAGS_print_sub_graph_dir.size()) {
if (graph_nodes.size() > 1) {
std::stringstream out;
for (auto &g_n : graph_nodes) {
out << "graph_nodes: " << g_n.size() << "\n";
}
out << "\n\n";
for (auto &g_n : graph_nodes) {
out << "graph_nodes: " << g_n.size();
for (auto &node : g_n) {
out << "\nNode: " << node->Name() << " in [";
for (auto &n : node->inputs) {
@ -181,8 +191,12 @@ size_t GraphNum(const Graph &graph) {
}
out << "]";
}
VLOG(100) << out.str();
out << "\n\n\n";
}
std::unique_ptr<std::ostream> fout(
new std::ofstream(FLAGS_print_sub_graph_dir));
PADDLE_ENFORCE(fout->good());
*fout << out.str();
}
}

@ -1084,16 +1084,12 @@ PDNode *patterns::Conv::operator()() {
return output_var;
}
PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var) {
PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
->assert_is_op("elementwise_add");
x_var->assert_is_op_input("elementwise_add", "X");
auto y_var = pattern->NewNode(elementwise_add_x_repr())
->AsInput()
->assert_is_op_input("elementwise_add", "Y");
x_var->AsInput()->assert_is_op_input("elementwise_add", "X");
y_var->AsInput()->assert_is_op_input("elementwise_add", "Y");
auto out_var = pattern->NewNode(elementwise_add_out_repr())
->AsOutput()
->assert_is_op_output("elementwise_add", "Out");

@ -664,7 +664,7 @@ struct ElementwiseAdd : public PatternBase {
ElementwiseAdd(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "elementwise_add") {}
PDNode* operator()(PDNode* x_var);
PDNode* operator()(PDNode* x_var, PDNode* y_var);
PATTERN_DECL_NODE(elementwise_add_op);
PATTERN_DECL_NODE(elementwise_add_x);

@ -0,0 +1,57 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/is_test_pass.h"
#include <string>
#include <utility>
namespace paddle {
namespace framework {
namespace ir {
std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
VLOG(3) << "Sets is_test attrbiute to true and if it is missing, inserts it "
"for activations and pooling.";
auto op_list = {"pool2d", "sigmoid", "logsigmoid",
"softshrink", "exp", "brelu",
"pow", "leaky_relu", "stanh",
"relu", "tanh", "tanh_shrink",
"sqrt", "abs", "ceil",
"elu", "floor", "cos",
"sin", "round", "reciprocal",
"hard_shrink", "hard_sigmoid", "relu6",
"soft_relu", "swish", "thresholded_relu",
"log", "square", "softplus",
"softsign"};
for (const Node* n : graph->Nodes()) {
if (n->IsOp()) {
auto* op = n->Op();
if (op->HasAttr("is_test")) {
op->SetAttr("is_test", true);
} else if (std::find(begin(op_list), end(op_list), op->Type()) !=
end(op_list)) {
op->MutableAttrMap()->insert(
std::pair<std::string, Attribute>("is_test", true));
}
}
}
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(is_test_pass, paddle::framework::ir::IsTestPass);

@ -0,0 +1,31 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class IsTestPass : public Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle

@ -0,0 +1,117 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/is_test_pass.h"
#include <gtest/gtest.h>
namespace paddle {
namespace framework {
namespace ir {
enum class ISTEST_STATE { FALSE, TRUE, UNSET };
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, bool use_mkldnn = false,
ISTEST_STATE is_test = ISTEST_STATE::UNSET) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetAttr("name", name);
op->SetInput("X", inputs);
op->SetOutput("Out", outputs);
op->SetAttr("use_mkldnn", use_mkldnn);
if (is_test == ISTEST_STATE::UNSET)
op->MutableAttrMap()->erase("is_test");
else if (is_test == ISTEST_STATE::FALSE)
op->SetAttr("is_test", false);
else
op->SetAttr("is_test", true);
}
// a->pool2d->b
// b->relu->c
// c,weights1)->conv2d->d
//
// d->pool2d->e
// e->hard_sigmoid->f
// (f,weights2)->conv2d->g
//
// g->pool2d->h
// h->tanh->i
// (i,weights3)->conv2d->j
ProgramDesc BuildProgramDesc() {
ProgramDesc prog;
for (auto& v :
std::vector<std::string>({"a", "b", "c", "d", "e", "f", "g", "h", "i",
"j", "weights1", "weights2", "weights3"})) {
auto* var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::SELECTED_ROWS);
if (v == "weights1" || v == "weights2" || v == "weights3") {
var->SetPersistable(true);
}
}
SetOp(&prog, "pool2d", "pooling1", std::vector<std::string>({"a"}),
std::vector<std::string>({"b"}), true, ISTEST_STATE::TRUE);
SetOp(&prog, "relu", "activation1", std::vector<std::string>({"b"}),
std::vector<std::string>({"c"}), true, ISTEST_STATE::TRUE);
SetOp(&prog, "conv2d", "conv1", std::vector<std::string>({"c", "weights1"}),
std::vector<std::string>({"d"}), true, ISTEST_STATE::TRUE);
SetOp(&prog, "pool2d", "pooling2", std::vector<std::string>({"d"}),
std::vector<std::string>({"e"}), false, ISTEST_STATE::FALSE);
SetOp(&prog, "hard_sigmoid", "activation2", std::vector<std::string>({"e"}),
std::vector<std::string>({"f"}), false, ISTEST_STATE::FALSE);
SetOp(&prog, "conv2d", "conv2", std::vector<std::string>({"f", "weights2"}),
std::vector<std::string>({"g"}), false, ISTEST_STATE::FALSE);
SetOp(&prog, "pool2d", "pooling3", std::vector<std::string>({"g"}),
std::vector<std::string>({"h"}), false, ISTEST_STATE::UNSET);
SetOp(&prog, "tanh", "activation3", std::vector<std::string>({"h"}),
std::vector<std::string>({"i"}), true, ISTEST_STATE::UNSET);
SetOp(&prog, "conv2d", "conv3", std::vector<std::string>({"i", "weights3"}),
std::vector<std::string>({"j"}), false, ISTEST_STATE::UNSET);
return prog;
}
TEST(IsTestPass, basic) {
auto prog = BuildProgramDesc();
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
auto pass = PassRegistry::Instance().Get("is_test_pass");
graph = pass->Apply(std::move(graph));
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op = node->Op();
auto op_name = boost::get<std::string>(op->GetAttr("name"));
if (op_name == "conv3") {
ASSERT_FALSE(op->HasAttr("is_test"));
} else {
ASSERT_TRUE(op->HasAttr("is_test"));
EXPECT_TRUE(boost::get<bool>(op->GetAttr("is_test")));
}
}
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(is_test_pass);

@ -111,9 +111,6 @@ class LoDTensor : public Tensor {
public:
LoDTensor() : Tensor() {}
/* Constructor with place should only be used in pybind */
explicit LoDTensor(const platform::Place& place) : Tensor(place) {}
explicit LoDTensor(const LoD& lod) : lod_(lod) {}
void set_lod(const LoD& lod) { lod_ = lod; }

@ -23,6 +23,7 @@
#include "paddle/fluid/framework/details/cow_ptr.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "glog/logging.h"
@ -31,46 +32,6 @@ namespace paddle {
namespace framework {
#if defined(PADDLE_WITH_CUDA)
namespace details {
struct CUDABuffer {
void *data_{nullptr};
size_t size_{0};
platform::CUDAPlace place_;
CUDABuffer() {}
CUDABuffer(platform::Place place, size_t size)
: size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
data_ = memory::Alloc(place_, size);
}
~CUDABuffer() { ClearMemory(); }
CUDABuffer(const CUDABuffer &o) = delete;
CUDABuffer &operator=(const CUDABuffer &o) = delete;
void Resize(platform::Place place, size_t size) {
ClearMemory();
place_ = boost::get<platform::CUDAPlace>(place);
data_ = memory::Alloc(place_, size);
PADDLE_ENFORCE_NOT_NULL(data_);
size_ = size;
}
void Swap(CUDABuffer &o) {
std::swap(data_, o.data_);
std::swap(place_, o.place_);
std::swap(size_, o.size_);
}
private:
void ClearMemory() const {
if (data_ != nullptr) {
memory::Free(place_, data_);
}
}
};
} // namespace details
// Vector<T> implements the std::vector interface, and can get Data or
// MutableData from any place. The data will be synced implicitly inside.
template <typename T>
@ -103,8 +64,6 @@ class Vector {
o.ImmutableCPU();
cpu_ = o.cpu_;
flag_ = kDataInCPU;
details::CUDABuffer null;
gpu_.Swap(null);
return *this;
}
@ -199,7 +158,7 @@ class Vector {
PADDLE_ENFORCE(platform::is_gpu_place(place),
"CUDA Data must on CUDA place");
ImmutableCUDA(place);
return reinterpret_cast<T *>(gpu_.data_);
return reinterpret_cast<T *>(gpu_->ptr());
}
// get cuda ptr. mutable
@ -234,13 +193,11 @@ class Vector {
std::mutex &Mutex() const { return mtx_; }
std::unique_ptr<platform::CUDAPlace> CUDAPlace() const {
if (gpu_.data_ == nullptr) {
return nullptr;
} else {
return std::unique_ptr<platform::CUDAPlace>(
new platform::CUDAPlace(gpu_.place_));
}
boost::optional<platform::CUDAPlace> CUDAPlace() const {
return gpu_ == nullptr
? boost::none
: boost::optional<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(gpu_->place()));
}
private:
@ -254,13 +211,12 @@ class Vector {
void CopyToCPU() const {
// COPY GPU Data To CPU
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(
platform::Place(gpu_.place_)));
platform::DeviceContextPool::Instance().Get(gpu_->place()));
auto stream = dev_ctx->stream();
void *src = gpu_.data_;
void *src = gpu_->ptr();
void *dst = cpu_.data();
memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
stream);
memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
gpu_->size(), stream);
dev_ctx->Wait();
}
@ -277,8 +233,7 @@ class Vector {
CopyCPUDataToCUDA(place);
UnsetFlag(kDirty);
SetFlag(kDataInCUDA);
} else if (IsInCUDA() &&
!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
} else if (IsInCUDA() && !(place == gpu_->place())) {
PADDLE_THROW("This situation should not happen");
// Still dirty
} else {
@ -290,7 +245,7 @@ class Vector {
// Even data is not dirty. However, data is not in CUDA. Copy data.
CopyCPUDataToCUDA(place);
SetFlag(kDataInCUDA);
} else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
} else if (!(place == gpu_->place())) {
PADDLE_THROW("This situation should not happen.");
} else {
// Not Dirty && DataInCUDA && Device is same
@ -301,13 +256,13 @@ class Vector {
void CopyCPUDataToCUDA(const platform::Place &place) const {
void *src = cpu_.data();
gpu_.Resize(place, cpu_.size() * sizeof(T));
void *dst = gpu_.data_;
gpu_ = memory::Alloc(place, cpu_.size() * sizeof(T));
void *dst = gpu_->ptr();
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream();
memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
stream);
memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
gpu_->size(), stream);
}
void ImmutableCPU() const {
@ -329,7 +284,7 @@ class Vector {
bool IsInCPU() const { return flag_ & kDataInCPU; }
mutable std::vector<T> cpu_;
mutable details::CUDABuffer gpu_;
mutable memory::AllocationPtr gpu_;
mutable int flag_;
mutable std::mutex mtx_;
@ -428,8 +383,8 @@ class Vector {
auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == nullptr ||
*cuda_place == boost::get<platform::CUDAPlace>(place)) {
if (cuda_place == boost::none ||
cuda_place == boost::get<platform::CUDAPlace>(place)) {
return m_.Data().CUDAData(place);
}
}
@ -444,8 +399,8 @@ class Vector {
auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == nullptr ||
*cuda_place == boost::get<platform::CUDAPlace>(place)) {
if (cuda_place == boost::none ||
cuda_place == boost::get<platform::CUDAPlace>(place)) {
return m_.MutableData()->CUDAMutableData(place);
}
}

@ -70,6 +70,16 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
}
void NaiveExecutor::Run() {
#ifndef PADDLE_ON_INFERENCE
LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the "
"cmake flag ON_INFER is not set.";
LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and "
"variables will be reused to save the allocation "
"overhead.";
LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by "
"setting the cmake flag ON_INFER=ON if you are "
"running Paddle Inference";
#endif // PADDLE_ON_INFERENCE
for (auto &op : ops_) {
VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
<< " on scope " << scope_;

@ -63,6 +63,8 @@ struct OpKernelType {
place_(dev_ctx.GetPlace()),
library_type_(library_type) {}
size_t hash_key() const { return Hash()(*this); }
bool operator==(const OpKernelType& o) const {
return platform::places_are_same_class(place_, o.place_) &&
data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&

@ -35,6 +35,11 @@ DEFINE_bool(check_nan_inf, false,
namespace paddle {
namespace framework {
// Combine two hash values to a single hash.
inline size_t CombineHash(size_t seed, size_t a) {
return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN),
std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain),
@ -794,6 +799,17 @@ void OperatorWithKernel::TransferInplaceVarsBack(
Scope* OperatorWithKernel::TryTransferData(
const Scope& scope, const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars) const {
// In the inference scenerio, the scopes will be reused across the batches, so
// the `new_scope` here will result in GPU memroy explosion over the running of
// operators.
// We use a thread_local cache to fix that issue, the key in the cache is the
// combination of the `scope` argument, from_kernel_type, target_kernel_type.
// Have a discussion with @Superjomn or the inference developers if some changes
// on this logic for this macro might not tested on the other scenerios.
#ifdef PADDLE_ON_INFERENCE
thread_local std::unordered_map<size_t, Scope*> infer_transfer_scope_cache;
#endif
Scope* new_scope = nullptr;
for (auto& var_name_item : Inputs()) {
for (auto& var_name : var_name_item.second) {
@ -824,11 +840,28 @@ Scope* OperatorWithKernel::TryTransferData(
VLOG(30) << "Transform Variable " << var_name << " from "
<< kernel_type_for_var << " to " << expected_kernel_key;
#ifdef PADDLE_ON_INFERENCE
size_t infer_cache_key =
CombineHash(OpKernelType::Hash()(kernel_type_for_var),
OpKernelType::Hash()(expected_kernel_key));
infer_cache_key =
CombineHash(infer_cache_key, std::hash<const Scope*>()(&scope));
auto it = infer_transfer_scope_cache.find(infer_cache_key);
if (it != infer_transfer_scope_cache.end()) {
new_scope = infer_transfer_scope_cache[infer_cache_key];
} else {
new_scope = &scope.NewScope();
infer_transfer_scope_cache[infer_cache_key] = new_scope;
}
#endif
if (new_scope == nullptr) {
new_scope = &scope.NewScope();
}
auto* trans_var = new_scope->Var(var_name);
Tensor out;
TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
SetTensorToVariable(*var, out, trans_var);

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save