Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into imperative_shared_ptr

test=develop
minqiyang 6 years ago
commit ddfb9f1123

@@ -63,6 +63,15 @@ ADD_DEPENDENCIES(gflags extern_gflags)
LIST(APPEND external_project_dependencies gflags)
+# On Windows (including MinGW), the Shlwapi library is used by gflags if available.
+if (WIN32)
+  include(CheckIncludeFileCXX)
+  check_include_file_cxx("shlwapi.h" HAVE_SHLWAPI)
+  if (HAVE_SHLWAPI)
+    set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib)
+  endif(HAVE_SHLWAPI)
+endif (WIN32)
IF(WITH_C_API)
  INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags)
  IF(ANDROID)

@@ -359,6 +359,8 @@ function(cc_binary TARGET_NAME)
    add_dependencies(${TARGET_NAME} ${cc_binary_DEPS})
    common_link(${TARGET_NAME})
  endif()
+  get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+  target_link_libraries(${TARGET_NAME} ${os_dependency_modules})
endfunction(cc_binary)

function(cc_test TARGET_NAME)
@@ -367,18 +369,15 @@ function(cc_test TARGET_NAME)
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS ARGS)
  cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  add_executable(${TARGET_NAME} ${cc_test_SRCS})
  if(WIN32)
-    list(APPEND win32_deps shlwapi)
    if("${cc_test_DEPS};" MATCHES "python;")
      list(REMOVE_ITEM cc_test_DEPS python)
-      list(APPEND win32_deps ${PYTHON_LIBRARIES})
+      target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES})
    endif()
  endif(WIN32)
-  add_executable(${TARGET_NAME} ${cc_test_SRCS})
-  target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
-  if(WIN32)
-    target_link_libraries(${TARGET_NAME} ${win32_deps})
-  endif(WIN32)
+  get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+  target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} paddle_gtest_main lod_tensor memory gtest gflags glog)
  add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
  common_link(${TARGET_NAME})
  add_test(NAME ${TARGET_NAME}
@@ -451,7 +450,8 @@ function(nv_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS)
    cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules})
    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
    common_link(${TARGET_NAME})
    add_test(${TARGET_NAME} ${TARGET_NAME})
@@ -538,7 +538,8 @@ function(hip_test TARGET_NAME)
    endif()
    add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
    set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
-    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
+    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags ${os_dependency_modules})
    add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
    common_link(${TARGET_NAME})
    add_test(${TARGET_NAME} ${TARGET_NAME})

@@ -88,6 +88,7 @@ paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'poo
paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None))
paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False))
+paddle.fluid.layers.data_norm ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'use_mkldnn', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, False, None, None, None, False))
paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
@@ -210,6 +211,7 @@ paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], va
paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0))
paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))

@@ -48,6 +48,17 @@ pass_library(conv_elementwise_add_act_fuse_pass inference)
pass_library(conv_elementwise_add2_act_fuse_pass inference)
pass_library(conv_elementwise_add_fuse_pass inference)
pass_library(conv_affine_channel_fuse_pass inference)
+pass_library(transpose_flatten_concat_fuse_pass inference)
+# There may be many transpose-flatten structures in a model, and the outputs of
+# these structures are used as inputs to the concat op. This pattern is
+# detected by our pass. The index here is the number of such structures in the
+# pattern. We register indices 3 ~ 6 because these counts are common in real
+# models.
+foreach (index RANGE 3 6)
+  file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
+endforeach()
if(WITH_MKLDNN)
  pass_library(mkldnn_placement_pass base)
  pass_library(depthwise_conv_mkldnn_pass base)

@@ -1306,6 +1306,69 @@ PDNode *patterns::ConvAffineChannel::operator()(
  return ac_out_var;
}

+// a -> transpose_op(1) -> transpose_out_a -> flatten_op(1) -> flatten_out_a
+// b -> transpose_op(2) -> transpose_out_b -> flatten_op(2) -> flatten_out_b
+// ...
+// z -> transpose_op(n) -> transpose_out_z -> flatten_op(n) -> flatten_out_z
+// flatten_out_a -> concat_op, flatten_out_b -> concat_op, ...,
+// flatten_out_z -> concat_op
+PDNode *patterns::TransposeFlattenConcat::operator()(
+    std::vector<PDNode *> conv_in, int times) {
+  // `times` is the number of repetitions of the
+  // {trans, trans_out, flatten, flatten_out} group.
+  const int kNumFields = 4;
+  const int kTransOutOffset = 1;
+  const int kFlattenOffset = 2;
+  const int kFlattenOutOffset = 3;
+  std::vector<PDNode *> nodes;
+
+  for (int i = 0; i < times; i++) {
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("transpose" + std::to_string(i)))
+            ->assert_is_op("transpose2"));
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("transpose_out" + std::to_string(i)))
+            ->assert_is_op_output("transpose2")
+            ->assert_is_op_input("flatten2", "X")
+            ->AsIntermediate());
+    nodes.push_back(pattern->NewNode(GetNodeName("flatten" + std::to_string(i)))
+                        ->assert_is_op("flatten2"));
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("flatten_out" + std::to_string(i)))
+            ->assert_is_op_output("flatten2")
+            ->assert_is_op_nth_input("concat", "X", i)
+            ->AsIntermediate());
+  }
+
+  auto concat_op = pattern->NewNode(GetNodeName("concat"))
+                       ->assert_is_op("concat")
+                       ->assert_op_has_n_inputs("concat", times);
+  auto concat_out = pattern->NewNode(GetNodeName("concat_out"))
+                        ->assert_is_op_output("concat")
+                        ->AsOutput();
+
+  std::vector<PDNode *> flatten_outs;
+  for (int i = 0; i < times; i++) {
+    conv_in[i]->AsInput();
+    // trans
+    nodes[i * kNumFields]->LinksFrom({conv_in[i]});
+    // trans_out
+    nodes[i * kNumFields + kTransOutOffset]->LinksFrom({nodes[i * kNumFields]});
+    // flatten
+    nodes[i * kNumFields + kFlattenOffset]->LinksFrom(
+        {nodes[i * kNumFields + kTransOutOffset]});
+    // flatten_out
+    nodes[i * kNumFields + kFlattenOutOffset]->LinksFrom(
+        {nodes[i * kNumFields + kFlattenOffset]});
+    flatten_outs.push_back(nodes[i * kNumFields + kFlattenOutOffset]);
+  }
+
+  concat_op->LinksFrom(flatten_outs).LinksTo({concat_out});
+  return concat_out;
+}
+
}  // namespace ir
}  // namespace framework
}  // namespace paddle
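The pattern builder above packs four nodes per branch into one flat vector; the offset constants are all that keeps the layout straight. A minimal standalone check of that arithmetic (illustrative only, not part of this commit):

#include <cassert>
#include <string>
#include <vector>

int main() {
  // Mirrors the pattern's layout: {transpose, transpose_out, flatten,
  // flatten_out} repeated `times` times.
  const int kNumFields = 4, kTransOutOffset = 1, kFlattenOutOffset = 3;
  const int times = 2;
  std::vector<std::string> nodes;
  for (int i = 0; i < times; ++i) {
    for (const char* f :
         {"transpose", "transpose_out", "flatten", "flatten_out"}) {
      nodes.push_back(std::string(f) + std::to_string(i));
    }
  }
  // Branch i's field lives at i * kNumFields + offset.
  assert(nodes[1 * kNumFields + kTransOutOffset] == "transpose_out1");
  assert(nodes[1 * kNumFields + kFlattenOutOffset] == "flatten_out1");
  return 0;
}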

@@ -766,6 +766,21 @@ struct ConvAffineChannel : public PatternBase {
  PATTERN_DECL_NODE(ac_out);  // Out
};

+struct TransposeFlattenConcat : public PatternBase {
+  TransposeFlattenConcat(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "transpose_flatten_concat") {}
+
+  PDNode* operator()(std::vector<PDNode*> conv_inputs, int times);
+
+  std::string GetNodeName(const std::string& op_type) {
+    return PDNodeName(name_scope_, repr_, id_, op_type);
+  }
+
+  PDNode* GetPDNode(const std::string& op_type) {
+    return pattern->RetrieveNode(GetNodeName(op_type));
+  }
+};
+
}  // namespace patterns

// Link two ir::Nodes from each other.

@@ -50,7 +50,7 @@ PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern,
      // the other one should be unused empty var.
      if (is_nth_input_var_of_concat(x->outputs[0], idx)) {
        satisfied_all = satisfied_all && x->outputs[1]->IsVar() &&
-                        x->outputs[1]->outputs.size() == 0;
+                        x->outputs[1]->outputs.empty();
      } else {
        satisfied_all =
            satisfied_all && is_nth_input_var_of_concat(x->outputs[1], idx) &&

@@ -0,0 +1,148 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <string>
#include <vector>

#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h"

namespace paddle {
namespace framework {
namespace ir {

template <int times>
std::unique_ptr<ir::Graph> TransposeFlattenConcatFusePass<times>::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  const std::string pattern_name =
      "transpose_flatten" + std::to_string(times) + "_concat_fuse";
  FusePassBase::Init(pattern_name, graph.get());

  GraphPatternDetector gpd;
  std::vector<PDNode *> input_nodes;
  for (int i = 0; i < times; i++) {
    input_nodes.push_back(gpd.mutable_pattern()
                              ->NewNode("x" + std::to_string(i))
                              ->assert_is_op_input("transpose2", "X")
                              ->AsInput());
  }

  patterns::TransposeFlattenConcat pattern(gpd.mutable_pattern(), pattern_name);
  pattern(input_nodes, times);

  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                     Graph *g) {
    const int kNumFields = 5;
    const int kTransOffset = 1;
    const int kTransOutOffset = 2;
    const int kFlattenOffset = 3;
    const int kFlattenOutOffset = 4;
    std::vector<Node *> nodes;

    for (int i = 0; i < times; i++) {
      PADDLE_ENFORCE(
          subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))));
      PADDLE_ENFORCE(
          subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i))));
      PADDLE_ENFORCE(
          subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i))));
      PADDLE_ENFORCE(
          subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i))));
      PADDLE_ENFORCE(subgraph.at(input_nodes[i]));

      nodes.push_back(subgraph.at(input_nodes[i]));
      nodes.push_back(
          subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))));
      nodes.push_back(
          subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i))));
      nodes.push_back(
          subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i))));
      nodes.push_back(
          subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i))));
    }

    Node *concat_op = subgraph.at(pattern.GetPDNode("concat"));
    Node *concat_out = subgraph.at(pattern.GetPDNode("concat_out"));
    std::vector<std::string> input_names;
    std::vector<int> trans_axis = boost::get<std::vector<int>>(
        nodes[kTransOffset]->Op()->GetAttr("axis"));
    int flatten_axis =
        boost::get<int>(nodes[kFlattenOffset]->Op()->GetAttr("axis"));
    int concat_axis = boost::get<int>(concat_op->Op()->GetAttr("axis"));
    std::string output_name = concat_out->Name();

    for (int i = 0; i < times; i++) {
      input_names.push_back(nodes[i * kNumFields]->Name());
    }

    framework::OpDesc new_op_desc;
    new_op_desc.SetType("fusion_transpose_flatten_concat");
    new_op_desc.SetInput("X", input_names);
    new_op_desc.SetAttr("trans_axis", trans_axis);
    new_op_desc.SetAttr("flatten_axis", flatten_axis);
    new_op_desc.SetAttr("concat_axis", concat_axis);
    new_op_desc.SetOutput("Out", {output_name});
    new_op_desc.Flush();

    // Create a new node for the fused op.
    auto *new_conv_op = graph->CreateOpNode(&new_op_desc);

    std::unordered_set<const Node *> delete_nodes;

    for (int i = 0; i < times; i++) {
      nodes[i * kNumFields]->outputs.push_back(new_conv_op);
      new_conv_op->inputs.push_back(nodes[i * kNumFields]);
      delete_nodes.insert(nodes[i * kNumFields + kTransOffset]);
      delete_nodes.insert(nodes[i * kNumFields + kTransOutOffset]);
      delete_nodes.insert(nodes[i * kNumFields + kFlattenOffset]);
      delete_nodes.insert(nodes[i * kNumFields + kFlattenOutOffset]);
    }

    delete_nodes.insert(concat_op);

    new_conv_op->outputs.push_back(concat_out);
    concat_out->inputs.push_back(new_conv_op);

    // Delete the unneeded nodes.
    GraphSafeRemoveNodes(graph.get(), delete_nodes);
  };

  gpd(graph.get(), handler);
  return graph;
}

template class TransposeFlattenConcatFusePass<1>;
template class TransposeFlattenConcatFusePass<3>;
template class TransposeFlattenConcatFusePass<4>;
template class TransposeFlattenConcatFusePass<5>;
template class TransposeFlattenConcatFusePass<6>;

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(transpose_flatten_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<1>);
REGISTER_PASS(transpose_flatten3_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<3>);
REGISTER_PASS(transpose_flatten4_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<4>);
REGISTER_PASS(transpose_flatten5_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<5>);
REGISTER_PASS(transpose_flatten6_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<6>);
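Note that the handler in this pass walks a five-field layout per branch (the matched input node plus the four pattern nodes), unlike the four-field layout inside the pattern itself. A standalone sketch of the five-field indexing (illustrative only, not part of this commit):

#include <cassert>
#include <string>
#include <vector>

int main() {
  // Mirrors the handler's layout: {input, transpose, transpose_out,
  // flatten, flatten_out} repeated `times` times.
  const int kNumFields = 5, kFlattenOutOffset = 4;
  const int times = 3;
  std::vector<std::string> nodes;
  for (int i = 0; i < times; ++i) {
    for (const char* f :
         {"x", "transpose", "transpose_out", "flatten", "flatten_out"}) {
      nodes.push_back(std::string(f) + std::to_string(i));
    }
  }
  // Branch i's flatten_out sits at i * kNumFields + kFlattenOutOffset,
  // and the branch's input is at i * kNumFields.
  assert(nodes[2 * kNumFields] == "x2");
  assert(nodes[2 * kNumFields + kFlattenOutOffset] == "flatten_out2");
  return 0;
}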

@@ -0,0 +1,38 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"

namespace paddle {
namespace framework {
namespace ir {

// There may be many transpose-flatten structures in a model, and the outputs
// of these structures are used as inputs to the concat op. This pattern is
// detected by our pass. The template parameter `times` is the number of
// repetitions of this structure.
template <int times>
class TransposeFlattenConcatFusePass : public FusePassBase {
 public:
  virtual ~TransposeFlattenConcatFusePass() {}

 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle

@@ -391,7 +391,7 @@ class ExecutionContext {
    PADDLE_ENFORCE(
        dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
        "The AllocationPtr must be TemporaryAllocation.");
-    PADDLE_ENFORCE_GE(allocation_ptr->size(),
+    PADDLE_ENFORCE_EQ(allocation_ptr->size(),
                      framework::product(dim) * sizeof(T));

    paddle::framework::Tensor temp_tensor(

@@ -27,6 +27,8 @@
namespace paddle {
namespace imperative {

+std::map<int, py::object> py_funcs_;
+
using framework::Variable;

void AddTo(Variable* src, Variable* dst) {
@@ -55,6 +57,7 @@ class Autograd {
    if (var->stop_gradient_) {
      return;
    }
+    VLOG(3) << "start autograd";

    std::deque<OpBase*> ready;
    ready.push_back(var->pre_op_);
@@ -120,51 +123,57 @@ framework::LoDTensor& VarBase::GradValue() {
}

std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
-  if (!grad_op_desc_) {
+  if (!grad_op_desc_ && backward_id_ <= 0) {
    LOG(WARNING) << "op with no grad: " << op_desc_->Type();
    return {};
  }

-  VLOG(3) << "op grad " << grad_op_desc_->Type();
-
-  std::vector<std::unique_ptr<framework::Variable>> tmp_vars;
  std::map<std::string, std::vector<framework::Variable*>> grad_outputs;
-  for (auto it : grad_output_vars_) {
-    auto& outputs = grad_outputs[it.first];
-    for (size_t i = 0; i < it.second.size(); ++i) {
-      // Allocate a new variable
-      Variable* tmp_var = new framework::Variable();
-      tmp_var->GetMutable<framework::LoDTensor>();
-      tmp_vars.emplace_back(tmp_var);
-      outputs.push_back(tmp_var);
-    }
-  }
+  if (backward_id_ > 0) {
+    VLOG(3) << "py_layer_grad";
+    grad_outputs["Out@GRAD"] =
+        PyLayer::ApplyGrad(backward_id_, grad_input_vars_["X@GRAD"]);
+  } else {
+    VLOG(3) << "op grad " << grad_op_desc_->Type();
+    for (auto it : grad_output_vars_) {
+      auto& outputs = grad_outputs[it.first];
+      for (size_t i = 0; i < it.second.size(); ++i) {
+        // Allocate a new variable
+        Variable* tmp_var = new framework::Variable();
+        tmp_var->GetMutable<framework::LoDTensor>();
+        outputs.push_back(tmp_var);
+      }
+    }

-  framework::RuntimeContext ctx(grad_input_vars_, grad_outputs);
+    framework::RuntimeContext ctx(grad_input_vars_, grad_outputs);

-  // No need to do compile time infer shape here.
-  // grad_op_desc_->InferShape(*block_);
-  grad_op_desc_->InferVarType(block_);
+    // No need to do compile time infer shape here.
+    // grad_op_desc_->InferShape(*block_);
+    grad_op_desc_->InferVarType(block_);

-  std::unique_ptr<framework::OperatorBase> opbase =
-      framework::OpRegistry::CreateOp(*grad_op_desc_);
-  framework::OperatorWithKernel* op_kernel =
-      dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
-  PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
+    std::unique_ptr<framework::OperatorBase> opbase =
+        framework::OpRegistry::CreateOp(*grad_op_desc_);
+    framework::OperatorWithKernel* op_kernel =
+        dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
+    PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");

-  framework::Scope scope;
-  platform::CPUPlace place;
-  PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
-  p.op.RuntimeInferShape(scope, place, ctx);
-  p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
+    framework::Scope scope;
+    platform::CPUPlace place;
+    PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
+    p.op.RuntimeInferShape(scope, place, ctx);
+    p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
+  }

  for (auto it : grad_output_vars_) {
    auto& outputs = grad_outputs[it.first];
    auto& origin_outputs = it.second;
+    PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());

    for (size_t i = 0; i < outputs.size(); ++i) {
+      framework::Variable* grad = outputs[i];
      framework::Variable* orig_grad = origin_outputs[i];
-      AddTo(outputs[i], orig_grad);
+      AddTo(grad, orig_grad);
+      delete grad;
    }
  }
  return input_vars_;
@@ -173,6 +182,7 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {

void VarBase::RunBackward() {
  if (!pre_op_) return;
+  VLOG(3) << "start backward";

  auto grads_t = grads_->var_->GetMutable<framework::LoDTensor>();
  float* data = grads_t->mutable_data<float>(platform::CPUPlace());
  std::fill(data, data + grads_t->numel(), 1.0);
@@ -183,5 +193,65 @@ void VarBase::RunBackward() {
  Autograd().RunBackward(this);
}

+void PyLayer::RegisterFunc(int func_id, const py::object& py_func) {
+  py_funcs_[func_id] = py_func;
+}
+
+int PyLayer::NumFuncs() { return py_funcs_.size(); }
+
+std::vector<VarBase*> PyLayer::Apply(int func_id,
+                                     const std::vector<VarBase*>& inputs) {
+  std::vector<framework::Variable*> invars;
+  for (const VarBase* in : inputs) {
+    invars.push_back(in->var_);
+  }
+  PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end());
+  std::vector<Variable*> outvars = CallPythonFunc(py_funcs_[func_id], invars);
+  std::vector<VarBase*> ret;
+  for (Variable* v : outvars) {
+    ret.push_back(new VarBase(v, new VarBase(true)));
+  }
+  return ret;
+}
+
+std::vector<Variable*> PyLayer::ApplyGrad(
+    int func_id, const std::vector<framework::Variable*>& inputs) {
+  PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end());
+  return CallPythonFunc(py_funcs_[func_id], inputs);
+}
+
+std::vector<framework::Variable*> PyLayer::CallPythonFunc(
+    const py::object& callable, const std::vector<framework::Variable*>& ins) {
+  py::gil_scoped_acquire guard;
+  py::tuple in_args(ins.size());
+  for (size_t i = 0; i < ins.size(); ++i) {
+    const framework::LoDTensor& t = ins[i]->Get<framework::LoDTensor>();
+    in_args[i] = t.IsInitialized() ? py::cast(t) : py::cast(nullptr);
+  }
+  VLOG(3) << "pyfunc in " << py::len(in_args);
+
+  // TODO(panyx0718): Who owns the returned LoDTensor.
+  auto ret = callable(in_args);
+  auto ret_tuple = py::cast<py::tuple>(ret);
+  size_t ret_num = py::len(ret_tuple);
+  std::vector<framework::Variable*> outs;
+  VLOG(3) << "pyfunc out " << ret_num;
+  for (size_t i = 0; i < ret_num; ++i) {
+    try {
+      auto* py_out_tensor = py::cast<framework::LoDTensor*>(ret_tuple[i]);
+      PADDLE_ENFORCE_NOT_NULL(py_out_tensor,
+                              "Output tensor %d should not be nullptr", i);
+      auto* var = new framework::Variable();
+      auto* tensor = var->GetMutable<framework::LoDTensor>();
+      tensor->ShareDataWith(*py_out_tensor);
+      tensor->set_lod(py_out_tensor->lod());
+      outs.push_back(var);
+    } catch (py::cast_error&) {
+      PADDLE_THROW("The %d-th output must be LoDTensor", i);
+    }
+  }
+  return outs;
+}
+
}  // namespace imperative
}  // namespace paddle
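The registry underneath RegisterFunc/Apply is just a process-wide map from an integer id to a Python callable. A self-contained stand-in for that pattern, with std::function in place of py::object and plain vectors in place of LoDTensors (a sketch, not the PaddlePaddle API):

#include <cassert>
#include <functional>
#include <map>
#include <utility>
#include <vector>

using Tensor = std::vector<float>;  // stand-in for framework::LoDTensor
using PyFunc = std::function<std::vector<Tensor>(const std::vector<Tensor>&)>;

std::map<int, PyFunc> py_funcs_;  // analogous to the global in the diff above

void RegisterFunc(int func_id, PyFunc fn) {
  py_funcs_[func_id] = std::move(fn);
}

std::vector<Tensor> Apply(int func_id, const std::vector<Tensor>& inputs) {
  // Mirrors PyLayer::Apply: look up the registered callable and invoke it.
  assert(py_funcs_.find(func_id) != py_funcs_.end());
  return py_funcs_[func_id](inputs);
}

int main() {
  RegisterFunc(0, [](const std::vector<Tensor>& ins) {
    return ins;  // identity "forward" standing in for a Python function
  });
  auto outs = Apply(0, {{1.0f, 2.0f}});
  assert(outs.size() == 1 && outs[0][1] == 2.0f);
  return 0;
}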

@@ -22,12 +22,15 @@
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/enforce.h"
+#include "pybind11/pybind11.h"

#include "paddle/fluid/imperative/type_defs.h"

namespace paddle {
namespace imperative {

+namespace py = ::pybind11;
+
class PreparedOp {
 public:
  PreparedOp(const framework::OperatorBase& op,
@@ -90,16 +93,21 @@ class OpBase;
 */
class VarBase {
 public:
-  VarBase()
+  VarBase() : VarBase(new framework::Variable(), new VarBase(true)) {}
+
+  // Owns `var` and `grad`
+  VarBase(framework::Variable* var, VarBase* grad)
      : pre_op_(nullptr),
+        pre_op_out_name_(),
        pre_op_out_idx_(-1),
        var_desc_(nullptr),
-        var_(new framework::Variable()),
-        grads_(new VarBase(true)),
+        var_(var),
+        grads_(grad),
        stop_gradient_(false) {}

  explicit VarBase(bool stop_gradient)
      : pre_op_(nullptr),
+        pre_op_out_name_(),
        pre_op_out_idx_(-1),
        var_desc_(nullptr),
        var_(new framework::Variable()),
@@ -144,7 +152,11 @@ class VarBase {
 */
class OpBase {
 public:
-  OpBase() : op_desc_(nullptr), grad_op_desc_(nullptr) {}
+  OpBase()
+      : op_desc_(nullptr),
+        forward_id_(-1),
+        grad_op_desc_(nullptr),
+        backward_id_(-1) {}

  virtual ~OpBase() {
    if (grad_op_desc_) delete grad_op_desc_;
@@ -152,8 +164,14 @@ class OpBase {
  std::map<std::string, std::vector<VarBase*>> ApplyGrad();

+  // One of `op_desc_` or `forward_id_` is set, not both.
+  // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_.
  framework::OpDesc* op_desc_;
+  int forward_id_;
+  // When there is a backward, one of `grad_op_desc_` or `backward_id_` is
+  // set, not both.
  framework::OpDesc* grad_op_desc_;
+  int backward_id_;

  VarBasePtrMap input_vars_;
  VarBasePtrMap output_vars_;
@@ -173,8 +191,25 @@ class Layer {
    std::vector<VarBase> vars;
    return vars;
  }
+};

-  virtual void Backward() { LOG(ERROR) << "To support customize"; }
+class PyLayer {
+ public:
+  virtual ~PyLayer() {}
+
+  static void RegisterFunc(int func_id, const py::object& py_func);
+
+  static int NumFuncs();
+
+  static std::vector<VarBase*> Apply(int func_id,
+                                     const std::vector<VarBase*>& inputs);
+
+  static std::vector<framework::Variable*> ApplyGrad(
+      int func_id, const std::vector<framework::Variable*>& inputs);
+
+ private:
+  static std::vector<framework::Variable*> CallPythonFunc(
+      const py::object& callable, const std::vector<framework::Variable*>& ins);
};

}  // namespace imperative

@@ -115,8 +115,10 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
  if (!stop_gradient) {
    framework::OpDesc* grad_op_desc;
-    auto grad_to_var = new std::unordered_map<std::string, std::string>();
-    CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
+    // TODO(panyx): Is this leaked?
+    std::unique_ptr<std::unordered_map<std::string, std::string>> grad_to_var(
+        new std::unordered_map<std::string, std::string>());
+    CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var.get());
    op->grad_op_desc_ = grad_op_desc;

    for (auto it : grad_op_desc->Inputs()) {
@@ -127,13 +129,15 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
        if (var_it == grad_to_var->end()) {
          auto fwd_var_it = vars.find(grad_invar);
          PADDLE_ENFORCE(fwd_var_it != vars.end());
+          // Forward inputs or outputs.
          grad_in_vars.push_back(fwd_var_it->second->var_);
        } else {
          VarBase* var = vars[var_it->second];
-          if (!var->grads_->var_->IsInitialized()) {
-            InitVar(var->var_, var->grads_->var_);
+          if (!var->grads_->IsInitialized()) {
+            InitVar(var->var_, var->grads_);
          }
-          grad_in_vars.push_back(var->grads_->var_);
+          // Douts.
+          grad_in_vars.push_back(var->grads_);
        }
      }
    }
@@ -145,10 +149,10 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
        auto var_it = grad_to_var->find(grad_outvar);
        PADDLE_ENFORCE(var_it != grad_to_var->end());
        VarBase* var = vars[var_it->second];
-        if (!var->grads_->var_->IsInitialized()) {
-          InitVar(var->var_, var->grads_->var_);
+        if (!var->grads_->IsInitialized()) {
+          InitVar(var->var_, var->grads_);
        }
-        grad_out_vars.push_back(var->grads_->var_);
+        grad_out_vars.push_back(var->grads_);
      }
    }
  }
@@ -156,5 +160,54 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
  op->block_ = block;
}

+std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
+                                      const std::vector<VarBase*>& inputs,
+                                      bool stop_gradient) {
+  VLOG(3) << "py_trace";
+  op->input_vars_["X"] = inputs;
+  op->output_vars_["Out"] = PyLayer::Apply(op->forward_id_, inputs);
+  for (VarBase* inp : inputs) {
+    if (inp->pre_op_) {
+      op->pre_ops_["X"].push_back(inp->pre_op_);
+      op->pre_ops_out_idx_["X"].push_back(inp->pre_op_out_idx_);
+    } else {
+      op->pre_ops_["X"].push_back(nullptr);
+    }
+  }
+
+  auto& outputs = op->output_vars_["Out"];
+  for (size_t i = 0; i < outputs.size(); ++i) {
+    VarBase* out = outputs[i];
+    out->stop_gradient_ = stop_gradient;
+    out->pre_op_ = op;
+    out->pre_op_out_name_ = "Out";
+    out->pre_op_out_idx_ = i;
+  }
+
+  if (!stop_gradient) {
+    auto& grad_input_vars = op->grad_input_vars_["X@GRAD"];
+    auto& grad_output_vars = op->grad_output_vars_["Out@GRAD"];
+
+    for (const VarBase* inp : inputs) {
+      grad_input_vars.push_back(inp->var_);
+    }
+    for (VarBase* out : outputs) {
+      grad_input_vars.push_back(out->var_);
+    }
+    for (VarBase* out : outputs) {
+      grad_input_vars.push_back(out->grads_);
+      if (!grad_input_vars.back()->IsInitialized()) {
+        InitVar(out->var_, grad_input_vars.back());
+      }
+    }
+    for (const VarBase* inp : inputs) {
+      grad_output_vars.push_back(inp->grads_);
+      if (!grad_output_vars.back()->IsInitialized()) {
+        InitVar(inp->var_, grad_output_vars.back());
+      }
+    }
+  }
+  return outputs;
+}
+
}  // namespace imperative
}  // namespace paddle
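The slot ordering PyTrace establishes is implicit in the loops above: "X@GRAD" receives the forward inputs, then the forward outputs, then the output gradients, while "Out@GRAD" collects the input gradients. A sketch with strings standing in for Variables (assumption: it mirrors the loops above):

#include <cassert>
#include <string>
#include <vector>

int main() {
  const std::vector<std::string> inputs = {"x0", "x1"};
  const std::vector<std::string> outputs = {"y0"};

  // Mirrors Tracer::PyTrace: grad inputs are the forward inputs, then the
  // forward outputs, then the gradients of the outputs.
  std::vector<std::string> grad_input_vars;
  for (const auto& in : inputs) grad_input_vars.push_back(in);
  for (const auto& out : outputs) grad_input_vars.push_back(out);
  for (const auto& out : outputs) grad_input_vars.push_back("d" + out);
  assert((grad_input_vars ==
          std::vector<std::string>{"x0", "x1", "y0", "dy0"}));

  // Grad outputs are the gradients of the forward inputs.
  std::vector<std::string> grad_output_vars;
  for (const auto& in : inputs) grad_output_vars.push_back("d" + in);
  assert((grad_output_vars == std::vector<std::string>{"dx0", "dx1"}));
  return 0;
}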

@@ -45,6 +45,9 @@ class Tracer {
             const std::map<std::string, std::vector<VarBase*>>& outputs,
             framework::BlockDesc* block, const bool stop_gradient = false);

+  std::vector<VarBase*> PyTrace(OpBase* op, const std::vector<VarBase*>& inputs,
+                                bool stop_gradient = false);
+
 private:
  framework::BlockDesc* root_block_;
};

@@ -127,6 +127,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
  use_tensorrt_ = true;
  tensorrt_workspace_size_ = workspace_size;
  tensorrt_max_batchsize_ = max_batch_size;
+  Update();
}

void contrib::AnalysisConfig::Update() {

@@ -128,8 +128,8 @@ else()
        ${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf
        ${CMAKE_STATIC_LIBRARY_PREFIX}snappy ${CMAKE_STATIC_LIBRARY_PREFIX}z ${CMAKE_STATIC_LIBRARY_PREFIX}xxhash
        snappystream ${EXTERNAL_LIB})
-  # NOTE(dzhwinter) shlwapi is deprecated.
-  set(DEPS ${DEPS} libcmt shlwapi)
+  get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+  set(DEPS ${DEPS} libcmt ${os_dependency_modules})
endif(NOT WIN32)

if(WITH_GPU)

@@ -141,6 +141,10 @@ class GpuPassStrategy : public PassStrategy {
        "conv_elementwise_add_fuse_pass",  //
    });

+    for (int i = 6; i >= 3; i--) {
+      passes_.push_back("transpose_flatten" + std::to_string(i) +
+                        "_concat_fuse_pass");
+    }
    use_gpu_ = true;
  }

@@ -39,6 +39,7 @@ class ElementwiseWeightOpConverter : public OpConverter {
                  const framework::Scope& scope, bool test_mode) override {
    // Here the two nullptr looks strange, that's because the
    // framework::OpDesc's constructor is strange.
+    nvinfer1::ILayer* layer = nullptr;
    framework::OpDesc op_desc(op, nullptr);
    VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer";
@@ -98,13 +99,21 @@ class ElementwiseWeightOpConverter : public OpConverter {
                                          0};
    TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
                                         0};
+    if (op_type_ == "add") {
+      nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
+          engine_, Scale, *X, scale_mode, shift_weights.get(),
+          scale_weights.get(), power_weights.get());
+      layer = scale_layer;
+    } else if (op_type_ == "mul") {
+      nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
+          engine_, Scale, *X, scale_mode, scale_weights.get(),
+          shift_weights.get(), power_weights.get());
+      layer = scale_layer;
+    }
-    nvinfer1::IScaleLayer* layer = TRT_ENGINE_ADD_LAYER(
-        engine_, Scale, *const_cast<nvinfer1::ITensor*>(X), scale_mode,
-        shift_weights.get(), scale_weights.get(), power_weights.get());

    auto output_name = op_desc.Output("Out")[0];
-    layer->setName(("elementwise_add (Output: " + output_name + ")").c_str());
+    layer->setName(
+        ("elementwise_" + op_type_ + "(Output: " + output_name + ")").c_str());
    layer->getOutput(0)->setName(output_name.c_str());
    engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor);
    engine_->SetITensor(output_name, layer->getOutput(0));
@@ -113,6 +122,9 @@ class ElementwiseWeightOpConverter : public OpConverter {
      engine_->DeclareOutput(output_name);
    }
  }

+ protected:
+  std::string op_type_;
};

class ElementwiseTensorOpConverter : public OpConverter {
@@ -188,6 +200,16 @@ const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
        {"max", nvinfer1::ElementWiseOperation::kMAX},
};

+class ElementwiseWeightAddOpConverter : public ElementwiseWeightOpConverter {
+ public:
+  ElementwiseWeightAddOpConverter() { op_type_ = "add"; }
+};
+
+class ElementwiseWeightMulOpConverter : public ElementwiseWeightOpConverter {
+ public:
+  ElementwiseWeightMulOpConverter() { op_type_ = "mul"; }
+};
+
class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter {
 public:
  ElementwiseTensorAddOpConverter() { op_type_ = "add"; }
@@ -227,7 +249,10 @@ class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter {
}  // namespace inference
}  // namespace paddle

-REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, ElementwiseWeightOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_add_weight,
+                          ElementwiseWeightAddOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_mul_weight,
+                          ElementwiseWeightMulOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor,
                          ElementwiseTensorAddOpConverter);
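The add/mul split above only changes which weight slot receives Y: TensorRT's IScaleLayer computes pow(x * scale + shift, power), so elementwise_add routes Y into the shift slot and elementwise_mul routes it into the scale slot. A scalar model of that formula (conceptual sketch, not the TensorRT API):

#include <cmath>
#include <cstdio>

// Conceptual model of TensorRT's IScaleLayer: out = pow(x * scale + shift, power).
float scale_layer(float x, float scale, float shift, float power) {
  return std::pow(x * scale + shift, power);
}

int main() {
  const float x = 2.0f, y = 3.0f;
  // elementwise_add: Y goes into the shift slot (scale = 1, power = 1).
  std::printf("add: %f\n", scale_layer(x, 1.0f, y, 1.0f));  // 5
  // elementwise_mul: Y goes into the scale slot (shift = 0, power = 1).
  std::printf("mul: %f\n", scale_layer(x, y, 0.0f, 1.0f));  // 6
  return 0;
}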

@@ -2,6 +2,3 @@ cc_library(benchmark SRCS benchmark.cc DEPS enforce)
cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
cc_binary(visualizer SRCS visualizer.cc DEPS analysis
          paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes)
-if(WIN32)
-  target_link_libraries(visualizer shlwapi)
-endif(WIN32)

@@ -137,6 +137,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
    // ------------------- cudnn conv algorithm ---------------------
    cudnnConvolutionFwdAlgo_t algo;
    auto handle = dev_ctx.cudnn_handle();
+    auto workspace_handle = dev_ctx.cudnn_workspace_handle();

    bool half_float = false;
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
@@ -157,8 +158,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
      VLOG(5) << "NOT use cudnn_tensor_op_math";
    }
#endif
-    Tensor cudnn_workspace;
-    void* cudnn_workspace_ptr = nullptr;

    auto x_dims = framework::vectorize(input->dims());
    auto f_dims = framework::vectorize(filter->dims());
@@ -181,26 +180,21 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
                .Var(kCUDNNFwdAlgoCache)
                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
      }
-      cudnn_workspace =
-          ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
-              framework::make_ddim(
-                  {static_cast<int64_t>(workspace_size_limit)}),
-              dev_ctx);
-      cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
      algo = algo_cache->GetAlgorithm(
          x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
            int returned_algo_count;
            std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
                fwd_perf_stat;
+            auto cudnn_find_func = [&](void* cudnn_workspace) {
              CUDNN_ENFORCE(
                  platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
                      handle, cudnn_input_desc, input_data, cudnn_filter_desc,
                      filter_data, cudnn_conv_desc, cudnn_output_desc,
                      output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
-                      fwd_perf_stat.data(), cudnn_workspace_ptr,
+                      fwd_perf_stat.data(), cudnn_workspace,
                      workspace_size_limit));
+            };
+            workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);

            VLOG(3) << "Perf result: (algo: stat, time, memory)";
            for (int i = 0; i < returned_algo_count; ++i) {
@@ -225,23 +219,17 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
    PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
                      "workspace_size to be allocated exceeds the limit");

-    // Allocate on GPU memory
-    if (!cudnn_workspace_ptr) {
-      cudnn_workspace =
-          ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
-              framework::make_ddim(
-                  {static_cast<int64_t>(workspace_size_in_bytes)}),
-              dev_ctx);
-      cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
-    }
    // ------------------- cudnn conv forward ---------------------
    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
    for (int i = 0; i < groups; i++) {
-      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-          handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
-          cudnn_filter_desc, filter_data + i * group_offset_filter,
-          cudnn_conv_desc, algo, cudnn_workspace_ptr, workspace_size_in_bytes,
-          &beta, cudnn_output_desc, output_data + i * group_offset_out));
+      auto cudnn_func = [&](void* cudnn_workspace) {
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
+            handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
+            cudnn_filter_desc, filter_data + i * group_offset_filter,
+            cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
+            &beta, cudnn_output_desc, output_data + i * group_offset_out));
+      };
+      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
    }
  }
};
@@ -365,20 +353,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
      workspace_size_limit = max_user_size * 1024 * 1024;
    }

-    Tensor cudnn_workspace;
-    void* cudnn_workspace_ptr = nullptr;
-    if ((input_data || filter_data) && exhaustive_search) {
-      cudnn_workspace =
-          ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
-              framework::make_ddim(
-                  {static_cast<int64_t>(workspace_size_limit)}),
-              dev_ctx);
-      cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
-    }
-
    auto x_dims = framework::vectorize(input->dims());
    auto f_dims = framework::vectorize(filter->dims());
    auto handle = dev_ctx.cudnn_handle();
+    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
    if (input_grad) {
      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
      if (exhaustive_search) {
@@ -396,22 +374,25 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                  ->GetMutable<
                      AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
        }
        data_algo = data_algo_cache->GetAlgorithm(
            x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
              int returned_algo_count;
              std::array<cudnnConvolutionBwdDataAlgoPerf_t,
                         kNUM_CUDNN_BWD_DATA_ALGS>
                  data_perf_stat;
-              CUDNN_ENFORCE(platform::dynload::
-                                cudnnFindConvolutionBackwardDataAlgorithmEx(
-                                    handle, cudnn_filter_desc, filter_data,
-                                    cudnn_output_grad_desc, output_grad_data,
-                                    cudnn_conv_desc, cudnn_input_desc,
-                                    input_grad_data, kNUM_CUDNN_BWD_DATA_ALGS,
-                                    &returned_algo_count, data_perf_stat.data(),
-                                    cudnn_workspace_ptr, workspace_size_limit));
+              auto cudnn_find_bd_data_func = [&](void* cudnn_workspace) {
+                CUDNN_ENFORCE(
+                    platform::dynload::
+                        cudnnFindConvolutionBackwardDataAlgorithmEx(
+                            handle, cudnn_filter_desc, filter_data,
+                            cudnn_output_grad_desc, output_grad_data,
+                            cudnn_conv_desc, cudnn_input_desc, input_grad_data,
+                            kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
+                            data_perf_stat.data(), cudnn_workspace,
+                            workspace_size_limit));
+              };
+              workspace_handle.RunFunc(cudnn_find_bd_data_func,
+                                       workspace_size_limit);

              VLOG(3) << "Perf result: (algo: stat, time, memory)";
              for (int i = 0; i < returned_algo_count; ++i) {
@@ -462,23 +443,25 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                  ->GetMutable<
                      AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
        }
        filter_algo = f_algo_cache->GetAlgorithm(
            x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
              int returned_algo_count;
              std::array<cudnnConvolutionBwdFilterAlgoPerf_t,
                         kNUM_CUDNN_BWD_FILTER_ALGS>
                  filter_perf_stat;
-              CUDNN_ENFORCE(
-                  platform::dynload::
-                      cudnnFindConvolutionBackwardFilterAlgorithmEx(
-                          handle, cudnn_input_desc, input_data,
-                          cudnn_output_grad_desc, output_grad_data,
-                          cudnn_conv_desc, cudnn_filter_desc, filter_grad_data,
-                          kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count,
-                          filter_perf_stat.data(), cudnn_workspace_ptr,
-                          workspace_size_limit));
+              auto cudnn_find_bd_f_func = [&](void* cudnn_workspace) {
+                CUDNN_ENFORCE(
+                    platform::dynload::
+                        cudnnFindConvolutionBackwardFilterAlgorithmEx(
+                            handle, cudnn_input_desc, input_data,
+                            cudnn_output_grad_desc, output_grad_data,
+                            cudnn_conv_desc, cudnn_filter_desc,
+                            filter_grad_data, kNUM_CUDNN_BWD_FILTER_ALGS,
+                            &returned_algo_count, filter_perf_stat.data(),
+                            cudnn_workspace, workspace_size_limit));
+              };
+              workspace_handle.RunFunc(cudnn_find_bd_f_func,
+                                       workspace_size_limit);
              return filter_perf_stat[0].algo;
            });
        VLOG(3) << "cuDNN backward filter algo " << filter_algo;
@@ -499,16 +482,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
      workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
    }

-    // ------------------- cudnn conv workspace ---------------------
-    if (!cudnn_workspace_ptr) {
-      cudnn_workspace =
-          ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
-              framework::make_ddim(
-                  {static_cast<int64_t>(workspace_size_in_bytes)}),
-              dev_ctx);
-      cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
-    }
-
    // ------------------- cudnn conv backward data ---------------------
    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
    if (input_grad) {
@@ -516,12 +489,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
      // Because beta is zero, it is unnecessary to reset input_grad.
      for (int i = 0; i < groups; i++) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-            handle, &alpha, cudnn_filter_desc,
-            filter_data + i * group_offset_filter, cudnn_output_grad_desc,
-            output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
-            cudnn_workspace_ptr, workspace_size_in_bytes, &beta,
-            cudnn_input_desc, input_grad_data + i * group_offset_in));
+        auto cudnn_func = [&](void* cudnn_workspace) {
+          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+              handle, &alpha, cudnn_filter_desc,
+              filter_data + i * group_offset_filter, cudnn_output_grad_desc,
+              output_grad_data + i * group_offset_out, cudnn_conv_desc,
+              data_algo, cudnn_workspace, workspace_size_in_bytes, &beta,
+              cudnn_input_desc, input_grad_data + i * group_offset_in));
+        };
+        workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
      }
    }
    // ------------------- cudnn conv backward filter ---------------------
@@ -529,12 +505,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
      T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
      // Because beta is zero, it is unnecessary to reset filter_grad.
      for (int i = 0; i < groups; i++) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-            handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
-            cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
-            cudnn_conv_desc, filter_algo, cudnn_workspace_ptr,
-            workspace_size_in_bytes, &beta, cudnn_filter_desc,
-            filter_grad_data + i * group_offset_filter));
+        auto cudnn_func = [&](void* cudnn_workspace) {
+          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+              handle, &alpha, cudnn_input_desc,
+              input_data + i * group_offset_in, cudnn_output_grad_desc,
+              output_grad_data + i * group_offset_out, cudnn_conv_desc,
+              filter_algo, cudnn_workspace, workspace_size_in_bytes, &beta,
+              cudnn_filter_desc, filter_grad_data + i * group_offset_filter));
+        };
+        workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
      }
    }
  }
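The recurring refactor in this file swaps hand-allocated temporary workspace tensors for dev_ctx.cudnn_workspace_handle(), whose RunFunc lends a buffer to a callback for the duration of a single cuDNN call. A simplified stand-in for that pattern (assumption: not the real CudnnWorkspaceHandle API):

#include <cstddef>
#include <functional>
#include <vector>

// The handle owns a reusable buffer and lends it to a callback for exactly
// one call, so kernels no longer allocate temp tensors themselves.
class WorkspaceHandle {
 public:
  void RunFunc(const std::function<void(void*)>& fn, size_t required_bytes) {
    if (buf_.size() < required_bytes) buf_.resize(required_bytes);
    fn(buf_.data());  // the workspace pointer is only valid inside the callback
  }

 private:
  std::vector<unsigned char> buf_;
};

int main() {
  WorkspaceHandle handle;
  // Analogous to the cudnn_func lambdas above: the callback receives the
  // workspace pointer instead of capturing a long-lived allocation.
  handle.RunFunc([](void* workspace) { /* use workspace here */ }, 1024);
  return 0;
}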

File diff suppressed because it is too large.

@@ -0,0 +1,35 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class DataNormKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override;
};

template <typename DeviceContext, typename T>
class DataNormGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override;
};

}  // namespace operators
}  // namespace paddle

@@ -52,11 +52,11 @@ struct BenchFunc {
    for (int i = 0; i < FLAGS_burning; ++i) {
      tgt(args...);
    }
-    auto start = paddle::platform::PosixInNsec() / 1e-3;
+    auto start = paddle::platform::PosixInNsec() * 1e-3;
    for (int i = 0; i < FLAGS_repeat; ++i) {
      tgt(args...);
    }
-    auto end = paddle::platform::PosixInNsec() / 1e-3;
+    auto end = paddle::platform::PosixInNsec() * 1e-3;
    return static_cast<double>(end - start) / FLAGS_repeat;
  }
};
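The direction of this fix is easy to misread: PosixInNsec() returns nanoseconds, and dividing by 1e-3 multiplies by 1000, while multiplying by 1e-3 yields the intended microseconds. A one-file sanity check of the arithmetic:

#include <cassert>

int main() {
  // PosixInNsec() returns nanoseconds.
  const double ns = 2000000.0;         // 2 ms expressed in nanoseconds
  const double old_value = ns / 1e-3;  // old code: multiplies by 1000
  const double microsec = ns * 1e-3;   // fixed code: nanoseconds -> microseconds
  assert(old_value > 1.9e9);           // wildly off (about 2e9)
  assert(microsec > 1999.0 && microsec < 2001.0);  // 2000 us == 2 ms
  return 0;
}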

@ -0,0 +1,162 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/teacher_student_sigmoid_loss_op.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
class TeacherStudentSigmoidLossOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto label_dims = ctx->GetInputDim("Label");
PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2.");
PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
"Input(Label)'s rank should be 2.");
PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
"The 1st dimension of Input(X) and Input(Label) should "
"be equal.");
PADDLE_ENFORCE_EQ(label_dims[1], 1UL,
"The 2nd dimension of "
"Input(Label) should be 1.");
ctx->SetOutputDim("Y", {x_dims[0], 1});
ctx->ShareLoD("X", /*->*/ "Y");
}
protected:
// Explicitly set the data type of the computation kernel of
// teacher_student_sigmoid_loss to be determined by its input "X".
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.device_context());
}
};
class TeacherStudentSigmoidLossGradientOp
: public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
"Input(Y@GRAD) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@GRAD) should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto label_dims = ctx->GetInputDim("Label");
auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2.");
PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2.");
PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
"The 1st dimension of Input(X) and Input(Label) should "
"be equal.");
PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0],
"The 1st dimension of Input(X) and Input(Y@Grad) should "
"be equal.");
PADDLE_ENFORCE_EQ(dy_dims[1], 1,
"The 2nd dimension of Input(Y@Grad) should be 1.");
PADDLE_ENFORCE_EQ(label_dims[1], 1,
"When Attr(soft_label) == false, the 2nd dimension of "
"Input(Label) should be 1.");
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("X", framework::GradVarName("X"));
}
protected:
// Explicitly set the data type of the computation kernel of
// teacher_student_sigmoid_loss to be determined by its input "X".
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.device_context());
}
};
class TeacherStudentSigmoidLossOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor, default Tensor<float>), a 2-D tensor with shape [N x 1],"
" where N is the batch size and D is the output. "
"This input is a probability computed by the previous operator, "
"which is almost always the result of a softmax operator.");
AddInput("Label",
"(Tensor), the ground truth which is a 2-D tensor. "
"Label is a Tensor<float> with shape [N x 1]. ");
AddOutput("Y",
"(Tensor, default Tensor<float>), a 2-D tensor with shape "
"[N x 1]. The teacher student sigmoid loss.");
AddAttr<float>(
"soft_max_up_bound",
"fp32, if input > soft_max_up_bound, will be bound, default 15.0")
.SetDefault(15.0);
AddAttr<float>(
"soft_max_lower_bound",
"fp32, if input < soft_max_lower_bound, will be bound, default -15.0")
.SetDefault(-15.0);
AddComment(R"DOC(
TeacherStudentSigmoidLoss Operator.
It's similarity to SigmoidCrossEntropyWithLogits Operator. The difference is that
we add another label(z') to original.
loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x)))
z is click or not
z' is teacher value
label = {-2, -1, [0, 2]}
when z' is not exist, clk = 0 : label = -2;
when z' is not exist, clk = 1 : label = -1;
when z' is exist , clk = 0 : label = 0 + z';
when z' is exist , clk = 1 : label = 1 + z';
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(teacher_student_sigmoid_loss,
ops::TeacherStudentSigmoidLossOp,
ops::TeacherStudentSigmoidLossOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(teacher_student_sigmoid_loss_grad,
ops::TeacherStudentSigmoidLossGradientOp);
REGISTER_OP_CPU_KERNEL(teacher_student_sigmoid_loss,
ops::TeacherStudentSigmoidLossOpKernel<float>,
ops::TeacherStudentSigmoidLossOpKernel<double>);
REGISTER_OP_CPU_KERNEL(teacher_student_sigmoid_loss_grad,
ops::TeacherStudentSigmoidLossGradOpKernel<float>,
ops::TeacherStudentSigmoidLossGradOpKernel<double>);

@ -0,0 +1,118 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
class TeacherStudentSigmoidLossOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
Tensor* y = context.Output<Tensor>("Y");
const Tensor* x = context.Input<Tensor>("X");
const Tensor* labels = context.Input<Tensor>("Label");
T* y_data = y->mutable_data<T>(context.GetPlace());
const T* x_data = x->data<T>();
const T* label_data = labels->data<T>();
int64_t batch_size = x->dims()[0];
// loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' +
// log(1 + exp(-abs(x)))
// z is the click label (0 or 1)
// z' is the teacher value q from feed_fine
// label = {-2, -1, [0, 2]}
// when z' does not exist and clk = 0: label = -2;
// when z' does not exist and clk = 1: label = -1;
// when z' exists and clk = 0: label = 0 + z';
// when z' exists and clk = 1: label = 1 + z';
for (int i = 0; i < batch_size; ++i) {
if (label_data[i] < -1.0) {
y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) +
log(1.0 + exp(-fabs(x_data[i])));
} else if (label_data[i] < 0.0) {
y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] +
log(1.0 + exp(-fabs(x_data[i])));
} else if (label_data[i] < 1.0) {
y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) +
log(1.0 + exp(-fabs(x_data[i]))) +
(x_data[i] > 0 ? x_data[i] : 0.0) -
x_data[i] * label_data[i] +
log(1.0 + exp(-fabs(x_data[i])));
} else {
y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] +
log(1.0 + exp(-fabs(x_data[i]))) +
(x_data[i] > 0 ? x_data[i] : 0.0) -
x_data[i] * (label_data[i] - 1.0) +
log(1.0 + exp(-fabs(x_data[i])));
}
}
}
};
template <typename T>
class TeacherStudentSigmoidLossGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* x = context.Input<Tensor>("X");
const T* x_data = x->data<T>();
Tensor* dx = context.Output<Tensor>(framework::GradVarName("X"));
T* dx_data = dx->mutable_data<T>(context.GetPlace());
const Tensor* labels = context.Input<Tensor>("Label");
const T* label_data = labels->data<T>();
T soft_max_up_bound =
static_cast<T>(context.Attr<float>("soft_max_up_bound"));
T soft_max_lower_bound =
static_cast<T>(context.Attr<float>("soft_max_lower_bound"));
int64_t batch_size = x->dims()[0];
const framework::Tensor* dOut =
context.Input<framework::Tensor>(framework::GradVarName("Y"));
const T* dout_data = dOut->data<T>();
for (int i = 0; i < batch_size; ++i) {
T sum_val = x_data[i];
if (sum_val > soft_max_up_bound) {
sum_val = soft_max_up_bound;
} else {
if (sum_val < soft_max_lower_bound) {
sum_val = soft_max_lower_bound;
}
}
T pred = 1.0 / (1.0 + exp(-sum_val));
if (label_data[i] < -1.0) {
dx_data[i] = 0.0 - pred;
} else if (label_data[i] < 0.0) {
dx_data[i] = 1.0 - pred;
} else {
dx_data[i] = label_data[i] - 2.0 * pred;
}
if (sum_val >= soft_max_up_bound || sum_val <= soft_max_lower_bound) {
dx_data[i] = 0;
}
dx_data[i] *= dout_data[i] * -1;
}
}
};
} // namespace operators
} // namespace paddle
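The backward kernel above does four things per sample: clip the logit, take its sigmoid, pick a gradient branch by label, and zero the gradient when the clip saturates. A standalone sketch of that same per-sample logic (naming is mine, not Paddle's):

#include <algorithm>
#include <cmath>
#include <cstdio>

// Mirrors TeacherStudentSigmoidLossGradOpKernel for a single sample.
double teacher_student_grad(double x, double label, double dout,
                            double upper = 15.0, double lower = -15.0) {
  double sum_val = std::min(std::max(x, lower), upper);  // clip the logit
  double pred = 1.0 / (1.0 + std::exp(-sum_val));        // sigmoid of clipped input
  double dx;
  if (label < -1.0) {
    dx = -pred;               // no teacher, clk = 0
  } else if (label < 0.0) {
    dx = 1.0 - pred;          // no teacher, clk = 1
  } else {
    dx = label - 2.0 * pred;  // teacher present
  }
  if (sum_val >= upper || sum_val <= lower) dx = 0.0;    // saturated: no gradient
  return dx * dout * -1.0;    // same sign convention as the kernel
}

int main() {
  std::printf("dx = %.6f\n", teacher_student_grad(0.5, 0.3, 1.0));
  return 0;
}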

Some files were not shown because too many files have changed in this diff.
