Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into optimize-opyreader

revert-13637-optimize-opyreader
qiaolongfei 7 years ago
commit 85ddb5c76e

@ -1,11 +1,9 @@
add_custom_target(paddle_apis ALL add_custom_target(paddle_apis ALL
DEPENDS paddle_v2_apis paddle_fluid_apis) DEPENDS paddle_v2_apis)
add_custom_target(paddle_docs ALL add_custom_target(paddle_docs ALL
DEPENDS paddle_v2_docs paddle_v2_docs_cn DEPENDS paddle_v2_docs paddle_v2_docs_cn
paddle_fluid_docs paddle_fluid_docs_cn
paddle_mobile_docs paddle_mobile_docs_cn) paddle_mobile_docs paddle_mobile_docs_cn)
add_subdirectory(v2) add_subdirectory(v2)
add_subdirectory(fluid)
add_subdirectory(mobile) add_subdirectory(mobile)

@ -153,6 +153,13 @@ paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'out', 'axis', 'use_
paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0))
paddle.fluid.layers.gaussian_random ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32', False))
paddle.fluid.layers.sampling_id ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32'))
paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32'))
paddle.fluid.layers.sum ArgSpec(args=['x', 'use_mkldnn'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.slice ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.shape ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
@ -224,13 +231,6 @@ paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwarg
paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.gaussian_random ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sampling_id ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
@ -298,6 +298,7 @@ paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs
paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False)) paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)

@ -20,79 +20,37 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
// Change it to thread safe flags if needed. template <class T>
class ThreadUnsafeOwnershipFlags { class COWPtr {
public: public:
explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {} typedef std::shared_ptr<T> RefPtr;
ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
ThreadUnsafeOwnershipFlags& operator=(
const ThreadUnsafeOwnershipFlags& other) = delete;
ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;
void SetOwnership(bool flag) { flag_ = flag; }
// Invoke the callback if it is not owned.
template <typename Callback>
void AcquireOwnershipOnce(Callback acquire) {
if (!flag_) {
acquire();
flag_ = true;
}
}
private: private:
bool flag_; RefPtr m_sp;
};
// Copy-On-Write pointer.
// It will hold a T* pointer, and only copy once when `MutableData` is invoked.
//
// The template parameter OwnershipFlags should have:
// * a constructor takes a bool. True if own.
// * SetOwnership(bool flag).
// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not
// owned.
//
// https://en.wikipedia.org/wiki/Copy-on-write
template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
class COWPtr {
public: public:
// Ctor from raw pointer. COWPtr() : m_sp(nullptr) {}
explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {} explicit COWPtr(T* t) : m_sp(t) {}
// Move methods. Steal ownership from origin const T& Data() const { return *m_sp; }
COWPtr(COWPtr&& other)
: payload_(other.payload_), ownership_{std::move(other.ownership_)} {}
COWPtr& operator=(COWPtr&& origin) = default;
// Copy methods. Not own payload
COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
COWPtr& operator=(const COWPtr& other) {
payload_ = other.payload_;
ownership_.SetOwnership(false);
return *this;
}
// Access read only data.
const T& Data() const { return *payload_; }
// Access mutable data. If the data is not owned, the data will be copied
// before.
T* MutableData() { T* MutableData() {
ownership_.AcquireOwnershipOnce( DetachIfNotUnique();
[this] { payload_.reset(new T(*payload_)); }); return m_sp.get();
return payload_.get();
} }
private: void DetachIfNotUnique() {
// Actual data pointer. T* tmp = m_sp.get();
std::shared_ptr<T> payload_; if (!(tmp == nullptr || m_sp.unique())) {
Detach();
}
}
// Ownership flag. void Detach() {
OwnershipFlags ownership_; T* tmp = m_sp.get();
m_sp = RefPtr(new T(*tmp));
}
}; };
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle

@ -30,6 +30,14 @@ TEST(COWPtr, all) {
ASSERT_EQ(ptr2.Data(), 10); ASSERT_EQ(ptr2.Data(), 10);
} }
TEST(COWPtr, change_old) {
COWPtr<int> ptr(new int{0});
COWPtr<int> ptr2 = ptr;
*ptr.MutableData() = 10;
ASSERT_EQ(ptr2.Data(), 0);
ASSERT_EQ(ptr.Data(), 10);
}
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle

@ -257,6 +257,22 @@ std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const { std::unique_ptr<ir::Graph> graph) const {
PDPattern external_pattern, subblock_pattern; PDPattern external_pattern, subblock_pattern;
// Use the following variables to tell whether this model is RNN1.
// This fuse can only works on the RNN1 model.
std::unordered_set<std::string> specified_vars({"data_lod_attention",
"cell_init", "hidden_init",
"data", "week", "minute"});
int count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsVar() && specified_vars.count(node->Name())) {
++count;
}
}
if (count < specified_vars.size()) {
return graph;
}
// Continue to fuse.
FindWhileOp(graph.get()); FindWhileOp(graph.get());
return graph; return graph;
} }

File diff suppressed because it is too large Load Diff

@ -38,31 +38,27 @@ struct OpInfo {
OpAttrChecker* checker_{nullptr}; OpAttrChecker* checker_{nullptr};
InferVarTypeFN infer_var_type_; InferVarTypeFN infer_var_type_;
InferShapeFN infer_shape_; InferShapeFN infer_shape_;
std::string op_type_;
bool HasOpProtoAndChecker() const { bool HasOpProtoAndChecker() const {
return proto_ != nullptr && checker_ != nullptr; return proto_ != nullptr && checker_ != nullptr;
} }
const proto::OpProto& Proto() const { const proto::OpProto& Proto() const {
PADDLE_ENFORCE_NOT_NULL(proto_, "Operator %s Proto has not been registered", PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered");
op_type_);
PADDLE_ENFORCE(proto_->IsInitialized(), PADDLE_ENFORCE(proto_->IsInitialized(),
"Operator %s Proto must be initialized in op info", "Operator Proto must be initialized in op info");
op_type_);
return *proto_; return *proto_;
} }
const OpCreator& Creator() const { const OpCreator& Creator() const {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(creator_,
creator_, "Operator %s Creator has not been registered", op_type_); "Operator Creator has not been registered");
return creator_; return creator_;
} }
const GradOpMakerFN& GradOpMaker() const { const GradOpMakerFN& GradOpMaker() const {
PADDLE_ENFORCE_NOT_NULL(grad_op_maker_, PADDLE_ENFORCE_NOT_NULL(grad_op_maker_,
"Operator %s GradOpMaker has not been registered.", "Operator GradOpMaker has not been registered.");
op_type_);
return grad_op_maker_; return grad_op_maker_;
} }
@ -77,9 +73,8 @@ class OpInfoMap {
return map_.find(op_type) != map_.end(); return map_.find(op_type) != map_.end();
} }
void Insert(const std::string& type, OpInfo info) { void Insert(const std::string& type, const OpInfo& info) {
PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type); PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type);
info.op_type_ = type;
map_.insert({type, info}); map_.insert({type, info});
} }

@ -27,8 +27,11 @@ class SelectedRowsTester : public ::testing::Test {
selected_rows_.reset(new SelectedRows(rows, height)); selected_rows_.reset(new SelectedRows(rows, height));
Tensor* value = selected_rows_->mutable_value(); Tensor* value = selected_rows_->mutable_value();
value->mutable_data<float>( auto* data = value->mutable_data<float>(
make_ddim({static_cast<int64_t>(rows.size()), row_numel}), place_); make_ddim({static_cast<int64_t>(rows.size()), row_numel}), place_);
for (int64_t i = 0; i < value->numel(); ++i) {
data[i] = static_cast<float>(i);
}
} }
protected: protected:
@ -60,6 +63,10 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
ASSERT_EQ(selected_rows_->height(), dst_tensor.height()); ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims()); ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims());
ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims()); ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
auto* dst_data = dst_tensor.value().data<float>();
for (int64_t i = 0; i < dst_tensor.value().numel(); ++i) {
ASSERT_EQ(dst_data[i], static_cast<float>(i));
}
} }
TEST(SelectedRows, SparseTable) { TEST(SelectedRows, SparseTable) {

@ -212,10 +212,11 @@ struct AnalysisConfig : public NativeConfig {
kExclude // Specify the disabled passes in `ir_passes`. kExclude // Specify the disabled passes in `ir_passes`.
}; };
// Determine whether to perform graph optimization.
bool enable_ir_optim = true; bool enable_ir_optim = true;
// Manually determine the IR passes to run.
IrPassMode ir_mode{IrPassMode::kExclude}; IrPassMode ir_mode{IrPassMode::kExclude};
// attention lstm fuse works only on some specific models, disable as default. std::vector<std::string> ir_passes;
std::vector<std::string> ir_passes{"attention_lstm_fuse_pass"};
// NOTE this is just for internal development, please not use it. // NOTE this is just for internal development, please not use it.
bool _use_mkldnn{false}; bool _use_mkldnn{false};

@ -30,7 +30,13 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
polygon_box_transform_op.cu) polygon_box_transform_op.cu)
detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
if(WITH_GPU)
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
else()
detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
endif()
detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu) detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu)
#Export local libraries to parent #Export local libraries to parent
set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)

@ -15,6 +15,7 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
@ -69,7 +70,7 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Anchors")->type()), framework::ToDataType(ctx.Input<Tensor>("Anchors")->type()),
platform::CPUPlace()); ctx.device_context());
} }
}; };
@ -162,7 +163,7 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
const T *im_info_data = im_info.data<T>(); const T *im_info_data = im_info.data<T>();
T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace()); T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
T im_scale = im_info_data[2]; T im_scale = im_info_data[2];
keep->Resize({boxes->dims()[0], 1}); keep->Resize({boxes->dims()[0]});
min_size = std::max(min_size, 1.0f); min_size = std::max(min_size, 1.0f);
int *keep_data = keep->mutable_data<int>(ctx.GetPlace()); int *keep_data = keep->mutable_data<int>(ctx.GetPlace());
@ -463,7 +464,7 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("post_nms_topN", "post_nms_topN"); AddAttr<int>("post_nms_topN", "post_nms_topN");
AddAttr<float>("nms_thresh", "nms_thres"); AddAttr<float>("nms_thresh", "nms_thres");
AddAttr<float>("min_size", "min size"); AddAttr<float>("min_size", "min size");
AddAttr<float>("eta", "eta"); AddAttr<float>("eta", "The parameter for adaptive NMS.");
AddComment(R"DOC( AddComment(R"DOC(
Generate Proposals OP Generate Proposals OP

File diff suppressed because it is too large Load Diff

@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type")); auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
int class_num = ctx.Attr<int>("class_num"); int class_num = ctx.Attr<int>("class_num");
auto label_lod = in_label->lod(); auto& label_lod = in_label->lod();
auto detect_lod = in_detect->lod(); auto& detect_lod = in_detect->lod();
PADDLE_ENFORCE_EQ(label_lod.size(), 1UL, PADDLE_ENFORCE_EQ(label_lod.size(), 1UL,
"Only support one level sequence now."); "Only support one level sequence now.");
PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(), PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(),
@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto labels = framework::EigenTensor<T, 2>::From(input_label); auto labels = framework::EigenTensor<T, 2>::From(input_label);
auto detect = framework::EigenTensor<T, 2>::From(input_detect); auto detect = framework::EigenTensor<T, 2>::From(input_detect);
auto label_lod = input_label.lod(); auto& label_lod = input_label.lod();
auto detect_lod = input_detect.lod(); auto& detect_lod = input_detect.lod();
int batch_size = label_lod[0].size() - 1; int batch_size = label_lod[0].size() - 1;
auto label_index = label_lod[0]; auto& label_index = label_lod[0];
for (int n = 0; n < batch_size; ++n) { for (int n = 0; n < batch_size; ++n) {
std::map<int, std::vector<Box>> boxes; std::map<int, std::vector<Box>> boxes;
@ -274,7 +274,6 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
output_true_pos->set_lod(true_pos_lod); output_true_pos->set_lod(true_pos_lod);
output_false_pos->set_lod(false_pos_lod); output_false_pos->set_lod(false_pos_lod);
return;
} }
void GetInputPos(const framework::Tensor& input_pos_count, void GetInputPos(const framework::Tensor& input_pos_count,
@ -292,7 +291,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto SetData = [](const framework::LoDTensor& pos_tensor, auto SetData = [](const framework::LoDTensor& pos_tensor,
std::map<int, std::vector<std::pair<T, int>>>& pos) { std::map<int, std::vector<std::pair<T, int>>>& pos) {
const T* pos_data = pos_tensor.data<T>(); const T* pos_data = pos_tensor.data<T>();
auto pos_data_lod = pos_tensor.lod()[0]; auto& pos_data_lod = pos_tensor.lod()[0];
for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) { for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) {
for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) { for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) {
T score = pos_data[j * 2]; T score = pos_data[j * 2];
@ -317,20 +316,23 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
std::map<int, std::vector<std::pair<T, int>>>* false_pos) const { std::map<int, std::vector<std::pair<T, int>>>* false_pos) const {
int batch_size = gt_boxes.size(); int batch_size = gt_boxes.size();
for (int n = 0; n < batch_size; ++n) { for (int n = 0; n < batch_size; ++n) {
auto image_gt_boxes = gt_boxes[n]; auto& image_gt_boxes = gt_boxes[n];
for (auto it = image_gt_boxes.begin(); it != image_gt_boxes.end(); ++it) { for (auto& image_gt_box : image_gt_boxes) {
size_t count = 0; size_t count = 0;
auto labeled_bboxes = it->second; auto& labeled_bboxes = image_gt_box.second;
if (evaluate_difficult) { if (evaluate_difficult) {
count = labeled_bboxes.size(); count = labeled_bboxes.size();
} else { } else {
for (size_t i = 0; i < labeled_bboxes.size(); ++i) for (auto& box : labeled_bboxes) {
if (!(labeled_bboxes[i].is_difficult)) ++count; if (!box.is_difficult) {
++count;
}
}
} }
if (count == 0) { if (count == 0) {
continue; continue;
} }
int label = it->first; int label = image_gt_box.first;
if (label_pos_count->find(label) == label_pos_count->end()) { if (label_pos_count->find(label) == label_pos_count->end()) {
(*label_pos_count)[label] = count; (*label_pos_count)[label] = count;
} else { } else {

@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase {
auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>(); auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>();
auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>(); auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
auto in_rows = in.rows(); auto &in_rows = in.rows();
auto out_dim = framework::make_ddim( auto out_dim = framework::make_ddim(
std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1}); std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1});
auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place()); auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place());

@ -127,10 +127,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace()); auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
// TODO(yuyang18): Strange code here. // TODO(yuyang18): Strange code here.
memory::Copy(platform::CPUPlace(), memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()),
new_rows.CUDAMutableData(context.GetPlace()), gpu_place, gpu_place, ids_data, ids_num * sizeof(int64_t), stream);
ids_data, ids_num * sizeof(int64_t), stream);
d_table->set_rows(new_rows); d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value(); auto *d_table_value = d_table->mutable_value();

@ -60,11 +60,9 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
auto out_place = context.GetPlace(); auto out_place = context.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(out_place)); PADDLE_ENFORCE(platform::is_gpu_place(out_place));
memory::Copy( memory::Copy(boost::get<platform::CUDAPlace>(out_place), out_data,
boost::get<platform::CUDAPlace>(out_place), out_data, boost::get<platform::CUDAPlace>(in1_place), in1_data,
boost::get<platform::CUDAPlace>(in1_place), in1_data, in1_value.numel() * sizeof(T), context.stream());
in1_value.numel() * sizeof(T),
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
auto* in2_data = in2_value.data<T>(); auto* in2_data = in2_value.data<T>();
memory::Copy(boost::get<platform::CUDAPlace>(out_place), memory::Copy(boost::get<platform::CUDAPlace>(out_place),
@ -148,7 +146,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
auto in1_height = input1.height(); auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height()); PADDLE_ENFORCE_EQ(in1_height, input2->height());
framework::Vector<int64_t> in1_rows(input1.rows()); auto& in1_rows = input1.rows();
auto& in2_rows = *(input2->mutable_rows()); auto& in2_rows = *(input2->mutable_rows());
auto& in1_value = input1.value(); auto& in1_value = input1.value();

@ -45,12 +45,10 @@ class ReadInferVarType : public framework::VarTypeInference {
framework::VarDesc* reader = block->FindVarRecursive(reader_name); framework::VarDesc* reader = block->FindVarRecursive(reader_name);
auto dtypes = reader->GetDataTypes(); auto dtypes = reader->GetDataTypes();
PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
auto lod_levels = reader->GetLoDLevels();
for (size_t i = 0; i < dtypes.size(); ++i) { for (size_t i = 0; i < dtypes.size(); ++i) {
framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]);
out.SetType(framework::proto::VarType::LOD_TENSOR); out.SetType(framework::proto::VarType::LOD_TENSOR);
out.SetDataType(dtypes[i]); out.SetDataType(dtypes[i]);
out.SetLoDLevel(lod_levels[i]);
} }
} }
}; };

@ -53,15 +53,16 @@ class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker {
SamplingId Operator. SamplingId Operator.
A layer for sampling id from multinomial distribution from the A layer for sampling id from multinomial distribution from the
input. Sampling one id for one sample.)DOC"); input. Sampling one id for one sample.)DOC");
AddAttr<float>("min", "Minimum value of random. [default 0.0].") AddAttr<float>("min", "Minimum value of random. (float, default 0.0).")
.SetDefault(0.0f); .SetDefault(0.0f);
AddAttr<float>("max", "Maximun value of random. [default 1.0].") AddAttr<float>("max", "Maximun value of random. (float, default 1.0).")
.SetDefault(1.0f); .SetDefault(1.0f);
AddAttr<int>("seed", AddAttr<int>(
"Random seed used for the random number engine. " "seed",
"0 means use a seed generated by the system." "Random seed used for the random number engine. "
"Note that if seed is not 0, this operator will always " "0 means use a seed generated by the system."
"generate the same random numbers every time. [default 0].") "Note that if seed is not 0, this operator will always "
"generate the same random numbers every time. (int, default 0).")
.SetDefault(0); .SetDefault(0);
} }
}; };

@ -77,8 +77,10 @@ class ScaleOpVarTypeInference : public framework::VarTypeInference {
auto out_var_name = op_desc.Output("Out").front(); auto out_var_name = op_desc.Output("Out").front();
auto *out_var = block->FindVarRecursive(out_var_name); auto *out_var = block->FindVarRecursive(out_var_name);
out_var->SetType(in_var.GetType()); if (in_var_name != out_var_name) {
out_var->SetDataType(in_var.GetDataType()); out_var->SetType(in_var.GetType());
out_var->SetDataType(in_var.GetDataType());
}
} }
}; };

@ -88,7 +88,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(in_height, out_dims[0]); PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
auto& in_value = grad->value(); auto& in_value = grad->value();
framework::Vector<int64_t> in_rows(grad->rows()); auto& in_rows = grad->rows();
int64_t in_row_numel = in_value.numel() / in_rows.size(); int64_t in_row_numel = in_value.numel() / in_rows.size();
PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);

@ -52,26 +52,16 @@ class ShrinkRNNMemoryOp : public ArrayOp {
size_t height = dst_num_rows; size_t height = dst_num_rows;
// do shrink for the top level LoD // do shrink for the top level LoD
if (x_tensor.lod().size() > 0 && if (x_tensor.lod().size() > 0 &&
x_tensor.lod()[0].size() > static_cast<size_t>(dst_num_rows)) { x_tensor.lod()[0].size() > static_cast<size_t>(dst_num_rows)) {
if (x_tensor.lod().size() > 1) { // MultiLevel LoD auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(x_tensor.lod(), 0,
auto lod_offset = framework::GetSubLoDAndAbsoluteOffset( dst_num_rows, 0);
x_tensor.lod(), 0, dst_num_rows, 0); height = lod_offset.second.second;
height = lod_offset.second.second; auto out_lod = out_tensor.mutable_lod();
auto out_lod = out_tensor.mutable_lod(); framework::AppendLoD(out_lod, lod_offset.first);
framework::AppendLoD(out_lod, lod_offset.first);
} else {
// Shrink LoD
auto lod_item = x_tensor.lod()[0];
lod_item.resize(dst_num_rows + 1);
out_tensor.set_lod({lod_item});
const auto &const_lod_item = lod_item;
height = const_lod_item.back();
}
} }
if (height != 0) { if (dst_num_rows != 0) {
out_tensor.mutable_data(place, x_tensor.type()); out_tensor.mutable_data(place, x_tensor.type());
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
framework::TensorCopy(x_tensor.Slice(0, height), place, *dev_ctx, framework::TensorCopy(x_tensor.Slice(0, height), place, *dev_ctx,
@ -144,11 +134,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
} else { } else {
auto &dout_tensor = dout_var->Get<framework::LoDTensor>(); auto &dout_tensor = dout_var->Get<framework::LoDTensor>();
auto height = dout_tensor.dims()[0]; auto height = dout_tensor.dims()[0];
if (height != 0) { auto slice = dx_tensor.Slice(0, static_cast<int>(height));
auto slice = dx_tensor.Slice(0, static_cast<int>(height)); framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx, &slice);
framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx,
&slice);
}
if (dx_tensor.dims()[0] > height) { if (dx_tensor.dims()[0] > height) {
auto rest_tensor = dx_tensor.Slice( auto rest_tensor = dx_tensor.Slice(
static_cast<int>(height), static_cast<int>(dx_tensor.dims()[0])); static_cast<int>(height), static_cast<int>(dx_tensor.dims()[0]));

@ -32,7 +32,7 @@ class SumKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &context) const override { void Compute(const framework::ExecutionContext &context) const override {
auto in_vars = context.MultiInputVar("X"); auto in_vars = context.MultiInputVar("X");
int N = in_vars.size(); size_t in_num = in_vars.size();
auto out_var = context.OutputVar("Out"); auto out_var = context.OutputVar("Out");
bool in_place = out_var == in_vars[0]; bool in_place = out_var == in_vars[0];
@ -53,7 +53,7 @@ class SumKernel : public framework::OpKernel<T> {
auto &place = auto &place =
*context.template device_context<DeviceContext>().eigen_device(); *context.template device_context<DeviceContext>().eigen_device();
// If in_place, just skip the first tensor // If in_place, just skip the first tensor
for (int i = in_place ? 1 : 0; i < N; i++) { for (size_t i = in_place ? 1 : 0; i < in_num; i++) {
if (in_vars[i]->IsType<framework::LoDTensor>()) { if (in_vars[i]->IsType<framework::LoDTensor>()) {
auto &in_t = in_vars[i]->Get<framework::LoDTensor>(); auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
if (in_t.numel() == 0) { if (in_t.numel() == 0) {
@ -101,13 +101,13 @@ class SumKernel : public framework::OpKernel<T> {
// Runtime InferShape // Runtime InferShape
size_t first_dim = 0; size_t first_dim = 0;
for (int i = 0; i < N; i++) { for (size_t i = 0; i < in_num; i++) {
auto &sel_row = get_selected_row(i); auto &sel_row = get_selected_row(i);
first_dim += sel_row.rows().size(); first_dim += sel_row.rows().size();
} }
std::vector<int64_t> in_dim; std::vector<int64_t> in_dim;
for (int i = 0; i < N; i++) { for (size_t i = 0; i < in_num; i++) {
auto &sel_row = get_selected_row(i); auto &sel_row = get_selected_row(i);
if (sel_row.rows().size() > 0) { if (sel_row.rows().size() > 0) {
in_dim = framework::vectorize(sel_row.value().dims()); in_dim = framework::vectorize(sel_row.value().dims());
@ -116,14 +116,14 @@ class SumKernel : public framework::OpKernel<T> {
} }
if (in_dim.empty()) { if (in_dim.empty()) {
VLOG(3) << "WARNING: all the inputs are empty"; VLOG(3) << "WARNING: all the inputs are empty";
in_dim = framework::vectorize(get_selected_row(N - 1).value().dims()); in_dim =
framework::vectorize(get_selected_row(in_num - 1).value().dims());
} else { } else {
in_dim[0] = static_cast<int64_t>(first_dim); in_dim[0] = static_cast<int64_t>(first_dim);
} }
out_value->Resize(framework::make_ddim(in_dim)); out_value->Resize(framework::make_ddim(in_dim));
out_value->mutable_data<T>(context.GetPlace()); out_value->mutable_data<T>(context.GetPlace());
// if all the input sparse vars are empty, no need to // if all the input sparse vars are empty, no need to
// merge these vars. // merge these vars.
if (first_dim == 0UL) { if (first_dim == 0UL) {
@ -133,7 +133,7 @@ class SumKernel : public framework::OpKernel<T> {
math::SelectedRowsAddTo<DeviceContext, T> functor; math::SelectedRowsAddTo<DeviceContext, T> functor;
int64_t offset = 0; int64_t offset = 0;
for (int i = 0; i < N; i++) { for (size_t i = 0; i < in_num; i++) {
auto &sel_row = get_selected_row(i); auto &sel_row = get_selected_row(i);
if (sel_row.rows().size() == 0) { if (sel_row.rows().size() == 0) {
continue; continue;

@ -201,7 +201,6 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
compute_capability = GetCUDAComputeCapability(place_.device); compute_capability = GetCUDAComputeCapability(place_.device);
multi_process = GetCUDAMultiProcessors(place_.device); multi_process = GetCUDAMultiProcessors(place_.device);
max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device); max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
grid_max_dims_ = GpuMaxGridDim(place_.device);
PADDLE_ENFORCE(cudaStreamCreate(&stream_)); PADDLE_ENFORCE(cudaStreamCreate(&stream_));
eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_.reset(new EigenCudaStreamDevice());
eigen_stream_->Reinitialize(&stream_, place); eigen_stream_->Reinitialize(&stream_, place);
@ -240,10 +239,6 @@ int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
return multi_process * max_threads_per_mp; return multi_process * max_threads_per_mp;
} }
std::tuple<int, int, int> CUDADeviceContext::GetMaxGridDims() const {
return grid_max_dims_;
}
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
return eigen_device_.get(); return eigen_device_.get();
} }

@ -13,7 +13,6 @@ limitations under the License. */
#include <memory> #include <memory>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <string> #include <string>
#include <tuple>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
@ -92,8 +91,6 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return the max physical thread count in the device context */ /*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const; int GetMaxPhysicalThreadCount() const;
std::tuple<int, int, int> GetMaxGridDims() const;
/*! \brief Return eigen device in the device context. */ /*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const; Eigen::GpuDevice* eigen_device() const;
@ -138,8 +135,6 @@ class CUDADeviceContext : public DeviceContext {
cudaStream_t stream_; cudaStream_t stream_;
cublasHandle_t cublas_handle_; cublasHandle_t cublas_handle_;
std::tuple<int, int, int> grid_max_dims_;
int compute_capability; int compute_capability;
int multi_process; int multi_process;
int max_threads_per_mp; int max_threads_per_mp;

@ -48,54 +48,35 @@ __global__ static void ForRangeElemwiseOpGridIsOne(Function func) {
} }
template <typename Function> template <typename Function>
__global__ static void ForRangeElemwiseOp(Function func, size_t limit) { __global__ static void ForRangeElemwiseOp(Function func, int limit) {
size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x); size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
if (idx < limit) { if (idx < limit) {
func(idx); func(idx);
} }
} }
template <typename Function>
__global__ static void ForRangeElemwiseOpGridLarge(Function func, size_t limit,
int grid_dim) {
size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
while (idx < limit) {
func(idx);
idx += grid_dim;
}
}
template <> template <>
struct ForRange<CUDADeviceContext> { struct ForRange<CUDADeviceContext> {
ForRange(const CUDADeviceContext& dev_ctx, size_t limit) ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
: dev_ctx_(dev_ctx), limit_(limit) {} : dev_ctx_(dev_ctx), limit_(static_cast<int>(limit)) {}
template <typename Function> template <typename Function>
inline void operator()(Function func) const { inline void operator()(Function func) const {
constexpr int num_threads = 1024; constexpr int num_threads = 1024;
int block_size = limit_ <= num_threads ? limit_ : num_threads; int block_size = limit_ <= num_threads ? limit_ : num_threads;
size_t grid_size = (limit_ + num_threads - 1) / num_threads; int grid_size = (limit_ + num_threads - 1) / num_threads;
int max_grid_dim = std::get<0>(dev_ctx_.GetMaxGridDims()); if (grid_size == 1) {
ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
if (grid_size < max_grid_dim) { func);
int grid_size_int = static_cast<int>(grid_size);
if (grid_size == 1) {
ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
func);
} else {
ForRangeElemwiseOp<<<grid_size_int, block_size, 0, dev_ctx_.stream()>>>(
func, limit_);
}
} else { } else {
ForRangeElemwiseOpGridLarge<<<max_grid_dim, block_size, 0, ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
dev_ctx_.stream()>>>(func, limit_, func, limit_);
max_grid_dim);
} }
} }
const CUDADeviceContext& dev_ctx_; const CUDADeviceContext& dev_ctx_;
size_t limit_; int limit_;
}; };
#endif #endif

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save