Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix-beam-search-size

test=develop
revert-15296-async_double_buffered_py_reader
guoshengCS 6 years ago
commit b6c3b69af8

@ -325,6 +325,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None
paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0))
paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.box_clip ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.multiclass_nms ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None))
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))

@ -65,6 +65,7 @@ pass_library(conv_elementwise_add2_act_fuse_pass inference)
pass_library(conv_elementwise_add_fuse_pass inference)
pass_library(conv_affine_channel_fuse_pass inference)
pass_library(transpose_flatten_concat_fuse_pass inference)
pass_library(identity_scale_op_clean_pass base)
# There may be many transpose-flatten structures in a model, and the output of
# these structures will be used as inputs to the concat Op. This pattern will

@ -0,0 +1,80 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/identity_scale_op_clean_pass.h"
#include <string>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init("identity_scale_op_clean", graph.get());
// pre_op -> scale_in -> scale_op -> scale_out
// ->
// pre_op -> scale_out
GraphPatternDetector detector;
auto pre_op = detector.mutable_pattern()->NewNode("pre_op")->assert_is_op();
auto scale_in = detector.mutable_pattern()
->NewNode("scale_in")
->assert_is_op_input("scale")
->AsIntermediate();
auto scale_op = detector.mutable_pattern()
->NewNode("scale_fuse")
->assert_is_op("scale")
->assert_op_attr<float>("scale", 1.)
->assert_op_attr<float>("bias", 0.);
auto scale_out = detector.mutable_pattern()
->NewNode("scale_out")
->assert_is_op_output("scale");
pre_op->LinksTo({scale_in});
scale_op->LinksFrom({scale_in}).LinksTo({scale_out});
GraphPatternDetector::handle_t handler = [&](
const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
Node* scale_op_var = subgraph.at(scale_op);
Node* scale_in_var = subgraph.at(scale_in);
Node* scale_out_var = subgraph.at(scale_out);
Node* pre_op_var = subgraph.at(pre_op);
// Link pre_op directly to scale_out
const std::string scale_in_name = scale_in_var->Name();
const std::string scale_out_name = scale_out_var->Name();
// Remove links in graph
GraphSafeRemoveNodes(graph, {scale_in_var, scale_op_var});
// Modify proto message
auto* pre_op_desc = pre_op_var->Op();
for (auto& parameter : *pre_op_desc->Proto()->mutable_outputs()) {
auto* arguments = parameter.mutable_arguments();
auto it = std::find(arguments->begin(), arguments->end(), scale_in_name);
PADDLE_ENFORCE(it != arguments->end());
*it = scale_out_name;
}
IR_NODE_LINK_TO(pre_op_var, scale_out_var);
};
detector(graph.get(), handler);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(identity_scale_op_clean_pass,
paddle::framework::ir::IdentityScaleOpCleanPass);
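The pass registered above deletes scale ops that are numerical identities and rewires the preceding op straight to the scale op's output. A minimal standalone sketch, assuming a hypothetical ScaleOp helper (not part of this PR), of the property the pattern relies on: out = scale * x + bias reduces to x when scale == 1 and bias == 0, so the op can be removed without changing results.

#include <cassert>

// Hypothetical stand-in for the scale operator's math: out = scale * x + bias.
static float ScaleOp(float x, float scale, float bias) { return scale * x + bias; }

int main() {
  for (float x : {-2.0f, 0.0f, 3.5f}) {
    // scale == 1, bias == 0 is the identity, so the pass may drop the op.
    assert(ScaleOp(x, 1.0f, 0.0f) == x);
  }
  return 0;
}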

@ -0,0 +1,33 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class IdentityScaleOpCleanPass : public FusePassBase {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
private:
virtual ~IdentityScaleOpCleanPass() = default;
};
} // namespace ir
} // namespace framework
} // namespace paddle

@ -22,11 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/string/printf.h"
DEFINE_bool(benchmark, false,
"Doing memory benchmark. It will make deleting scope synchronized, "
"and add some memory usage logs."
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode.");
DECLARE_bool(benchmark);
DEFINE_bool(
eager_delete_scope, true,

@ -83,7 +83,6 @@ void IRPassManager::CreatePasses(Argument *argument,
new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
}
// graph_ = pass->Apply(std::move(graph_));
pre_pass = pass_name;
passes_.emplace_back(std::move(pass));
@ -97,8 +96,9 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
PADDLE_ENFORCE(graph.get());
// Apply all the passes
for (const auto &pass : passes_) {
if (pass->Type() == "graph_viz_pass") continue;
PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type());
if (pass->Type() != "graph_viz_pass") {
PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type());
}
graph = pass->Apply(std::move(graph));
}
return std::move(graph);

@ -318,4 +318,9 @@ NativeConfig AnalysisConfig::ToNativeConfig() const {
return config;
}
void AnalysisConfig::SwitchIrDebug(int x) {
ir_debug_ = x;
Update();
}
} // namespace paddle
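A usage sketch, assuming the public paddle_inference_api.h header and a hypothetical ConfigureForDebug helper: it mirrors how the updated tests below switch from pass_builder()->TurnOnDebug() to the new SwitchIrDebug(), which makes the analysis phase dump a DOT file of the graph after each IR pass.

#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Sketch only: configure an inference config with IR debugging enabled.
void ConfigureForDebug(paddle::AnalysisConfig* cfg) {
  cfg->DisableGpu();
  cfg->SwitchIrDebug();  // replaces cfg->pass_builder()->TurnOnDebug()
}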

@ -58,7 +58,8 @@ namespace {
bool IsPersistable(const framework::VarDesc *var) {
if (var->Persistable() &&
var->GetType() != framework::proto::VarType::FEED_MINIBATCH &&
var->GetType() != framework::proto::VarType::FETCH_LIST) {
var->GetType() != framework::proto::VarType::FETCH_LIST &&
var->GetType() != framework::proto::VarType::RAW) {
return true;
}
return false;

@ -196,7 +196,7 @@ TEST(AnalysisPredictor, memory_optim) {
AnalysisConfig config(FLAGS_dirname);
config.DisableGpu();
config.EnableMemoryOptim(true);
config.pass_builder()->TurnOnDebug();
config.SwitchIrDebug();
auto native_predictor =
CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());

@ -140,9 +140,12 @@ struct AnalysisConfig {
*/
bool tensorrt_engine_enabled() const { return use_tensorrt_; }
/** Control whther to debug IR graph analysis phase.
/** \brief Control whether to debug IR graph analysis phase.
*
* This will generate DOT files for visualizing the computation graph after
* each analysis pass is applied.
*/
void SwitchIrDebug(int x = true) { ir_debug_ = x; }
void SwitchIrDebug(int x = true);
/** Turn on MKLDNN.
*/

@ -117,6 +117,7 @@ class CpuPassStrategy : public PassStrategy {
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
"is_test_pass", //
"identity_scale_op_clean_pass", //
});
use_gpu_ = false;
}
@ -155,6 +156,7 @@ class GpuPassStrategy : public PassStrategy {
GpuPassStrategy() : PassStrategy({}) {
passes_.assign({
"infer_clean_graph_pass", //
"identity_scale_op_clean_pass", //
"conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_bn_fuse_pass", //

@ -142,7 +142,7 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) {
cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
cfg->DisableGpu();
cfg->SwitchSpecifyInputNames();
cfg->pass_builder()->TurnOnDebug();
cfg->SwitchIrDebug();
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
if (use_mkldnn) {
cfg->EnableMKLDNN();

@ -69,7 +69,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
TEST(Analyzer_Text_Classification, profile) {
AnalysisConfig cfg;
SetConfig(&cfg);
cfg.pass_builder()->TurnOnDebug();
cfg.SwitchIrDebug();
std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;

@ -35,6 +35,7 @@ DEFINE_bool(init_allocated_mem, false,
"To find this error in time, we use init_allocated_mem to indicate "
"that initializing the allocated memory with a small value "
"during unit testing.");
DECLARE_bool(benchmark);
DECLARE_double(fraction_of_gpu_memory_to_use);
namespace paddle {
@ -59,11 +60,6 @@ size_t memory_usage(const platform::Place &p);
using BuddyAllocator = detail::BuddyAllocator;
std::unordered_map</*device id*/ int,
std::pair</*current memory usage*/ uint64_t,
/*peak memory usage*/ uint64_t>>
gpu_mem_info;
BuddyAllocator *GetCPUBuddyAllocator() {
// We tried thread_local for the inference::RNN1 model, but it does not help
// much for the multi-thread test.
@ -144,6 +140,8 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
devices = platform::GetSelectedDevices();
int gpu_num = devices.size();
allocation::GPUMemMonitor.Initialize(devices.size());
a_arr = new BuddyAllocator *[gpu_num];
for (size_t i = 0; i < devices.size(); ++i) {
int dev_id = devices[i];
@ -204,12 +202,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
<< string::HumanReadableSize(Used<platform::CUDAPlace>(place));
platform::SetDeviceId(cur_dev);
} else {
gpu_mem_info[place.device].first += size;
if (gpu_mem_info[place.device].first > gpu_mem_info[place.device].second) {
gpu_mem_info[place.device].second = gpu_mem_info[place.device].first;
VLOG(3) << "device: " << place.device << " peak memory usage : "
<< (gpu_mem_info[place.device].second >> 20) << " MiB";
}
if (FLAGS_benchmark) allocation::GPUMemMonitor.Add(place.device, size);
if (FLAGS_init_allocated_mem) {
cudaMemset(ptr, 0xEF, size);
}
@ -225,7 +218,7 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
size_t size) {
#ifdef PADDLE_WITH_CUDA
GetGPUBuddyAllocator(place.device)->Free(p);
gpu_mem_info[place.device].first -= size;
if (FLAGS_benchmark) allocation::GPUMemMonitor.Minus(place.device, size);
#else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
@ -335,6 +328,8 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
namespace allocation {
LegacyMemMonitor GPUMemMonitor;
Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_);
return new Allocation(ptr, size, place_);
@ -346,6 +341,63 @@ void LegacyAllocator::Free(Allocation *allocation) {
allocation->place());
delete allocation;
}
bool MemInfo::Add(const size_t &size) {
std::lock_guard<std::mutex> lock(mutex_);
usage_ += size;
bool peak_point = usage_ > peak_usage_;
if (peak_point) peak_usage_ = usage_;
return peak_point;
}
void MemInfo::Minus(const size_t &size) {
std::lock_guard<std::mutex> lock(mutex_);
usage_ -= size;
}
uint64_t MemInfo::GetPeakUsage() { return peak_usage_; }
LegacyMemMonitor::~LegacyMemMonitor() {
for (auto &item : gpu_mem_info_) delete item.second;
}
void LegacyMemMonitor::Initialize(const int &device_num) {
for (auto i = 0; i < device_num; ++i) {
gpu_mem_info_[i] = new MemInfo();
}
}
void LegacyMemMonitor::Add(const int &device, const size_t &size) {
if (gpu_mem_info_[device]->Add(size)) {
VLOG(3) << "#LegacyMemMonitor# device: " << device
<< " peak memory usage : "
<< (gpu_mem_info_[device]->GetPeakUsage() >> 20) << " MiB";
}
}
void LegacyMemMonitor::Minus(const int &device, const size_t &size) {
gpu_mem_info_[device]->Minus(size);
}
uint64_t LegacyMemMonitor::GetMemUsage(const int &device) {
return gpu_mem_info_.find(device) == gpu_mem_info_.end()
? 0
: gpu_mem_info_[device]->GetPeakUsage();
}
void LegacyMemMonitor::PrintMemUsage() {
std::vector<int> devices;
for (const auto &item : gpu_mem_info_) {
devices.emplace_back(item.first);
}
std::sort(devices.begin(), devices.end());
for (const auto &device : devices) {
std::cout << "Device : " << device << " Peak Memory Usage : "
<< (gpu_mem_info_[device]->GetPeakUsage() >> 20) << " MiB"
<< std::endl;
}
}
} // namespace allocation
} // namespace memory
} // namespace paddle
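A minimal standalone sketch, a simplified re-implementation rather than the PR code, of the peak-tracking idea behind MemInfo: Add() reports whether the new usage establishes a peak so LegacyMemMonitor can log only when a new peak is reached, and Minus() lowers the current usage without touching the peak.

#include <cstdint>
#include <iostream>
#include <mutex>

// Simplified stand-in for MemInfo: tracks current and peak usage under a mutex.
class PeakTracker {
 public:
  bool Add(uint64_t size) {
    std::lock_guard<std::mutex> lock(mutex_);
    usage_ += size;
    bool new_peak = usage_ > peak_;
    if (new_peak) peak_ = usage_;
    return new_peak;  // caller logs only when this is true
  }
  void Minus(uint64_t size) {
    std::lock_guard<std::mutex> lock(mutex_);
    usage_ -= size;
  }
  uint64_t Peak() {
    std::lock_guard<std::mutex> lock(mutex_);
    return peak_;
  }

 private:
  uint64_t usage_ = 0;
  uint64_t peak_ = 0;
  std::mutex mutex_;
};

int main() {
  PeakTracker tracker;
  if (tracker.Add(64ull << 20)) {
    std::cout << "new peak: " << (tracker.Peak() >> 20) << " MiB\n";
  }
  tracker.Minus(32ull << 20);
  if (!tracker.Add(16ull << 20)) {
    std::cout << "no new peak\n";
  }
  return 0;
}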

@ -13,12 +13,59 @@
// limitations under the License.
#pragma once
#include <algorithm>
#include <mutex> // NOLINT
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
class MemInfo {
public:
MemInfo() : usage_(0), peak_usage_(0) {}
MemInfo(const MemInfo &) = delete;
MemInfo &operator=(const MemInfo &) = delete;
// Returns a flag indicating whether the current operation creates a new peak point.
bool Add(const size_t &);
void Minus(const size_t &);
uint64_t GetPeakUsage();
private:
/* current memory usage*/
uint64_t usage_;
uint64_t peak_usage_;
std::mutex mutex_;
};
class LegacyMemMonitor {
public:
// used to store the GPU memory usage of each device
using MemUsage = std::unordered_map</*device id*/ int,
/*mem usage info node*/ MemInfo *>;
MemUsage GetMemUsageInfo() { return gpu_mem_info_; }
~LegacyMemMonitor();
void Initialize(const int &);
void Add(const int &, const size_t &);
void Minus(const int &, const size_t &);
uint64_t GetMemUsage(const int &);
void PrintMemUsage();
protected:
MemUsage gpu_mem_info_;
};
extern LegacyMemMonitor GPUMemMonitor;
class LegacyAllocatorPrivate;
class LegacyAllocator : public Allocator {
public:

@ -589,8 +589,10 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
op->SetInput("SavedVariance", Output("SavedVariance"));
// used when setting use_global_stats True during training
op->SetInput("Mean", Output("MeanOut"));
op->SetInput("Variance", Output("VarianceOut"));
if (boost::get<bool>(GetAttr("use_global_stats"))) {
op->SetInput("Mean", Output("MeanOut"));
op->SetInput("Variance", Output("VarianceOut"));
}
op->SetAttrMap(Attrs());

@ -31,6 +31,7 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
polygon_box_transform_op.cu)
detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
if(WITH_GPU)

@ -99,5 +99,29 @@ void BboxOverlaps(const framework::Tensor& r_boxes,
}
}
template <class T>
void ClipTiledBoxes(const platform::DeviceContext& ctx,
const framework::Tensor& im_info,
const framework::Tensor& input_boxes,
framework::Tensor* out) {
T* out_data = out->mutable_data<T>(ctx.GetPlace());
const T* im_info_data = im_info.data<T>();
const T* input_boxes_data = input_boxes.data<T>();
T zero(0);
T im_w = round(im_info_data[1] / im_info_data[2]);
T im_h = round(im_info_data[0] / im_info_data[2]);
for (int64_t i = 0; i < input_boxes.numel(); ++i) {
if (i % 4 == 0) {
out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero);
} else if (i % 4 == 1) {
out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero);
} else if (i % 4 == 2) {
out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero);
} else {
out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero);
}
}
}
} // namespace operators
} // namespace paddle
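A self-contained sketch, using made-up numbers and not the PR code, of the clipping rule implemented by ClipTiledBoxes above: x coordinates are clamped to [0, im_w - 1] and y coordinates to [0, im_h - 1], where im_w and im_h are recovered from ImInfo in the order (height, width, im_scale).

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  const float im_info[3] = {608.f, 812.f, 2.f};        // height, width, im_scale
  const float im_w = std::round(im_info[1] / im_info[2]);  // 406
  const float im_h = std::round(im_info[0] / im_info[2]);  // 304
  float box[4] = {-5.f, 10.f, 500.f, 400.f};           // xmin, ymin, xmax, ymax
  for (int i = 0; i < 4; ++i) {
    const float limit = (i % 2 == 0) ? im_w : im_h;    // even indices are x, odd are y
    box[i] = std::max(std::min(box[i], limit - 1.f), 0.f);
  }
  std::printf("clipped: %.1f %.1f %.1f %.1f\n", box[0], box[1], box[2], box[3]);
  return 0;
}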

@ -0,0 +1,86 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/box_clip_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class BoxClipOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of BoxClipOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("ImInfo"),
"Input(ImInfo) of BoxClipOp should not be null.");
auto input_box_dims = ctx->GetInputDim("Input");
auto im_info_dims = ctx->GetInputDim("ImInfo");
if (ctx->IsRuntime()) {
auto input_box_size = input_box_dims.size();
PADDLE_ENFORCE_EQ(input_box_dims[input_box_size - 1], 4,
"The last dimension of Input must be 4");
PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
"The rank of Input(Input) in BoxClipOp must be 2");
PADDLE_ENFORCE_EQ(im_info_dims[1], 3,
"The last dimension of ImInfo must be 3");
}
ctx->ShareDim("Input", /*->*/ "Output");
ctx->ShareLoD("Input", /*->*/ "Output");
}
};
class BoxClipOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Input",
"(LoDTensor) "
"Input is a LoDTensor with shape [..., 4] holds 4 points"
"in last dimension in format [xmin, ymin, xmax, ymax]");
AddInput("ImInfo",
"(Tensor) Information for image reshape is in shape (N, 3), "
"in format (height, width, im_scale)");
AddOutput("Output",
"(LoDTensor) "
"Output is a LoDTensor with the same shape as Input"
"and it is the result after clip");
AddComment(R"DOC(
This operator clips input boxes to the boundaries of the original input images.
For each input box, the formula is given as follows:
$$xmin = \max(\min(xmin, im_w - 1), 0)$$
$$ymin = \max(\min(ymin, im_h - 1), 0)$$
$$xmax = \max(\min(xmax, im_w - 1), 0)$$
$$ymax = \max(\min(ymax, im_h - 1), 0)$$
where im_w and im_h are computed from ImInfo as follows:
$$im_w = \mathrm{round}(width / im_scale)$$
$$im_h = \mathrm{round}(height / im_scale)$$
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(box_clip, ops::BoxClipOp, ops::BoxClipOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
box_clip, ops::BoxClipKernel<paddle::platform::CPUDeviceContext, float>,
ops::BoxClipKernel<paddle::platform::CPUDeviceContext, double>);

@ -0,0 +1,74 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detection/box_clip_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
static constexpr int ImInfoSize = 3;
template <typename T, int BlockSize>
static __global__ void GPUBoxClip(const T *input, const size_t *lod,
const size_t width, const T *im_info,
T *output) {
T im_w = round(im_info[blockIdx.x * ImInfoSize + 1] /
im_info[blockIdx.x * ImInfoSize + 2]);
T im_h = round(im_info[blockIdx.x * ImInfoSize] /
im_info[blockIdx.x * ImInfoSize + 2]);
for (int i = threadIdx.x; i < (lod[blockIdx.x + 1] - lod[blockIdx.x]) * width;
i += BlockSize) {
int idx = lod[blockIdx.x] * width + i;
T im_size = (idx % 2 == 0) ? im_w : im_h;
output[idx] = max(min(input[idx], im_size - 1), T(0.));
}
}
template <typename DeviceContext, typename T>
class GPUBoxClipKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
"This kernel only runs on GPU device.");
auto *input = context.Input<LoDTensor>("Input");
auto *im_info = context.Input<Tensor>("ImInfo");
auto *output = context.Output<LoDTensor>("Output");
const int64_t num = input->dims()[0];
const int64_t bbox_width = input->numel() / num;
auto lod = input->lod();
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
auto &dev_ctx = context.template device_context<DeviceContext>();
auto stream = dev_ctx.stream();
const size_t batch_size = lod.back().size() - 1;
T *output_data = output->mutable_data<T>(dev_ctx.GetPlace());
GPUBoxClip<T, 512><<<batch_size, 512, 0, stream>>>(
input->data<T>(), abs_offset_lod[0].CUDAMutableData(dev_ctx.GetPlace()),
bbox_width, im_info->data<T>(), output_data);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
box_clip, ops::GPUBoxClipKernel<paddle::platform::CUDADeviceContext, float>,
ops::GPUBoxClipKernel<paddle::platform::CUDADeviceContext, double>);
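A CPU-side sketch, for illustration only and not the PR code, of the work distribution in GPUBoxClip above: the kernel launches one block per image (one LoD segment) with 512 threads, and each thread strides over that image's width * num_boxes coordinates in steps of the block size.

#include <cstdio>
#include <vector>

int main() {
  const std::vector<size_t> lod = {0, 2, 5};  // 2 boxes for image 0, 3 for image 1
  const size_t width = 4;                     // 4 coordinates per box
  const int block_size = 8;                   // stands in for the 512 CUDA threads
  for (size_t b = 0; b + 1 < lod.size(); ++b) {      // one "block" per image
    for (int t = 0; t < block_size; ++t) {           // one pass per "thread"
      for (size_t i = t; i < (lod[b + 1] - lod[b]) * width; i += block_size) {
        size_t idx = lod[b] * width + i;             // flat coordinate index
        std::printf("block %zu thread %d -> idx %zu\n", b, t, idx);
      }
    }
  }
  return 0;
}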

@ -0,0 +1,50 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detection/bbox_util.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename DeviceContext, typename T>
class BoxClipKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input_box = context.Input<LoDTensor>("Input");
auto* im_info = context.Input<LoDTensor>("ImInfo");
auto* output_box = context.Output<LoDTensor>("Output");
auto& dev_ctx =
context.template device_context<platform::CPUDeviceContext>();
output_box->mutable_data<T>(context.GetPlace());
if (input_box->lod().size()) {
PADDLE_ENFORCE_EQ(input_box->lod().size(), 1UL,
"Only support 1 level of LoD.");
}
auto box_lod = input_box->lod().back();
int64_t n = static_cast<int64_t>(box_lod.size() - 1);
for (int i = 0; i < n; ++i) {
Tensor im_info_slice = im_info->Slice(i, i + 1);
Tensor box_slice = input_box->Slice(box_lod[i], box_lod[i + 1]);
Tensor output_slice = output_box->Slice(box_lod[i], box_lod[i + 1]);
ClipTiledBoxes<T>(dev_ctx, im_info_slice, box_slice, &output_slice);
}
}
};
} // namespace operators
} // namespace paddle

@ -93,6 +93,7 @@ std::vector<int> TestSizes() {
template <typename KernelTuples, typename... Args>
struct BenchFunc {
// return this function's average time
// TODO(TJ): clear cache every time
double operator()(const typename KernelTuples::func_type tgt, Args... args) {
for (int i = 0; i < FLAGS_burning; ++i) {
tgt(args...);
@ -172,6 +173,9 @@ void BenchXYZNKernel() {
RandomVec<T>(d, y_data);
BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data<T>(),
y.data<T>(), z_data, d);
// test inplace
BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data<T>(), z_data,
z_data, d);
}
}

@ -155,7 +155,7 @@ class NCHW16CMulNCCreator : public JitCodeCreator<int> {
class name##Creator : public JitCodeCreator<int> { \
public: \
bool UseMe(const int& attr) const override { \
return platform::MayIUse(platform::avx); \
return platform::MayIUse(platform::avx) && attr <= 1024; \
} \
size_t CodeSize(const int& d) const override { \
return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \

@ -61,6 +61,7 @@ class VXXJitCode : public JitCode {
base += "_Vec";
}
base += (with_relu_ ? "_Relu" : "");
base += "_D" + std::to_string(num_);
return base.c_str();
}
void genCode() override;

@ -118,26 +118,33 @@ typename KernelTuples::func_type Get(
return GetRefer<KT, KernelTuples>();
}
template <KernelType KT, typename KernelTuples>
class KernelFuncsCache {
template <KernelType KT, typename KernelTuples, typename PlaceType>
class KernelFuncs {
public:
KernelFuncsCache() = default;
static KernelFuncsCache& Instance() {
static thread_local KernelFuncsCache<KT, KernelTuples> g_func_cache;
KernelFuncs() = default;
static KernelFuncs& Cache() {
static thread_local KernelFuncs<KT, KernelTuples, PlaceType> g_func_cache;
return g_func_cache;
}
bool Has(int key) const { return funcs_.find(key) != funcs_.end(); }
typename KernelTuples::func_type At(int key) { return funcs_.at(key); }
void Insert(int key, typename KernelTuples::func_type func) {
funcs_.emplace(key, func);
}
typename KernelTuples::func_type At(int key) {
if (Has(key)) {
return funcs_.at(key);
}
auto func = Get<KT, KernelTuples, PlaceType>(key);
Insert(key, func);
return func;
}
private:
std::unordered_map<int, typename KernelTuples::func_type> funcs_;
DISABLE_COPY_AND_ASSIGN(KernelFuncsCache);
DISABLE_COPY_AND_ASSIGN(KernelFuncs);
};
const char* to_string(KernelType kt);
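A standalone sketch, simplified and not the PR code, of the pattern the refactored KernelFuncs uses: a thread_local singleton cache whose At(key) returns the memoized function for a key or performs the lookup once and stores it, so repeated kernel queries on the same thread avoid re-dispatch. FuncCache and the lookup callback are hypothetical names.

#include <functional>
#include <iostream>
#include <unordered_map>

template <typename Func>
class FuncCache {
 public:
  static FuncCache& Cache() {
    static thread_local FuncCache g_cache;  // one cache per thread, no locking needed
    return g_cache;
  }
  Func At(int key, const std::function<Func(int)>& lookup) {
    auto it = funcs_.find(key);
    if (it != funcs_.end()) return it->second;  // cache hit
    Func f = lookup(key);                       // slow path: resolve once
    funcs_.emplace(key, f);
    return f;
  }

 private:
  std::unordered_map<int, Func> funcs_;
};

int main() {
  using AddFunc = int (*)(int, int);
  auto lookup = [](int) -> AddFunc { return [](int a, int b) { return a + b; }; };
  AddFunc f = FuncCache<AddFunc>::Cache().At(8, lookup);
  std::cout << f(3, 4) << "\n";  // prints 7
  return 0;
}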

Some files were not shown because too many files have changed in this diff.