remove conflict

emailweixu-patch-1
chengduoZH 7 years ago
commit 5ccab2dc65

@ -2,7 +2,7 @@
|---|---|
| backyes | Yan-Fei Wang |
| beckett1124 | Bin Qi |
| Canpio | Jia-Yi Feng |
| JiayiFeng | Jia-Yi Feng |
| chengxiaohua1105 | Xiao-Hua Cheng |
| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
| cxysteven | Xing-Yi Cheng |

@ -82,7 +82,7 @@ language = 'zh_CN'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build', '**/*_en*', '*_en*']
exclude_patterns = ['_build', '**/*_en*', '*_en*', 'api/*']
# The reST default role (used for this markup: `text`) to use for all
# documents.

@ -82,7 +82,7 @@ language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build', '**/*_cn*', '*_cn*']
exclude_patterns = ['_build', '**/*_cn*', '*_cn*', 'api/*']
# The reST default role (used for this markup: `text`) to use for all
# documents.

@ -11,7 +11,6 @@ if(MOBILE_INFERENCE)
else()
add_subdirectory(pserver)
add_subdirectory(trainer)
add_subdirectory(string)
add_subdirectory(scripts)
if(WITH_C_API)

@ -4,3 +4,4 @@ add_subdirectory(framework)
add_subdirectory(operators)
add_subdirectory(pybind)
add_subdirectory(inference)
add_subdirectory(string)

@ -314,5 +314,15 @@ DDim stride(const DDim& ddim) {
}
return framework::make_ddim(strides);
}
DDim stride_numel(const framework::DDim& ddim) {
std::vector<int64_t> strides(ddim.size());
strides[ddim.size() - 1] = ddim[ddim.size() - 1];
for (int i = ddim.size() - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * ddim[i];
}
return framework::make_ddim(strides);
}
} // namespace framework
} // namespace paddle

@ -125,6 +125,8 @@ DDim flatten_to_2d(const DDim& src, int num_col_dims);
DDim flatten_to_1d(const DDim& src);
DDim stride(const DDim& ddim);
DDim stride_numel(const DDim& ddim);
} // namespace framework
} // namespace paddle

@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/string/piece.h"
#include "paddle/fluid/string/piece.h"
namespace paddle {
namespace framework {

@ -37,9 +37,8 @@ class Vector {
// Fill vector with value. The vector size is `count`.
explicit Vector(size_t count, const T& value = T()) {
if (count == 0) {
InitEmpty();
} else {
InitEmpty();
if (count != 0) {
resize(count);
T* ptr = begin();
for (size_t i = 0; i < count; ++i) {
@ -122,6 +121,10 @@ class Vector {
const T* begin() const { return &this->operator[](0); }
const T* end() const { return &this->operator[](size()); }
const T* cbegin() const { return begin(); }
const T* cend() const { return end(); }
const T& back() const {
auto it = end();
--it;
@ -244,7 +247,9 @@ class Vector {
bool operator==(const Vector<T>& other) const {
if (size() != other.size()) return false;
for (auto it1 = begin(), it2 = other.begin(); it1 < end(); ++it1, ++it2) {
auto it1 = cbegin();
auto it2 = other.cbegin();
for (; it1 < cend(); ++it1, ++it2) {
if (*it1 != *it2) {
return false;
}

@ -26,10 +26,10 @@ TEST(mixed_vector, CPU_VECTOR) {
for (int i = 0; i < 10; ++i) {
tmp.push_back(i);
}
ASSERT_EQ(tmp.size(), 10);
ASSERT_EQ(tmp.size(), 10UL);
vec<int> tmp2;
tmp2 = tmp;
ASSERT_EQ(tmp2.size(), 10);
ASSERT_EQ(tmp2.size(), 10UL);
for (int i = 0; i < 10; ++i) {
ASSERT_EQ(tmp2[i], i);
ASSERT_EQ(tmp2[i], tmp[i]);
@ -58,7 +58,7 @@ TEST(mixed_vector, GPU_VECTOR) {
for (int i = 0; i < 10; ++i) {
tmp.push_back(i);
}
ASSERT_EQ(tmp.size(), 10);
ASSERT_EQ(tmp.size(), 10UL);
paddle::platform::CUDAPlace gpu(0);
multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu));
@ -79,7 +79,7 @@ TEST(mixed_vector, MultiGPU) {
for (int i = 0; i < 10; ++i) {
tmp.push_back(i);
}
ASSERT_EQ(tmp.size(), 10);
ASSERT_EQ(tmp.size(), 10UL);
paddle::platform::CUDAPlace gpu0(0);
paddle::platform::SetDeviceId(0);
multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0));
@ -91,3 +91,10 @@ TEST(mixed_vector, MultiGPU) {
ASSERT_EQ(tmp[i], i * 100);
}
}
TEST(mixed_vector, InitWithCount) {
paddle::framework::Vector<int> vec(10, 10);
for (int i = 0; i < 10; ++i) {
ASSERT_EQ(vec[i], 10);
}
}

@ -18,7 +18,7 @@ limitations under the License. */
#include <mutex> // for call_once
#include "glog/logging.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/string/printf.h"
#include "paddle/fluid/string/printf.h"
DEFINE_bool(benchmark, false,
"Doing memory benchmark. It will make deleting scope synchronized, "

@ -28,17 +28,18 @@ class ConcatKernel : public framework::OpKernel<T> {
auto ins = ctx.MultiInput<framework::Tensor>("X");
auto* out = ctx.Output<framework::Tensor>("Out");
int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
const size_t n = ins.size();
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
auto out_stride = framework::stride_numel(out->dims());
size_t output_offset = 0;
out->mutable_data<T>(ctx.GetPlace());
auto out_stride = framework::stride(out->dims());
for (size_t i = 0; i < n; i++) {
auto& in = ins[i];
auto axis_dim = in->dims()[axis];
auto in_stride = framework::stride(in->dims());
StridedMemcpy<T>(ctx.device_context(), in->data<T>(), in_stride,
in->dims(), out_stride, out->data<T>() + output_offset);
output_offset += axis_dim * in_stride[axis];
for (auto* in : ins) {
auto in_stride = framework::stride_numel(in->dims());
StridedNumelCopyWithAxis<T>(ctx.device_context(), axis,
out->data<T>() + output_offset, out_stride,
in->data<T>(), in_stride);
output_offset += in_stride[axis];
}
}
};
@ -50,17 +51,16 @@ class ConcatGradKernel : public framework::OpKernel<T> {
auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
const size_t n = outs.size();
size_t input_offset = 0;
auto in_stride = framework::stride(in->dims());
for (size_t i = 0; i < n; i++) {
auto& out = outs[i];
auto in_stride = framework::stride_numel(in->dims());
for (auto& out : outs) {
out->mutable_data<T>(ctx.GetPlace());
size_t axis_dim = out->dims()[axis];
auto out_stride = framework::stride(out->dims());
StridedMemcpy<T>(ctx.device_context(), in->data<T>() + input_offset,
in_stride, out->dims(), out_stride, out->data<T>());
input_offset += axis_dim * in_stride[axis];
auto out_stride = framework::stride_numel(out->dims());
StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
out_stride, in->data<T>() + input_offset,
in_stride);
input_offset += out_stride[axis];
}
}
};

@ -27,7 +27,7 @@ limitations under the License. */
#include "paddle/fluid/operators/detail/grpc_server.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/detail/simple_block_queue.h"
#include "paddle/string/printf.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
namespace operators {
@ -101,11 +101,15 @@ class ListenAndServOp : public framework::OperatorBase {
// TODO(typhoonzero): change this to a while_op for every cluster-batch.
bool exit_flag = false;
// Record received sparse variables, so that
// we could reset those after execute optimize program
std::vector<framework::Variable *> sparse_vars;
while (!exit_flag) {
// Get from multiple trainers, we don't care about the order in which
// the gradients arrives, just add suffix 0~n and merge the gradient.
rpc_service_->SetCond(0);
size_t recv_var_cnt = 0;
size_t update_param_cnt = 0;
int batch_barrier = 0;
while (batch_barrier != fan_in) {
const detail::MessageWithName &v = rpc_service_->Get();
@ -126,13 +130,14 @@ class ListenAndServOp : public framework::OperatorBase {
std::string param_var_name;
if (it != grad_list.end()) {
param_var_name = param_list[it - grad_list.begin()];
update_param_cnt++;
VLOG(3) << "received grad: " << grad_var_name
<< " updating param: " << param_var_name;
} else {
LOG(ERROR) << "grad has no paired param:" << grad_var_name;
VLOG(3) << "received variable: " << grad_var_name
<< " no need to update param";
}
VLOG(3) << "received grad: " << grad_var_name
<< " updating param: " << param_var_name;
if (fan_in > 1) {
if (fan_in > 1 && !param_var_name.empty()) {
grad_var_name = this->GetGradVarNameForTrainer(grad_var_name);
}
auto *var = recv_scope.FindVar(grad_var_name);
@ -141,23 +146,35 @@ class ListenAndServOp : public framework::OperatorBase {
PADDLE_THROW("Can not find server side var");
}
detail::DeserializeFromMessage(v.second, dev_ctx, var);
if (var->IsType<framework::SelectedRows>()) {
sparse_vars.push_back(var);
}
}
}
VLOG(3) << "recv " << recv_var_cnt << " parmeters for one barrier.";
// TODO(Yancey1989): merge SelectedRows variables here
if (exit_flag) {
rpc_service_->ShutDown();
}
VLOG(3) << "run optimize graph...";
try {
executor.Run(*program, &recv_scope, block->ID(), /*global_block*/
false /*create_local_scope*/, false /*create_vars*/);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
// Reset the received sparse variables, the sum operator would not
// sum the input sparse variables which rows is empty at the next
// mini-batch.
// TOOD(Yancey1989): move the reset action into an operator, we couldn't
// have any hide logic in the operator.
for (auto &var : sparse_vars) {
var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
}
rpc_service_->SetCond(1);
rpc_service_->WaitClientGet(recv_var_cnt);
rpc_service_->WaitClientGet(update_param_cnt);
grads_counter_.clear();
sparse_vars.clear();
} // while(true)
}

@ -38,22 +38,22 @@ class MultiClassNMSOp : public framework::OperatorWithKernel {
auto box_dims = ctx->GetInputDim("BBoxes");
auto score_dims = ctx->GetInputDim("Scores");
PADDLE_ENFORCE_EQ(box_dims.size(), 2,
"The rank of Input(BBoxes) must be 2.");
PADDLE_ENFORCE_EQ(box_dims.size(), 3,
"The rank of Input(BBoxes) must be 3.");
PADDLE_ENFORCE_EQ(score_dims.size(), 3,
"The rank of Input(Scores) must be 3.");
PADDLE_ENFORCE_EQ(box_dims[1], 4,
PADDLE_ENFORCE_EQ(box_dims[2], 4,
"The 2nd dimension of Input(BBoxes) must be 4, "
"represents the layout of coordinate "
"[xmin, ymin, xmax, ymax]");
PADDLE_ENFORCE_EQ(box_dims[0], score_dims[2],
PADDLE_ENFORCE_EQ(box_dims[1], score_dims[2],
"The 1st dimensiong of Input(BBoxes) must be equal to "
"3rd dimension of Input(Scores), which represents the "
"predicted bboxes.");
// Here the box_dims[0] is not the real dimension of output.
// It will be rewritten in the computing kernel.
ctx->SetOutputDim("Out", {box_dims[0], 6});
ctx->SetOutputDim("Out", {box_dims[1], 6});
}
protected:
@ -260,15 +260,20 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
int64_t batch_size = score_dims[0];
int64_t class_num = score_dims[1];
int64_t predict_dim = score_dims[2];
int64_t box_dim = boxes->dims()[2];
std::vector<std::map<int, std::vector<int>>> all_indices;
std::vector<size_t> batch_starts = {0};
for (int64_t i = 0; i < batch_size; ++i) {
Tensor ins_score = scores->Slice(i, i + 1);
ins_score.Resize({class_num, predict_dim});
Tensor ins_boxes = boxes->Slice(i, i + 1);
ins_boxes.Resize({predict_dim, box_dim});
std::map<int, std::vector<int>> indices;
int num_nmsed_out = 0;
MultiClassNMS(ctx, ins_score, *boxes, indices, num_nmsed_out);
MultiClassNMS(ctx, ins_score, ins_boxes, indices, num_nmsed_out);
all_indices.push_back(indices);
batch_starts.push_back(batch_starts.back() + num_nmsed_out);
}
@ -282,11 +287,15 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
for (int64_t i = 0; i < batch_size; ++i) {
Tensor ins_score = scores->Slice(i, i + 1);
ins_score.Resize({class_num, predict_dim});
Tensor ins_boxes = boxes->Slice(i, i + 1);
ins_boxes.Resize({predict_dim, box_dim});
int64_t s = batch_starts[i];
int64_t e = batch_starts[i + 1];
if (e > s) {
Tensor out = outs->Slice(s, e);
MultiClassOutput(ins_score, *boxes, all_indices[i], &out);
MultiClassOutput(ins_score, ins_boxes, all_indices[i], &out);
}
}
}
@ -303,9 +312,9 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
MultiClassNMSOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("BBoxes",
"(Tensor) A 2-D Tensor with shape [M, 4] represents the "
"predicted locations of M bounding bboxes. Each bounding box "
"has four coordinate values and the layout is "
"(Tensor) A 3-D Tensor with shape [N, M, 4] represents the "
"predicted locations of M bounding bboxes, N is the batch size. "
"Each bounding box has four coordinate values and the layout is "
"[xmin, ymin, xmax, ymax].");
AddInput("Scores",
"(Tensor) A 3-D Tensor with shape [N, C, M] represents the "

@ -24,6 +24,22 @@ limitations under the License. */
namespace paddle {
namespace operators {
static bool IsVariableInitialized(const framework::Scope& scope,
const std::string& varname) {
auto* var = scope.FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
varname);
if (var->IsType<framework::LoDTensor>()) {
return var->Get<framework::LoDTensor>().IsInitialized();
} else if (var->IsType<framework::SelectedRows>()) {
return var->Get<framework::SelectedRows>().value().IsInitialized();
} else {
PADDLE_THROW(
"Variable type in send side should be in "
"[LodTensor, SelectedRows]");
}
return false;
}
class SendOp : public framework::OperatorBase {
public:
@ -51,8 +67,12 @@ class SendOp : public framework::OperatorBase {
detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
for (size_t i = 0; i < ins.size(); i++) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
if (IsVariableInitialized(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
} else {
VLOG(3) << "don't send no-initialied variable: " << ins[i];
}
}
PADDLE_ENFORCE(rpc_client->Wait());

@ -22,7 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/string/printf.h"
#include "paddle/fluid/string/printf.h"
USE_NO_KERNEL_OP(send);
USE_NO_KERNEL_OP(listen_and_serv);

@ -29,7 +29,9 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput("Out"));
PADDLE_ENFORCE(ctx->HasInput("Y"));
framework::DDim out_dim;
out_dim = ctx->GetInputDim("Y");
auto y_dim = ctx->GetInputDim("Y");
out_dim = ctx->GetInputDim("X");
out_dim[0] = y_dim[0];
ctx->ShareLoD("Y", "Out");
ctx->SetOutputDim("Out", out_dim);
}

@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <chrono>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/strided_memcpy.h"
@ -27,18 +28,18 @@ class SplitOpKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto in_stride = framework::stride(in->dims());
auto in_stride = framework::stride_numel(in->dims());
int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
const size_t n = outs.size();
auto place = ctx.GetPlace();
size_t input_offset = 0;
for (size_t i = 0; i < n; i++) {
auto& out = outs[i];
for (auto& out : outs) {
out->mutable_data<T>(ctx.GetPlace());
size_t axis_dim = out->dims()[axis];
auto out_stride = framework::stride(out->dims());
StridedMemcpy<T>(ctx.device_context(), in->data<T>() + input_offset,
in_stride, out->dims(), out_stride, out->data<T>());
input_offset += axis_dim * in_stride[axis];
auto out_stride = framework::stride_numel(out->dims());
StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
out_stride, in->data<T>() + input_offset,
in_stride);
input_offset += out_stride[axis];
}
}
};

@ -22,7 +22,7 @@ class SplitSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker {
SplitSelectedRowsOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input SelectedRows.");
AddOutput("Out", "The outputs of input SelectedRows.").AsDuplicable();
AddOutput("Out", "The outputs of the input SelectedRows.").AsDuplicable();
AddAttr<std::vector<int>>("height_sections",
"Height for each output SelectedRows.")
.SetDefault(std::vector<int>({}));
@ -56,27 +56,6 @@ class SplitSelectedRowsOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasInput("X"), "SplitSelectedRowsOp must has input X.");
PADDLE_ENFORCE(ctx->HasOutputs("Out"),
"SplitSelectedRowsOp must has output Out.");
std::vector<int> height_sections =
ctx->Attrs().Get<std::vector<int>>("height_sections");
int64_t n = ctx->Outputs("Out").size();
std::vector<framework::DDim> outs_dims;
outs_dims.reserve(n);
// make output dims
for (int64_t i = 0; i < n; ++i) {
auto dims = ctx->GetInputDim("X");
if (height_sections.size()) {
PADDLE_ENFORCE_EQ(
height_sections.size(), static_cast<size_t>(n),
"The size of height section should be the same with height"
" section size.");
dims[0] = height_sections[i];
}
outs_dims.push_back(dims);
}
ctx->SetOutputsDim("Out", outs_dims);
}
};

@ -55,6 +55,7 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
for (size_t i = 0; i < outs_rows_idx.size(); ++i) {
auto rows_idx = outs_rows_idx[i];
outs[i]->set_height(height_sections[i]);
if (rows_idx.size() > 0) {
auto dims = x->GetCompleteDims();
dims[0] = rows_idx.size();

@ -41,5 +41,62 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
StridedCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst);
boost::apply_visitor(func, dst_dim);
}
// Strided numel memory copy from src to dst by the specified axis
//
// For example, for a tensor dims [4, 20, 100], the strieded numel is
// [8000, 2000, 100]
//
// NOTE: The src and dst tensor should have the same elements
// except the specified axis.
template <typename T>
inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
int64_t axis, T* dst,
const framework::DDim& dst_stride_numel,
const T* src,
const framework::DDim& src_stride_numel) {
int64_t before = dst_stride_numel[0] / dst_stride_numel[axis];
int64_t src_after = src_stride_numel[axis];
int64_t dst_after = dst_stride_numel[axis];
auto place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(src_stride_numel.size(), dst_stride_numel.size(),
"src and dst tensor should have the same dims size.");
for (int64_t i = 0; i < axis; ++i) {
if (i < axis) {
PADDLE_ENFORCE_EQ(src_stride_numel[i] / src_stride_numel[axis],
dst_stride_numel[i] / dst_stride_numel[axis],
"src and dst should have the same elements "
"except the specified axis.");
} else if (i == axis) {
continue;
} else {
PADDLE_ENFORCE_EQ(src_stride_numel[i], dst_stride_numel[i],
"src and dst should have the same elements "
"except the specified axis.");
}
}
for (int64_t i = 0; i < before; ++i) {
if (platform::is_cpu_place(place)) {
auto& cpu_place = boost::get<platform::CPUPlace>(place);
memory::Copy(cpu_place, dst + i * dst_after, cpu_place,
src + i * src_after, sizeof(T) * src_after);
} else {
#ifdef PADDLE_WITH_CUDA
auto& gpu_place = boost::get<platform::CUDAPlace>(place);
auto& cuda_ctx =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx);
memory::Copy(gpu_place, dst + i * dst_after, gpu_place,
src + i * src_after, sizeof(T) * src_after,
cuda_ctx.stream());
#else
PADDLE_THROW("Paddle is not compiled with GPU");
#endif
}
}
}
} // namespace operators
} // namespace paddle

@ -116,7 +116,9 @@ class SumKernel : public framework::OpKernel<T> {
int64_t offset = 0;
for (int i = 0; i < N; i++) {
auto &sel_row = get_selected_row(i);
if (!sel_row.value().IsInitialized() || sel_row.rows().size() == 0) {
continue;
}
PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
functor(context.template device_context<DeviceContext>(), sel_row,
offset, out);

File diff suppressed because it is too large Load Diff

@ -17,39 +17,41 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
template <typename T, typename WT>
__global__ void NegTargetAssignKernel(const int* neg_indices, const size_t* lod,
const int num, const int num_prior_box,
const int background_label,
int* out_label, T* out_label_wt) {
const int N, const int M, const int K,
const int mismatch_value, T* out,
WT* out_wt) {
int bidx = blockIdx.x;
int st = lod[bidx];
int ed = lod[bidx + 1];
int row_start = bidx * num_prior_box;
int row_start = bidx * M;
for (int i = st + threadIdx.x; i < ed; i += blockDim.x) {
int id = row_start + neg_indices[i];
out_label[id] = background_label;
out_label_wt[id] = 1.;
for (int k = 0; k < K; ++k) {
out[id * K + k] = T(mismatch_value);
out_wt[id * K + k] = WT(1.);
}
}
}
template <typename T>
struct NegTargetAssignFunctor<platform::CUDADeviceContext, T> {
template <typename T, typename WT>
struct NegTargetAssignFunctor<platform::CUDADeviceContext, T, WT> {
void operator()(const platform::CUDADeviceContext& ctx,
const int* neg_indices, const size_t* lod, const int num,
const int num_prior_box, const int background_label,
int* out_label, T* out_label_wt) {
const int* neg_indices, const size_t* lod, const int N,
const int M, const int K, const int mismatch_value, T* out,
WT* out_wt) {
const int block_size = 256;
const int grid_size = num;
NegTargetAssignKernel<T><<<grid_size, block_size, 0, ctx.stream()>>>(
neg_indices, lod, num, num_prior_box, background_label, out_label,
out_label_wt);
const int grid_size = N;
NegTargetAssignKernel<T, WT><<<grid_size, block_size, 0, ctx.stream()>>>(
neg_indices, lod, N, M, K, mismatch_value, out, out_wt);
}
};
template struct NegTargetAssignFunctor<platform::CUDADeviceContext, float>;
template struct NegTargetAssignFunctor<platform::CUDADeviceContext, double>;
template struct NegTargetAssignFunctor<platform::CUDADeviceContext, int, float>;
template struct NegTargetAssignFunctor<platform::CUDADeviceContext, float,
float>;
} // namespace operators
} // namespace paddle
@ -57,5 +59,5 @@ template struct NegTargetAssignFunctor<platform::CUDADeviceContext, double>;
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
target_assign,
ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, float>,
ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, double>);
ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, int, float>,
ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, float, float>);

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save