/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <algorithm>
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/segment_pooling.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_launch_config.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;

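// Computes MEAN segment pooling. Each CUDA "stripe" covers DimTileSize
// consecutive input rows at a single inner-dimension offset. Threads with
// segment_offset == 0 accumulate the per-segment row counts into summed_ids;
// every thread accumulates its column's values per segment and divides by the
// corresponding count in summed_ids when writing output. Segments that span
// stripe boundaries are combined with atomic adds.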
template <typename T, typename Index, int DimTileSize>
__global__ void SegmentMeanCustomKernel(
    const Index* segment_ids, const T* input, T* output, T* summed_ids,
    const Index input_length_size, const Index inner_dim_size,
    const Index output_length_size, const Index total_stripe_count) {
  CUDA_KERNEL_LOOP(stripe_index, total_stripe_count) {
    const Index segment_offset = stripe_index % inner_dim_size;
    const Index dim_index_base =
        stripe_index / inner_dim_size * Index(DimTileSize);
    const Index actual_height =
        min(Index(DimTileSize), input_length_size - dim_index_base);

    Index first_segment_id = segment_ids[dim_index_base];
    Index last_segment_id = -1;
    if (dim_index_base > 0) {
      last_segment_id = segment_ids[dim_index_base - 1];
    }
    if (segment_offset == 0) {
      T sum = T(0);
      for (Index j = 0; j < actual_height; j++) {
        Index current_segment_id = segment_ids[dim_index_base + j];
        // Note(ZHUI): following check may cause
        // cudaErrorLaunchOutOfResources.
        // PADDLE_ENFORCE(current_segment_id >= last_segment_id,
        //                "the segment ids should be sorted, but got "
        //                "segment_ids[%d]:%d > segment_ids[%d]:%d.",
        //                dim_index_base + j - 1, dim_index_base + j,
        //                last_segment_id, current_segment_id);

        if (j > 0 && current_segment_id > last_segment_id) {
          if (last_segment_id == first_segment_id) {
            platform::CudaAtomicAdd(summed_ids + last_segment_id, sum);
          } else {
            *(summed_ids + last_segment_id) = sum;
          }
          sum = T(0);
        }
        sum += T(1);
        last_segment_id = current_segment_id;
      }
      platform::CudaAtomicAdd(summed_ids + last_segment_id, sum);
    }
    // ensure last_segment_id is the largest
    last_segment_id = output_length_size;
    __syncthreads();
    T sum = T(0);
    for (Index j = 0; j < actual_height; j++) {
      Index current_segment_id = segment_ids[dim_index_base + j];
      if (current_segment_id > last_segment_id) {
        const Index output_index =
            last_segment_id * inner_dim_size + segment_offset;
        if (last_segment_id == first_segment_id) {
          platform::CudaAtomicAdd(output + output_index,
                                  sum / *(summed_ids + last_segment_id));
        } else {
          *(output + output_index) = sum / *(summed_ids + last_segment_id);
        }
        sum = T(0);
      }
      sum += input[(dim_index_base + j) * inner_dim_size + segment_offset];
      last_segment_id = current_segment_id;
    }
    const Index output_index =
        last_segment_id * inner_dim_size + segment_offset;
    platform::CudaAtomicAdd(output + output_index,
                            sum / *(summed_ids + last_segment_id));
  }
}

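// Generic segment pooling kernel shared by SUM/MAX/MIN. The Helper maps a
// stripe index to (segment_offset, dim_index_base, actual_height), and the
// Pool policy supplies the identity value, the elementwise reduction step,
// and the matching atomic update. Output rows whose segment id never occurs
// are zero-filled; segments that cross stripe boundaries are merged with the
// pool's atomic operation.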
template <typename T, typename Index, typename Helper, typename Pool>
__global__ void SegmentOpsKernel(const Index* segment_ids, const T* input,
                                 T* output, Helper h, Pool pool) {
  CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) {
    Index segment_offset, dim_index_base, actual_height;
    Index inner_dim_size = h.inner_dim_size;
    h.calculate(stripe_index, &segment_offset, &dim_index_base, &actual_height);

    T minmax = pool.initial();
    Index first_segment_id = segment_ids[dim_index_base];
    // -1 is for the start value when interval_id = 0
    Index last_segment_id = -1;
    if (dim_index_base > 0) {
      last_segment_id = segment_ids[dim_index_base - 1];
    }

    for (Index j = 0; j < actual_height; j++) {
      Index current_segment_id = segment_ids[dim_index_base + j];
      // ensure the segment_ids is sorted.
      PADDLE_ENFORCE(current_segment_id >= last_segment_id,
                     "The segment ids should be sorted, but got "
                     "segment_ids[%d]:%d > segment_ids[%d]:%d.",
                     dim_index_base + j - 1, dim_index_base + j,
                     last_segment_id, current_segment_id);

      if (current_segment_id > last_segment_id) {
        // reset the interval value which do not have corresponding ids.
        for (Index interval_id = last_segment_id + 1;
             interval_id < current_segment_id; ++interval_id) {
          *(output + interval_id * inner_dim_size + segment_offset) = 0;
        }
        // don't update result when j=0
        if (j > 0) {
          const Index output_index =
              last_segment_id * inner_dim_size + segment_offset;
          if (last_segment_id == first_segment_id) {
            pool.atomic(output + output_index, minmax);
          } else {
            *(output + output_index) = minmax;
          }
          minmax = pool.initial();
        }
      }
      pool.compute(
          input[(dim_index_base + j) * inner_dim_size + segment_offset],
          &minmax);
      last_segment_id = current_segment_id;
    }
    const Index output_index =
        last_segment_id * inner_dim_size + segment_offset;
    pool.atomic(output + output_index, minmax);
  }
}

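// Gradient kernel for index-based pooling (MAX/MIN): an input element
// receives the upstream gradient only if its value equals the pooled output
// of its segment, i.e. it was the element selected in the forward pass.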
template <typename T, typename Index, typename Helper>
__global__ void SegmentIndexGradKernel(const Index* segment_ids, const T* input,
                                       const T* output, const T* out_grad,
                                       T* in_grad, Helper h) {
  CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) {
    Index segment_offset, dim_index_base, actual_height;
    h.calculate(stripe_index, &segment_offset, &dim_index_base, &actual_height);

    for (Index j = 0; j < actual_height; j++) {
      Index current_segment_id = segment_ids[dim_index_base + j];
      Index input_index =
          (dim_index_base + j) * h.inner_dim_size + segment_offset;
      Index output_index =
          current_segment_id * h.inner_dim_size + segment_offset;
      if (input[input_index] == output[output_index]) {
        in_grad[input_index] = out_grad[output_index];
      }
    }
  }
}

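// Pooling policies plugged into SegmentOpsKernel: each defines the initial
// (identity) value, the in-register reduction step, and the matching CUDA
// atomic reduction.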
template <class T>
class MaxPool {
 public:
  DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
  DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; }
  DEVICE inline T atomic(T* address, const T val) {
    return platform::CudaAtomicMax(address, val);
  }
};

template <class T>
class MinPool {
 public:
  DEVICE inline T initial() { return static_cast<T>(FLT_MAX); }
  DEVICE inline void compute(const T& x, T* y) { *y = *y < x ? *y : x; }
  DEVICE inline T atomic(T* address, const T val) {
    return platform::CudaAtomicMin(address, val);
  }
};

template <class T>
class SumPool {
 public:
  DEVICE inline T initial() { return static_cast<T>(0); }
  DEVICE inline void compute(const T& x, T* y) { *y = *y + x; }
  DEVICE inline T atomic(T* address, const T val) {
    return platform::CudaAtomicAdd(address, val);
  }
};

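// Splits the flattened input into stripes of DimTileSize rows per inner
// dimension column and, on the device, maps a stripe index back to the
// (segment_offset, dim_index_base, actual_height) triple used by the kernels.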
template <class T>
class ArrangeHelper {
 public:
  const T input_total_size;
  const T input_length_size;
  const T output_length_size;
  T inner_dim_size;
  T total_stripe_count;
  const T DimTileSize = 8;

  ArrangeHelper(T a, T b, T c)
      : input_total_size(a), input_length_size(b), output_length_size(c) {
    T input_outer_dim_num_stripe =
        (input_length_size + DimTileSize - 1) / DimTileSize;
    inner_dim_size = input_total_size / input_length_size;
    total_stripe_count = inner_dim_size * input_outer_dim_num_stripe;
  }

  DEVICE inline void calculate(T stripe_index, T* segment_offset,
                               T* dim_index_base, T* actual_height) {
    *segment_offset = stripe_index % inner_dim_size;
    *dim_index_base = stripe_index / inner_dim_size * DimTileSize;
    *actual_height = min(DimTileSize, input_length_size - *dim_index_base);
  }
};

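// Launches SegmentIndexGradKernel to compute gradients for MAX/MIN pooling.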
template <typename T, typename Index>
void SegmentPoolCUDAGradFunctor(const platform::CUDADeviceContext& ctx,
                                const framework::Tensor& input,
                                const framework::Tensor& segment_ids,
                                const framework::Tensor& output,
                                const framework::Tensor& out_grad,
                                framework::Tensor* in_grad,
                                const std::string pooltype = "SUM") {
  auto h = ArrangeHelper<Index>(input.numel(), segment_ids.dims()[0],
                                output.dims()[0]);
  auto config = platform::GetGpuLaunchConfig1D(ctx, h.total_stripe_count);
  if (pooltype == "MAX" || pooltype == "MIN") {
    SegmentIndexGradKernel<T, Index, ArrangeHelper<Index>><<<
        config.block_per_grid.x, config.thread_per_block.x, 0, ctx.stream()>>>(
        segment_ids.data<Index>(), input.data<T>(), output.data<T>(),
        out_grad.data<T>(), in_grad->data<T>(), h);
  } else {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "Unsupported segment pooling grad operation, Only MAX, MIN "
        "available, but got %s.",
        pooltype));
  }
}

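// Divides every row x[i, 0:dim] by the scalar y[i]; one block processes one
// row per loop iteration, staging y[i] in shared memory. Used by the MEAN
// gradient to divide the upstream gradient by the segment sizes.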
template <typename T>
__global__ void SimpleDiv(T* x, const T* y, const int len, const int dim) {
  for (int i = blockIdx.x; i < len; i += gridDim.x) {
    __shared__ T y_i;
    auto base = i * dim;
    if (threadIdx.x == 0) {
      y_i = y[i];
    }
    __syncthreads();
    for (int j = threadIdx.x; j < dim; j += blockDim.x) {
      x[base + j] /= y_i;
    }
  }
}

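// Forward segment pooling on the GPU: MEAN dispatches to the custom kernel
// (which also fills summed_ids with the segment sizes), while SUM/MAX/MIN
// dispatch to SegmentOpsKernel with the corresponding pool policy.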
template <typename T, typename IndexT>
class SegmentPoolFunctor<platform::CUDADeviceContext, T, IndexT> {
 public:
  void operator()(const platform::CUDADeviceContext& ctx,
                  const framework::Tensor& input,
                  const framework::Tensor& segment_ids,
                  framework::Tensor* output,
                  framework::Tensor* summed_ids = nullptr,
                  const std::string pooltype = "SUM") {
    auto h = ArrangeHelper<IndexT>(input.numel(), segment_ids.dims()[0],
                                   output->dims()[0]);
    auto config = platform::GetGpuLaunchConfig1D(ctx, h.total_stripe_count);
    if (pooltype == "MEAN") {
      SegmentMeanCustomKernel<
          T, IndexT, IndexT(8)><<<config.block_per_grid.x,
                                  config.thread_per_block.x, 0, ctx.stream()>>>(
          segment_ids.data<IndexT>(), input.data<T>(), output->data<T>(),
          summed_ids->data<T>(), h.input_length_size, h.inner_dim_size,
          h.output_length_size, h.total_stripe_count);
    } else if (pooltype == "SUM") {
      SumPool<T> pool;
      SegmentOpsKernel<
          T, IndexT, ArrangeHelper<IndexT>,
          SumPool<T>><<<config.block_per_grid.x, config.thread_per_block.x, 0,
                        ctx.stream()>>>(segment_ids.data<IndexT>(),
                                        input.data<T>(), output->data<T>(), h,
                                        pool);
    } else if (pooltype == "MAX") {
      MaxPool<T> pool;
      SegmentOpsKernel<
          T, IndexT, ArrangeHelper<IndexT>,
          MaxPool<T>><<<config.block_per_grid.x, config.thread_per_block.x, 0,
                        ctx.stream()>>>(segment_ids.data<IndexT>(),
                                        input.data<T>(), output->data<T>(), h,
                                        pool);
    } else if (pooltype == "MIN") {
      MinPool<T> pool;
      SegmentOpsKernel<
          T, IndexT, ArrangeHelper<IndexT>,
          MinPool<T>><<<config.block_per_grid.x, config.thread_per_block.x, 0,
                        ctx.stream()>>>(segment_ids.data<IndexT>(),
                                        input.data<T>(), output->data<T>(), h,
                                        pool);
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Unsupported segment pooling operation, Only MEAN, SUM, MAX, MIN "
          "available, but got %s.",
          pooltype));
    }
  }
};

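// Backward segment pooling on the GPU: MAX/MIN reuse the index-based gradient
// kernel, MEAN divides the upstream gradient by the segment sizes stored in
// summed_ids and gathers it back to the input rows, and SUM gathers the
// upstream gradient directly.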
template <typename T, typename IndexT>
class SegmentPoolGradFunctor<platform::CUDADeviceContext, T, IndexT> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& out_grad,
                  const framework::Tensor& segments, framework::Tensor* in_grad,
                  const framework::Tensor* summed_ids = nullptr,
                  const std::string pooltype = "SUM") {
    if (pooltype == "MAX" || pooltype == "MIN") {
      SegmentPoolCUDAGradFunctor<T, IndexT>(context, input, segments, output,
                                            out_grad, in_grad, pooltype);
    } else if (pooltype == "MEAN") {
      framework::Tensor mean_grad;
      mean_grad.mutable_data<T>(input.dims(), context.GetPlace());
      framework::TensorCopy(out_grad, context.GetPlace(), context, &mean_grad);
      int len = output.dims()[0];
      int dim = output.numel() / len;
      auto config = platform::GetGpuLaunchConfig1D(context, len);
      SimpleDiv<T><<<config.block_per_grid.x, config.thread_per_block.x, 0,
                     context.stream()>>>(mean_grad.data<T>(),
                                         summed_ids->data<T>(), len, dim);
      GPUGather<T, IndexT>(context, mean_grad, segments, in_grad);
    } else if (pooltype == "SUM") {
      GPUGather<T, IndexT>(context, out_grad, segments, in_grad);
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Unsupported segment pooling operation, Only MEAN, SUM, MAX, MIN "
          "available, but got %s.",
          pooltype));
    }
  }
};

using CUDA = paddle::platform::CUDADeviceContext;
template class SegmentPoolFunctor<CUDA, float, int>;
template class SegmentPoolFunctor<CUDA, float, int64_t>;
template class SegmentPoolFunctor<CUDA, double, int>;
template class SegmentPoolFunctor<CUDA, double, int64_t>;
template class SegmentPoolGradFunctor<CUDA, float, int>;
template class SegmentPoolGradFunctor<CUDA, float, int64_t>;
template class SegmentPoolGradFunctor<CUDA, double, int>;
template class SegmentPoolGradFunctor<CUDA, double, int64_t>;

}  // namespace operators
}  // namespace paddle