/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <algorithm>

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/grid_sampler_op.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_info.h"

namespace paddle {
namespace operators {

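// Checks whether the (h, w) location lies inside an H x W feature map.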
static __forceinline__ __device__ bool in_bounds(int h, int w, int H, int W) {
  return h >= 0 && h < H && w >= 0 && w < W;
}

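// Bounds-checked atomic accumulation into a 2-D slice: adds `delta` at
// (h, w) using strides (sH, sW) and skips out-of-range locations. Atomics
// are needed because several sampling points may touch the same pixel.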
template <typename T>
static __forceinline__ __device__ void atomic_add(T* data, int h, int w, int sH,
                                                  int sW, int H, int W,
                                                  T delta) {
  if (in_bounds(h, w, H, W)) {
    platform::CudaAtomicAdd(data + h * sH + w * sW, delta);
  }
}

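// Maps a normalized coordinate in [-1, 1] to an absolute pixel index.
// With align_corners, -1/+1 map to the centers of the corner pixels;
// otherwise they map to the outer edges of the corner pixels.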
template <typename T>
static __forceinline__ __device__ T _unnormalize(T coord, int size,
                                                 bool align_corners) {
  if (align_corners) {
    return ((coord + 1.f) / 2) * (size - 1);
  } else {
    return ((coord + 1.f) * size - 1) / 2;
  }
}

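// Clamps an index into the valid range [0, max_value] (border padding).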
template <typename T>
static __forceinline__ __device__ T clip_indexes(T in, int max_value) {
  return min(static_cast<T>(max_value), max(in, static_cast<T>(0)));
}

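// Folds an out-of-range index back into range by reflecting it across the
// interval [twice_low / 2, twice_high / 2]; the bounds arrive doubled so
// that half-pixel borders can be expressed with integers.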
template <typename T>
static __forceinline__ __device__ T reflect_indexes(T in, int twice_low,
                                                    int twice_high) {
  if (twice_low == twice_high) {
    return static_cast<T>(0);
  }
  T min = static_cast<T>(twice_low) / 2;
  T span = static_cast<T>(twice_high - twice_low) / 2;
  in = fabs(in - min);
  T extra = fmod(in, span);
  int flips = static_cast<int>(floor(in / span));
  if (flips % 2 == 0) {
    return extra + min;
  } else {
    return span - extra + min;
  }
}

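// Converts one normalized grid coordinate into a concrete input index for
// the given padding mode: border clamps, reflect folds and then clamps,
// and zeros leaves out-of-range indexes for the caller's bounds checks.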
template <typename T>
static __forceinline__ __device__ T compute_positions(T coord, int size,
                                                      PaddingMode padding_mode,
                                                      bool align_corners) {
  coord = _unnormalize<T>(coord, size, align_corners);
  if (padding_mode == PaddingMode::border) {
    coord = clip_indexes(coord, size - 1);
  } else if (padding_mode == PaddingMode::reflect) {
    if (align_corners) {
      coord = reflect_indexes(coord, 0, 2 * (size - 1));
    } else {
      coord = reflect_indexes(coord, -1, 2 * size - 1);
    }
    coord = clip_indexes(coord, size - 1);
  }
  return coord;
}

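// Same mapping as _unnormalize, but also writes d(index)/d(coord) into
// *grad_in for the backward pass.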
template <typename T>
static __forceinline__ __device__ T _unnormalize_with_mask(T coord, int size,
                                                           bool align_corners,
                                                           T* grad_in) {
  if (align_corners) {
    *grad_in = static_cast<T>(size - 1) / 2;
    return ((coord + 1.f) / 2) * (size - 1);
  } else {
    *grad_in = static_cast<T>(size) / 2;
    return ((coord + 1.f) * size - 1) / 2;
  }
}

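// Clamp variant for the backward pass: also reports d(out)/d(in), which is
// 1 inside [0, clip_limit - 1] and 0 where the value was clamped.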
template <typename T>
static __forceinline__ __device__ T clip_indexes_with_mask(T in, int clip_limit,
                                                           T* grad_in) {
  if (in <= static_cast<T>(0)) {
    *grad_in = static_cast<T>(0);
    return static_cast<T>(0);
  } else {
    T max = static_cast<T>(clip_limit - 1);
    if (in >= max) {
      *grad_in = static_cast<T>(0);
      return max;
    } else {
      *grad_in = static_cast<T>(1);
      return in;
    }
  }
}

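// Reflection variant for the backward pass: *grad_in receives +1 or -1
// depending on whether the final fold flips the coordinate's direction.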
template <typename T>
static __forceinline__ __device__ T
reflect_indexes_with_mask(T in, int twice_low, int twice_high, T* grad_in) {
  if (twice_low == twice_high) {
    *grad_in = static_cast<T>(0);
    return static_cast<T>(0);
  }
  int grad_in_mult_;
  T min = static_cast<T>(twice_low) / 2;
  T span = static_cast<T>(twice_high - twice_low) / 2;
  in = in - min;
  if (in < static_cast<T>(0)) {
    grad_in_mult_ = -1;
    in = -in;
  } else {
    grad_in_mult_ = 1;
  }
  T extra = fmod(in, span);
  int flips = static_cast<int>(floor(in / span));
  if (flips % 2 == 0) {
    *grad_in = static_cast<T>(grad_in_mult_);
    return extra + min;
  } else {
    *grad_in = static_cast<T>(-grad_in_mult_);
    return span - extra + min;
  }
}

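// Backward-pass companion of compute_positions: returns the resolved index
// and accumulates the chain-rule factor of every applied transform
// (unnormalize, reflect, clip) into *grad_in.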
template <typename T>
static __forceinline__ __device__ T
compute_positions_with_mask(T coord, int size, PaddingMode padding_mode,
                            bool align_corners, T* grad_in) {
  T grad_clip, grad_refl;
  coord = _unnormalize_with_mask<T>(coord, size, align_corners, grad_in);
  if (padding_mode == PaddingMode::border) {
    coord = clip_indexes_with_mask(coord, size, &grad_clip);
    *grad_in = (*grad_in) * grad_clip;
  } else if (padding_mode == PaddingMode::reflect) {
    if (align_corners) {
      coord = reflect_indexes_with_mask(coord, 0, 2 * (size - 1), &grad_refl);
    } else {
      coord = reflect_indexes_with_mask(coord, -1, 2 * size - 1, &grad_refl);
    }
    coord = clip_indexes_with_mask(coord, size, &grad_clip);
    *grad_in = (*grad_in) * grad_refl * grad_clip;
  }

  return coord;
}

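// Forward kernel. Each thread handles one (n, h, w) output location: it
// reads the sampling point from `grid`, resolves it to input coordinates,
// then either blends the four neighboring input pixels with bilinear
// weights or copies the nearest pixel, looping over all channels.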
template <typename T>
__global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c,
                                        int out_h, int out_w, int in_h,
                                        int in_w, const T* input, const T* grid,
                                        T* output, const Mode mode,
                                        const PaddingMode padding_mode,
                                        bool align_corners) {
  int inp_sN = out_c * in_h * in_w;
  int inp_sC = in_h * in_w;
  int inp_sH = in_w;
  int inp_sW = 1;
  int grid_sN = out_h * out_w * 2;
  int grid_sH = out_w * 2;
  int grid_sW = 2;
  int grid_sCoor = 1;
  int out_sN = out_c * out_h * out_w;
  int out_sC = out_h * out_w;
  int out_sH = out_w;
  int out_sW = 1;

  CUDA_KERNEL_LOOP(index, nthreads) {
    const int w = index % out_w;
    const int h = (index / out_w) % out_h;
    const int n = index / (out_h * out_w);
    const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;

    T ix = grid[grid_offset];
    T iy = grid[grid_offset + grid_sCoor];

    ix = compute_positions(ix, in_w, padding_mode, align_corners);
    iy = compute_positions(iy, in_h, padding_mode, align_corners);

    if (mode == Mode::bilinear) {
      int ix_nw = static_cast<int>(floor(ix));
      int iy_nw = static_cast<int>(floor(iy));
      int ix_ne = ix_nw + 1;
      int iy_ne = iy_nw;
      int ix_sw = ix_nw;
      int iy_sw = iy_nw + 1;
      int ix_se = ix_nw + 1;
      int iy_se = iy_nw + 1;

      T nw = (ix_se - ix) * (iy_se - iy);
      T ne = (ix - ix_sw) * (iy_sw - iy);
      T sw = (ix_ne - ix) * (iy - iy_ne);
      T se = (ix - ix_nw) * (iy - iy_nw);

      auto inp_offset_NC = n * inp_sN;
      auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
      for (int c = 0; c < out_c;
           ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
        *out_ptr_NCHW = static_cast<T>(0);
        if (in_bounds(iy_nw, ix_nw, in_h, in_w)) {
          *out_ptr_NCHW +=
              input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw;
        }
        if (in_bounds(iy_ne, ix_ne, in_h, in_w)) {
          *out_ptr_NCHW +=
              input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne;
        }
        if (in_bounds(iy_sw, ix_sw, in_h, in_w)) {
          *out_ptr_NCHW +=
              input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw;
        }
        if (in_bounds(iy_se, ix_se, in_h, in_w)) {
          *out_ptr_NCHW +=
              input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se;
        }
      }
    } else if (mode == Mode::nearest) {
      int ix_nearest = static_cast<int>(std::nearbyint(ix));
      int iy_nearest = static_cast<int>(std::nearbyint(iy));
      auto inp_offset_NC = n * inp_sN;
      auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
      for (int c = 0; c < out_c;
           ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
        if (in_bounds(iy_nearest, ix_nearest, in_h, in_w)) {
          *out_ptr_NCHW =
              input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW];
        } else {
          *out_ptr_NCHW = static_cast<T>(0);
        }
      }
    }
  }
}

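// Forward op: decodes the string attributes into enums, zero-fills the
// output, and launches the sampling kernel with one thread per output
// spatial location.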
template <typename T>
class GridSampleOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.cuda_device_context();
    auto align_corners = ctx.Attr<bool>("align_corners");
    auto padding_mode_s = ctx.Attr<std::string>("padding_mode");
    auto mode_s = ctx.Attr<std::string>("mode");
    PaddingMode padding_mode;
    Mode mode;
    if (padding_mode_s == "border") {
      padding_mode = PaddingMode::border;
    } else if (padding_mode_s == "reflection") {
      padding_mode = PaddingMode::reflect;
    } else {
      padding_mode = PaddingMode::zeros;
    }

    if (mode_s == "nearest") {
      mode = Mode::nearest;
    } else {
      mode = Mode::bilinear;
    }

    auto* input = ctx.Input<Tensor>("X");
    auto* grid = ctx.Input<Tensor>("Grid");
    const int n = grid->dims()[0];
    const int out_h = grid->dims()[1];
    const int out_w = grid->dims()[2];
    const int c = input->dims()[1];
    const int in_h = input->dims()[2];
    const int in_w = input->dims()[3];
    VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h
            << "; out_w: " << out_w;
    auto* output = ctx.Output<Tensor>("Output");
    auto* output_data = output->mutable_data<T>(ctx.GetPlace());

    VLOG(3) << "set constant";
    math::SetConstant<paddle::platform::CUDADeviceContext, T>()(
        dev_ctx, output, static_cast<T>(0));
    int count = static_cast<int>(n * out_h * out_w);

    auto cu_stream = dev_ctx.stream();

    int block = 512;
    int grid_size = (count + block - 1) / block;
    // Launch configuration is <<<blocks, threads-per-block>>>; the original
    // had the two arguments swapped.
    grid_sample_cuda_kernel<T><<<grid_size, block, 0, cu_stream>>>(
        count, n, c, out_h, out_w, in_h, in_w, input->data<T>(),
        grid->data<T>(), output_data, mode, padding_mode, align_corners);
  }
};

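// Backward kernel, mirroring the forward pass. Input gradients are
// scattered with atomic adds since several sampling points can hit the
// same input pixel; grid gradients combine the bilinear weight derivatives
// with the masks from compute_positions_with_mask. In nearest mode the
// grid gradient is identically zero.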
template <typename T>
__global__ void grid_sampler_cuda_backward_kernel(
    const int nthreads, const T* grad_output, const T* input, const T* grid,
    int n, int out_c, int out_h, int out_w, int in_h, int in_w, T* grad_input,
    T* grad_grid, const Mode mode, const PaddingMode padding_mode,
    bool align_corners) {
  int inp_sN = out_c * in_h * in_w;
  int inp_sC = in_h * in_w;
  int inp_sH = in_w;
  int inp_sW = 1;
  int grid_sN = out_h * out_w * 2;
  int grid_sH = out_w * 2;
  int grid_sW = 2;
  int grid_sCoor = 1;

  int gOut_sN = out_c * out_h * out_w;
  int gOut_sC = out_h * out_w;
  int gOut_sH = out_w;
  int gOut_sW = 1;

  CUDA_KERNEL_LOOP(index, nthreads) {
    const int w = index % out_w;
    const int h = (index / out_w) % out_h;
    const int n = index / (out_h * out_w);
    const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;

    T ix = grid[grid_offset];
    T iy = grid[grid_offset + grid_sCoor];

    T gix_mult, giy_mult;
    ix = compute_positions_with_mask(ix, in_w, padding_mode, align_corners,
                                     &gix_mult);
    iy = compute_positions_with_mask(iy, in_h, padding_mode, align_corners,
                                     &giy_mult);

    if (mode == Mode::bilinear) {
      int ix_nw = static_cast<int>(floor(ix));
      int iy_nw = static_cast<int>(floor(iy));
      int ix_ne = ix_nw + 1;
      int iy_ne = iy_nw;
      int ix_sw = ix_nw;
      int iy_sw = iy_nw + 1;
      int ix_se = ix_nw + 1;
      int iy_se = iy_nw + 1;

      T nw = (ix_se - ix) * (iy_se - iy);
      T ne = (ix - ix_sw) * (iy_sw - iy);
      T sw = (ix_ne - ix) * (iy - iy_ne);
      T se = (ix - ix_nw) * (iy - iy_nw);

      T gix = static_cast<T>(0), giy = static_cast<T>(0);
      int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
      T* gInp_ptr_NC = grad_input + n * inp_sN;
      int inp_offset_NC = n * inp_sN;
      for (int c = 0; c < out_c; ++c, inp_offset_NC += inp_sC,
               gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
        T gOut = grad_output[gOut_offset];

        atomic_add(gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w,
                   nw * gOut);
        atomic_add(gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w,
                   ne * gOut);
        atomic_add(gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w,
                   sw * gOut);
        atomic_add(gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w,
                   se * gOut);

        if (in_bounds(iy_nw, ix_nw, in_h, in_w)) {
          T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW];
          gix -= nw_val * (iy_se - iy) * gOut;
          giy -= nw_val * (ix_se - ix) * gOut;
        }
        if (in_bounds(iy_ne, ix_ne, in_h, in_w)) {
          T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW];
          gix += ne_val * (iy_sw - iy) * gOut;
          giy -= ne_val * (ix - ix_sw) * gOut;
        }
        if (in_bounds(iy_sw, ix_sw, in_h, in_w)) {
          T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW];
          gix -= sw_val * (iy - iy_ne) * gOut;
          giy += sw_val * (ix_ne - ix) * gOut;
        }
        if (in_bounds(iy_se, ix_se, in_h, in_w)) {
          T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW];
          gix += se_val * (iy - iy_nw) * gOut;
          giy += se_val * (ix - ix_nw) * gOut;
        }
      }

      if (grad_grid != nullptr) {
        T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
        gGrid_ptr_NHW[0] = gix_mult * gix;
        gGrid_ptr_NHW[1] = giy_mult * giy;
      }
    } else if (mode == Mode::nearest) {
      int ix_nearest = static_cast<int>(std::nearbyint(ix));
      int iy_nearest = static_cast<int>(std::nearbyint(iy));

      int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
      T* gInp_ptr_NC = grad_input + n * inp_sN;
      for (int c = 0; c < out_c;
           ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
        atomic_add(gInp_ptr_NC, iy_nearest, ix_nearest, inp_sH, inp_sW, in_h,
                   in_w, grad_output[gOut_offset]);
      }

      if (grad_grid != nullptr) {
        T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
        gGrid_ptr_NHW[0] = static_cast<T>(0);
        gGrid_ptr_NHW[1] = static_cast<T>(0);
      }
    }
  }
}

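// Backward op: zero-initializes the input gradient (and the grid gradient
// when it is requested) before launching the backward kernel, because the
// kernel accumulates into them atomically.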
template <typename T>
class GridSampleGradOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.cuda_device_context();
    auto align_corners = ctx.Attr<bool>("align_corners");
    auto padding_mode_s = ctx.Attr<std::string>("padding_mode");
    auto mode_s = ctx.Attr<std::string>("mode");

    PaddingMode padding_mode;
    Mode mode;
    if (padding_mode_s == "border") {
      padding_mode = PaddingMode::border;
    } else if (padding_mode_s == "reflection") {
      padding_mode = PaddingMode::reflect;
    } else {
      padding_mode = PaddingMode::zeros;
    }

    if (mode_s == "nearest") {
      mode = Mode::nearest;
    } else {
      mode = Mode::bilinear;
    }

    auto* input = ctx.Input<Tensor>("X");
    auto* grid = ctx.Input<Tensor>("Grid");
    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));

    const int n = grid->dims()[0];
    const int out_h = grid->dims()[1];
    const int out_w = grid->dims()[2];
    const int c = input->dims()[1];
    const int in_h = input->dims()[2];
    const int in_w = input->dims()[3];

    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
    input_grad->mutable_data<T>(ctx.GetPlace());
    math::SetConstant<paddle::platform::CUDADeviceContext, T>()(
        ctx.template device_context<paddle::platform::CUDADeviceContext>(),
        input_grad, static_cast<T>(0));

    T* grid_grad_data = nullptr;
    if (ctx.HasOutput(framework::GradVarName("Grid"))) {
      auto* grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
      grid_grad_data = grid_grad->mutable_data<T>(ctx.GetPlace());
      math::SetConstant<paddle::platform::CUDADeviceContext, T>()(
          ctx.template device_context<paddle::platform::CUDADeviceContext>(),
          grid_grad, static_cast<T>(0));
    }

    int count = static_cast<int>(n * out_h * out_w);
    auto cu_stream = dev_ctx.stream();
    int block = 512;
    int grid_size = (count + block - 1) / block;
    // Launch configuration is <<<blocks, threads-per-block>>>; the original
    // had the two arguments swapped.
    grid_sampler_cuda_backward_kernel<T><<<grid_size, block, 0, cu_stream>>>(
        count, output_grad->data<T>(), input->data<T>(), grid->data<T>(), n, c,
        out_h, out_w, in_h, in_w, input_grad->data<T>(), grid_grad_data, mode,
        padding_mode, align_corners);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_CUDA_KERNEL(grid_sampler, ops::GridSampleOpCUDAKernel<float>,
                        ops::GridSampleOpCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(grid_sampler_grad,
                        ops::GridSampleGradOpCUDAKernel<float>,
                        ops::GridSampleGradOpCUDAKernel<double>);