@@ -31,10 +31,6 @@ static inline int NumBlocks(const int N) {
                   kNumMaxinumNumBlocks);
 }
 
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 template <typename T>
 __global__ void GPUNLLLossForward1D_no_reduce(T* out_data, const T* x_data,
                                               const int64_t* label_data,
@@ -42,7 +38,7 @@ __global__ void GPUNLLLossForward1D_no_reduce(T* out_data, const T* x_data,
                                               const int64_t batch_size,
                                               const int64_t n_classes,
                                               const int64_t ignore_index) {
-  CUDA_1D_KERNEL_LOOP(i, batch_size) {
+  CUDA_KERNEL_LOOP(i, batch_size) {
     const int64_t cur_label = label_data[i];
     if (cur_label == ignore_index) {
       out_data[i] = 0;
@@ -191,7 +187,7 @@ __global__ void GPUNLLLossForward2D_no_reduce(
   const int64_t map_size = in_dim2 * in_dim3;
   const int64_t sample_size = n_classes * map_size;
   const int64_t out_numel = batch_size * map_size;
-  CUDA_1D_KERNEL_LOOP(i, out_numel) {
+  CUDA_KERNEL_LOOP(i, out_numel) {
     const int64_t b = i % batch_size;
     const int64_t h = (i / batch_size) % in_dim2;
     const int64_t w = (i / (batch_size * in_dim2)) % in_dim3;
@@ -261,7 +257,7 @@ __global__ void GPUNLLLossBackward1D_no_reduce(
     T* dx_data, const int64_t* label_data, const T* weight_data,
     const T* dout_data, const int64_t batch_size, const int64_t n_classes,
     const int64_t ignore_index) {
-  CUDA_1D_KERNEL_LOOP(i, batch_size) {
+  CUDA_KERNEL_LOOP(i, batch_size) {
     const int64_t cur_label = label_data[i];
     if (cur_label == ignore_index) {
       continue;
@@ -299,7 +295,7 @@ __global__ void GPUNLLLossBackward2D_no_reduce(
   const int64_t map_size = in_dim2 * in_dim3;
   const int64_t sample_size = n_classes * map_size;
   const int64_t out_numel = batch_size * map_size;
-  CUDA_1D_KERNEL_LOOP(i, out_numel) {
+  CUDA_KERNEL_LOOP(i, out_numel) {
     const int64_t b = i % batch_size;
     const int64_t h = (i / batch_size) % in_dim2;
     const int64_t w = (i / (batch_size * in_dim2)) % in_dim3;
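Note: both macros implement the standard grid-stride loop, so replacing the locally defined CUDA_1D_KERNEL_LOOP with the shared CUDA_KERNEL_LOOP does not change the iteration pattern; it only removes a duplicated definition. The sketch below is a minimal, self-contained illustration of that loop combined with the flat-index decomposition used by the 2D kernels in this diff. The macro body is the conventional Caffe-style definition, and the DecomposeIndex kernel, tensor sizes, and launch configuration are illustrative assumptions, not code from Paddle's headers.

#include <cstdio>

// Conventional grid-stride loop (assumed here to be equivalent to what
// Paddle's shared CUDA_KERNEL_LOOP expands to): each thread starts at its
// global index and strides by the total number of threads in the grid,
// so any grid size covers any n.
#define CUDA_KERNEL_LOOP(i, n)                                 \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

// Hypothetical toy kernel: recover (b, h, w) from a flat index with the
// same modulo/divide scheme as GPUNLLLossForward2D_no_reduce, then
// re-flatten to show the decomposition round-trips.
__global__ void DecomposeIndex(int* out, const int batch_size,
                               const int in_dim2, const int in_dim3) {
  const int out_numel = batch_size * in_dim2 * in_dim3;
  CUDA_KERNEL_LOOP(i, out_numel) {
    const int b = i % batch_size;
    const int h = (i / batch_size) % in_dim2;
    const int w = (i / (batch_size * in_dim2)) % in_dim3;
    out[i] = b + batch_size * (h + in_dim2 * w);  // equals i
  }
}

int main() {
  const int batch_size = 2, in_dim2 = 3, in_dim3 = 4;
  const int n = batch_size * in_dim2 * in_dim3;
  int h_out[n];
  int* d_out;
  cudaMalloc(&d_out, n * sizeof(int));
  // Deliberately launch fewer threads (8) than elements (24) so the
  // grid-stride loop has to make several passes per thread.
  DecomposeIndex<<<1, 8>>>(d_out, batch_size, in_dim2, in_dim3);
  cudaMemcpy(h_out, d_out, n * sizeof(int), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("%d ", h_out[i]);  // prints 0 .. 23
  printf("\n");
  cudaFree(d_out);
  return 0;
}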