Fix index overflow bug in the CUDA kernel loop increment (#25435)

* fix softmax_with_cross_entropy cuda kernel overflow bug, test=develop

* replace old macro & for condition, test=develop

* polish details, test=develop
Authored by Chen Weihang, committed by GitHub
parent 0b2ec49ff9, commit 0b54d54fd8
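Why the overflow happens: each of the files below defined its own copy of a grid-stride loop macro with a 32-bit index, so the increment i += blockDim.x * gridDim.x can wrap past INT_MAX once an operator works on roughly 2^31 or more elements. This commit deletes those per-file copies and routes every kernel through the shared CUDA_KERNEL_LOOP macro in Paddle's platform headers. The snippet below is only a rough sketch of what a 64-bit-safe grid-stride loop can look like (the shared macro's exact definition may differ), and ScaleKernel is a made-up kernel for illustration, not one from this diff:

#include <cstdint>

// Sketch of a 64-bit-safe grid-stride loop: keep the index in int64_t so
// neither the bound check nor the stride addition can wrap on large tensors.
#define CUDA_KERNEL_LOOP(i, n)                                       \
  for (int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x +   \
                   threadIdx.x;                                      \
       i < static_cast<int64_t>(n);                                  \
       i += static_cast<int64_t>(blockDim.x) * gridDim.x)

// Hypothetical kernel using the macro: scales numel elements in place.
template <typename T>
__global__ void ScaleKernel(T* data, int64_t numel, T factor) {
  CUDA_KERNEL_LOOP(i, numel) { data[i] *= factor; }
}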

@ -23,9 +23,6 @@
namespace paddle {
namespace framework {
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <size_t EMBEDX_DIM, size_t EXPAND_EMBED_DIM>
__global__ void PullCopy(

@ -22,10 +22,7 @@
#include "paddle/fluid/platform/place.h"
__global__ void test(size_t* a, int size) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
i += blockDim.x * gridDim.x) {
a[i] *= 2;
}
CUDA_KERNEL_LOOP(i, size) { a[i] *= 2; }
}
TEST(LoD, data) {

@ -24,10 +24,6 @@ namespace paddle {
namespace operators {
using framework::Tensor;
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 1024;
static inline int GET_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;

@ -24,14 +24,10 @@ namespace operators {
using Tensor = framework::Tensor;
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void GPUBCELossForward(const T* x_data, const T* label_data,
T* out_data, const int in_numel) {
CUDA_1D_KERNEL_LOOP(i, in_numel) {
CUDA_KERNEL_LOOP(i, in_numel) {
T x = x_data[i];
T label = label_data[i];
T one = static_cast<T>(1.);
@ -48,7 +44,7 @@ template <typename T>
__global__ void GPUBCELossBackward(const T* x_data, const T* label_data,
const T* dout_data, T* dx_data,
const int in_numel) {
CUDA_1D_KERNEL_LOOP(i, in_numel) {
CUDA_KERNEL_LOOP(i, in_numel) {
T x = x_data[i];
T label = label_data[i];
T dout = dout_data[i];

@ -25,10 +25,6 @@ using platform::PADDLE_CUDA_NUM_THREADS;
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void CvmComputeKernel(const bool use_cvm, const int64_t item_width,
const T* X, T* Y, int64_t numel) {

@ -30,10 +30,6 @@ using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
using platform::PADDLE_CUDA_NUM_THREADS;
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
inline int GET_BLOCKS(const int N) {
return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
}

@ -40,10 +40,6 @@ namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 1024;
static inline int GET_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;

@ -24,8 +24,7 @@ __global__ void GenAnchors(T* out, const T* aspect_ratios, const int ar_num,
const int width, const T offset) {
int num_anchors = as_num * ar_num;
int box_num = height * width * num_anchors;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
i += blockDim.x * gridDim.x) {
CUDA_KERNEL_LOOP(i, box_num) {
int h_idx = i / (num_anchors * width);
int w_idx = (i / num_anchors) % width;
T stride_width = stride[0];
@ -64,10 +63,7 @@ __global__ void GenAnchors(T* out, const T* aspect_ratios, const int ar_num,
template <typename T>
__global__ void SetVariance(T* out, const T* var, const int vnum,
const int num) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
out[i] = var[i % vnum];
}
CUDA_KERNEL_LOOP(i, num) { out[i] = var[i % vnum]; }
}
template <typename T>

@ -40,8 +40,7 @@ static inline int NumBlocks(const int N) {
static __global__ void GetLengthLoD(const int nthreads, const int* batch_ids,
int* length_lod) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (nthreads);
i += blockDim.x * gridDim.x) {
CUDA_KERNEL_LOOP(i, nthreads) {
platform::CudaAtomicAdd(length_lod + batch_ids[i], 1);
}
}

@ -31,10 +31,6 @@ using LoDTensor = framework::LoDTensor;
static constexpr int kNumCUDAThreads = 64;
static constexpr int kNumMaxinumNumBlocks = 4096;
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
int const BBoxSize = 4;
static inline int NumBlocks(const int N) {
@ -48,7 +44,7 @@ __global__ void GPUDistFpnProposalsHelper(
const int refer_level, const int refer_scale, const int max_level,
const int min_level, int* roi_batch_id_data, int* sub_lod_list,
int* target_lvls) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
CUDA_KERNEL_LOOP(i, nthreads) {
const T* offset_roi = rois + i * BBoxSize;
int roi_batch_ind = roi_batch_id_data[i];
// get the target level of current rois

@ -33,9 +33,6 @@ using LoDTensor = framework::LoDTensor;
namespace {
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
int const kThreadsPerBlock = sizeof(uint64_t) * 8;
@ -155,7 +152,7 @@ static __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
int cnt = 0;
__shared__ int keep_index[BlockSize];
CUDA_1D_KERNEL_LOOP(i, num) {
CUDA_KERNEL_LOOP(i, num) {
keep_index[threadIdx.x] = -1;
__syncthreads();

@ -32,8 +32,7 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
bool min_max_aspect_ratios_order) {
int num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num;
int box_num = height * width * num_priors;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
i += blockDim.x * gridDim.x) {
CUDA_KERNEL_LOOP(i, box_num) {
int h = i / (num_priors * width);
int w = (i / num_priors) % width;
int p = i % num_priors;
@ -87,10 +86,7 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
template <typename T>
__global__ void SetVariance(T* out, const T* var, const int vnum,
const int num) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
out[i] = var[i % vnum];
}
CUDA_KERNEL_LOOP(i, num) { out[i] = var[i % vnum]; }
}
template <typename T>

@ -30,10 +30,6 @@ namespace operators {
#define idx4_2(index, d1, d2, d3, d4) ((index / d4 / d3) % d2)
#define idx4_1(index, d1, d2, d3, d4) ((index / d4 / d3 / d2) % d1)
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__device__ bool GT_E(T a, T b) {
return (a > b) || Eigen::numext::abs(a - b) < 1e-4;
@ -284,7 +280,7 @@ __global__ void RoiTransformKernel(const float* input_data,
int* mask, T* transform_matrix) {
int output_size =
num_rois * transformed_height * transformed_width * channels;
CUDA_1D_KERNEL_LOOP(index, output_size) {
CUDA_KERNEL_LOOP(index, output_size) {
// (n, c, out_h, out_w) is an element in the transformed output
int out_w = idx4_4(index, num_rois, channels, transformed_height,
transformed_width);
@ -463,7 +459,7 @@ __global__ void RoiTransformGradKernel(int out_size, const int* out2in_idx_data,
const T* out2in_w_data,
const T* out_grad_data,
T* in_grad_data) {
CUDA_1D_KERNEL_LOOP(index, out_size * 4) {
CUDA_KERNEL_LOOP(index, out_size * 4) {
int in_idx = out2in_idx_data[index];
if (in_idx >= 0) {
int out_idx = index / 4;

@ -30,10 +30,6 @@ static inline int NumBlocks(const int N) {
kNumMaxinumNumBlocks);
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void GPUSigmoidFocalLossForward(const T *x_data,
const int *label_data,
@ -41,7 +37,7 @@ __global__ void GPUSigmoidFocalLossForward(const T *x_data,
const T gamma, const T alpha,
const int num_classes,
const int limit, T *out_data) {
CUDA_1D_KERNEL_LOOP(i, limit) {
CUDA_KERNEL_LOOP(i, limit) {
T x = x_data[i];
int a = i / num_classes; // current sample
int d = i % num_classes; // current class
@ -79,7 +75,7 @@ __global__ void GPUSigmoidFocalLossBackward(
const T *x_data, const int *label_data, const int *fg_num_data,
const T gamma, const T alpha, const int num_classes, const T *dout_data,
const int limit, T *dx_data) {
CUDA_1D_KERNEL_LOOP(i, limit) {
CUDA_KERNEL_LOOP(i, limit) {
T x = x_data[i];
T dout = dout_data[i];

@ -27,15 +27,11 @@ namespace operators {
using framework::Tensor;
using platform::DeviceContext;
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T, typename IndexT = int>
__global__ void GatherCUDAKernel(const T* params, const IndexT* indices,
T* output, size_t index_size,
size_t slice_size) {
CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
CUDA_KERNEL_LOOP(i, index_size * slice_size) {
int indices_i = i / slice_size;
int slice_i = i - indices_i * slice_size; // offset inside the slice
IndexT gather_i = indices[indices_i];
@ -49,7 +45,7 @@ __global__ void GatherNdCUDAKernel(const T* input, const int* input_dims,
const IndexT* indices, T* output,
size_t remain_size, size_t slice_size,
size_t end_size) {
CUDA_1D_KERNEL_LOOP(i, remain_size * slice_size) {
CUDA_KERNEL_LOOP(i, remain_size * slice_size) {
int indices_i = i / slice_size;
int slice_i = i - indices_i * slice_size; // offset inside the slice
IndexT gather_i = 0;

@ -19,15 +19,11 @@ limitations under the License. */
namespace paddle {
namespace operators {
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void GatherTree(const T *ids_data, const T *parents_data,
T *out_data, const int64_t max_length,
const int64_t batch_size, const int64_t beam_size) {
CUDA_1D_KERNEL_LOOP(i, batch_size * beam_size) {
CUDA_KERNEL_LOOP(i, batch_size * beam_size) {
int batch = i / beam_size;
int beam = i % beam_size;
auto idx =

@ -27,10 +27,6 @@ using IndexType = int64_t;
using Tensor = framework::Tensor;
using platform::PADDLE_CUDA_NUM_THREADS;
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
inline int GET_BLOCKS(const int N) {
return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
}

@ -35,8 +35,7 @@ using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
template <typename T>
static __global__ void repeat_param(const T *input, T *output,
const int repeat_num, const int C) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < repeat_num * C;
i += blockDim.x * gridDim.x) {
CUDA_KERNEL_LOOP(i, repeat_num * C) {
int index = i % C;
output[i] = input[index];
}

@ -19,13 +19,9 @@ limitations under the License. */
namespace paddle {
namespace operators {
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void LinspaceKernel(T start, T step, int64_t size, T* out) {
CUDA_1D_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
}
template <typename T>

@ -24,10 +24,6 @@ https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.c
namespace paddle {
namespace operators {
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename Dtype>
__device__ Dtype cuda_sigmoid(const Dtype x) {
return Dtype(1) / (Dtype(1) + exp(-x));
@ -42,7 +38,7 @@ template <typename T>
__global__ void LSTMUnitKernel(const int nthreads, const int dim,
const T* C_prev, const T* X, T* C, T* H,
const T forget_bias) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
CUDA_KERNEL_LOOP(index, nthreads) {
const int n = index / dim;
const int d = index % dim;
@ -65,7 +61,7 @@ __global__ void LSTMUnitGradientKernel(const int nthreads, const int dim,
const T* C_diff, const T* H_diff,
T* C_prev_diff, T* X_diff,
const T forget_bias) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
CUDA_KERNEL_LOOP(index, nthreads) {
const int n = index / dim;
const int d = index % dim;
const T* X_offset = X + 4 * dim * n;

@ -25,8 +25,7 @@ template <typename T>
__global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
const int N, const int D,
const int ignore_index) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
CUDA_KERNEL_LOOP(i, N) {
PADDLE_ENFORCE(label[i] >= 0 && label[i] < D || label[i] == ignore_index,
"label[%d] expected >= 0 and < %ld, or == %ld, but got "
"%ld. Please check input value.",

@ -75,8 +75,7 @@ template <typename T>
__global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int width,
int num) {
T tmp = 1.0 / width;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
CUDA_KERNEL_LOOP(i, num) {
int h = i * tmp;
int w = i - h * width;
c[i] = a[i] + b[w];

@ -23,10 +23,6 @@ namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void CountCUDAKernel(const int num_classes, const int count,
const T* predictions, const T* labels,
@ -42,7 +38,7 @@ __global__ void CountCUDAKernel(const int num_classes, const int count,
T pred;
T label;
CUDA_1D_KERNEL_LOOP(i, count) {
CUDA_KERNEL_LOOP(i, count) {
pred = predictions[i];
label = labels[i];
if (pred == label) {
@ -68,7 +64,7 @@ __global__ void ComputeIoUCUDAKernel(const int num_classes, int* wrong,
valid_count_c = 0;
}
__syncthreads();
CUDA_1D_KERNEL_LOOP(i, num_classes) {
CUDA_KERNEL_LOOP(i, num_classes) {
int wrong_n = wrong[i];
int correct_n = correct[i];
int denominator = wrong_n + correct_n;

@ -23,9 +23,6 @@ namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
__global__ void ClearObsoleteDataKernel(int64_t *pos, int64_t *neg,
const int bucket_length,

@ -31,10 +31,6 @@ static inline int NumBlocks(const int N) {
kNumMaxinumNumBlocks);
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void GPUNLLLossForward1D_no_reduce(T* out_data, const T* x_data,
const int64_t* label_data,
@ -42,7 +38,7 @@ __global__ void GPUNLLLossForward1D_no_reduce(T* out_data, const T* x_data,
const int64_t batch_size,
const int64_t n_classes,
const int64_t ignore_index) {
CUDA_1D_KERNEL_LOOP(i, batch_size) {
CUDA_KERNEL_LOOP(i, batch_size) {
const int64_t cur_label = label_data[i];
if (cur_label == ignore_index) {
out_data[i] = 0;
@ -191,7 +187,7 @@ __global__ void GPUNLLLossForward2D_no_reduce(
const int64_t map_size = in_dim2 * in_dim3;
const int64_t sample_size = n_classes * map_size;
const int64_t out_numel = batch_size * map_size;
CUDA_1D_KERNEL_LOOP(i, out_numel) {
CUDA_KERNEL_LOOP(i, out_numel) {
const int64_t b = i % batch_size;
const int64_t h = (i / batch_size) % in_dim2;
const int64_t w = (i / (batch_size * in_dim2)) % in_dim3;
@ -261,7 +257,7 @@ __global__ void GPUNLLLossBackward1D_no_reduce(
T* dx_data, const int64_t* label_data, const T* weight_data,
const T* dout_data, const int64_t batch_size, const int64_t n_classes,
const int64_t ignore_index) {
CUDA_1D_KERNEL_LOOP(i, batch_size) {
CUDA_KERNEL_LOOP(i, batch_size) {
const int64_t cur_label = label_data[i];
if (cur_label == ignore_index) {
continue;
@ -299,7 +295,7 @@ __global__ void GPUNLLLossBackward2D_no_reduce(
const int64_t map_size = in_dim2 * in_dim3;
const int64_t sample_size = n_classes * map_size;
const int64_t out_numel = batch_size * map_size;
CUDA_1D_KERNEL_LOOP(i, out_numel) {
CUDA_KERNEL_LOOP(i, out_numel) {
const int64_t b = i % batch_size;
const int64_t h = (i / batch_size) % in_dim2;
const int64_t w = (i / (batch_size * in_dim2)) % in_dim3;

Some files were not shown because too many files have changed in this diff.
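A note on how the overflow was triggered in practice: the operators above size their grids with helpers like GET_BLOCKS or NumBlocks, and several of them cap the block count (kNumMaxinumNumBlocks above), so a single thread strides over many elements and the old 32-bit index eventually wrapped. The host-side sketch below uses hypothetical names (GetBlocks, kCudaNumThreads, kMaxBlocks) and the CUDA_KERNEL_LOOP/ScaleKernel sketch from the note near the top; the real operators derive their launch configuration from their own tensors:

#include <cstdint>

// Hypothetical launch helper mirroring the GET_BLOCKS/NumBlocks pattern above;
// the cap value is an assumption for this sketch only.
constexpr int kCudaNumThreads = 1024;
constexpr int kMaxBlocks = 4096;

inline int GetBlocks(int64_t n) {
  int64_t blocks = (n + kCudaNumThreads - 1) / kCudaNumThreads;
  return static_cast<int>(blocks < kMaxBlocks ? blocks : kMaxBlocks);
}

// Example launch (assuming ScaleKernel from the earlier sketch):
//   ScaleKernel<<<GetBlocks(numel), kCudaNumThreads, 0, stream>>>(ptr, numel, factor);
// With numel above roughly 2^31 and the grid capped at kMaxBlocks, each thread
// performs many strides, which is exactly where a 32-bit index wrapped before this fix.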