@@ -48,7 +48,7 @@ __global__ static void ForRangeElemwiseOpGridIsOne(Function func) {
 }
 
 template <typename Function>
-__global__ static void ForRangeElemwiseOp(Function func, int limit) {
+__global__ static void ForRangeElemwiseOp(Function func, size_t limit) {
   size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
   if (idx < limit) {
     func(idx);
@@ -58,13 +58,13 @@ __global__ static void ForRangeElemwiseOp(Function func, int limit) {
 template <>
 struct ForRange<CUDADeviceContext> {
   ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
-      : dev_ctx_(dev_ctx), limit_(static_cast<int>(limit)) {}
+      : dev_ctx_(dev_ctx), limit_(static_cast<size_t>(limit)) {}
 
   template <typename Function>
   inline void operator()(Function func) const {
     constexpr int num_threads = 1024;
-    int block_size = limit_ <= num_threads ? limit_ : num_threads;
-    int grid_size = (limit_ + num_threads - 1) / num_threads;
+    size_t block_size = limit_ <= num_threads ? limit_ : num_threads;
+    size_t grid_size = (limit_ + num_threads - 1) / num_threads;
 
     if (grid_size == 1) {
       ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
@@ -76,7 +76,7 @@ struct ForRange<CUDADeviceContext> {
   }
 
   const CUDADeviceContext& dev_ctx_;
-  int limit_;
+  size_t limit_;
 };
 
 #endif
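Note on the change: the patch widens every element count in the ForRange launch path from int to size_t, so tensors with more than INT_MAX elements get a correct grid size and bounds check. Below is a minimal host-side sketch of the failure mode the old arithmetic had; the element count and the driver program are illustrative assumptions, not values from the sources.

// Sketch of the overflow the patch avoids; the 3e9 element count below is an
// illustrative assumption, not a value from the PaddlePaddle sources.
#include <cstddef>
#include <cstdio>

int main() {
  // A tensor with 3e9 elements exceeds INT_MAX (2147483647).
  size_t limit = 3000000000ULL;
  constexpr int num_threads = 1024;

  // Old code path: limit_ was stored as int, so the count wraps (to a
  // negative value on the usual two's-complement targets) before any of
  // the launch math runs, and grid_size comes out as garbage.
  int limit_old = static_cast<int>(limit);
  int grid_old = (limit_old + num_threads - 1) / num_threads;

  // Patched code path: the count and the grid size stay in size_t,
  // giving (3000000000 + 1023) / 1024 = 2929688 blocks.
  size_t grid_new = (limit + num_threads - 1) / num_threads;

  std::printf("old: limit_=%d grid_size=%d\n", limit_old, grid_old);
  std::printf("new: limit_=%zu grid_size=%zu\n", limit, grid_new);
  return 0;
}

The same wrap also defeated the kernel's bounds check: with the old signature, the negative int limit was converted to a huge size_t in `idx < limit`, so out-of-range threads were not filtered.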