fix softmax cross entropy integer overflow (#30590)

[BUG FIX] Fix softmax cross entropy overflow problem.
revert-31068-fix_conv3d_windows
Zhong Hui 5 years ago committed by GitHub
parent 44ee251fde
commit 16fe11d71e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -29,16 +29,16 @@ static inline int CanonicalAxis(const int axis, const int rank) {
return axis;
}
// Returns the product of the dimension sizes BEFORE `axis`, i.e. the number
// of independent rows when the tensor is flattened to 2-D at `axis`.
// Uses size_t (not int) so the product cannot overflow 32 bits for large
// tensors — this is the integer-overflow fix this commit introduces.
// NOTE(review): assumes `axis` is already canonicalized to [0, rank) by
// CanonicalAxis above — confirm at call sites.
static inline size_t SizeToAxis(const int axis, const framework::DDim dims) {
  size_t size = 1;
  for (int i = 0; i < axis; i++) {
    size *= dims[i];
  }
  return size;
}
// Returns the product of the dimension sizes FROM `axis` to the last dim,
// i.e. the row width when the tensor is flattened to 2-D at `axis`.
// size_t return type mirrors SizeToAxis and avoids 32-bit overflow on
// large tensors. (The `return size;` tail was truncated in the diff view;
// reconstructed in parallel with SizeToAxis.)
static inline size_t SizeFromAxis(const int axis, const framework::DDim dims) {
  size_t size = 1;
  for (int i = axis; i < dims.size(); i++) {
    size *= dims[i];
  }
  return size;
}

File diff suppressed because it is too large. [Load Diff]

@ -75,11 +75,14 @@ namespace platform {
* }
*
*/
// Grid-stride loop over [0, num). The visible loop counter `i` has type
// `index_type`, so callers whose element count may exceed INT_MAX can pass
// int64_t/size_t; `__index__` accumulates in int64_t so the stride addition
// itself cannot wrap. blockIdx.x is widened BEFORE the multiply: the product
// of two 32-bit unsigned built-ins can overflow 32 bits on very large grids.
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                            \
  int64_t __index__ =                                                        \
      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;           \
  for (index_type i = __index__; __index__ < (num);                          \
       __index__ += blockDim.x * gridDim.x, i = __index__)

// Backward-compatible default: 32-bit loop index, as before this change.
#define CUDA_KERNEL_LOOP(i, num) CUDA_KERNEL_LOOP_TYPE(i, num, int)
class CublasHandleHolder {
public:
#ifdef PADDLE_WITH_HIP

@ -48,7 +48,7 @@ __global__ static void ForRangeElemwiseOpGridIsOne(Function func) {
}
// One-thread-per-element kernel: the thread with global index `idx` applies
// func(idx) for idx in [0, limit). `limit` is size_t so ranges beyond
// INT_MAX are supported (the overflow fix). Launched by
// ForRange<CUDADeviceContext>::operator() below.
template <typename Function>
__global__ static void ForRangeElemwiseOp(Function func, size_t limit) {
  // Widen blockIdx.x BEFORE the multiply: casting the finished 32-bit
  // product (as the diff's original line did) widens after the wrap.
  size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (idx < limit) {
    func(idx);
  }
}
@ -58,13 +58,13 @@ __global__ static void ForRangeElemwiseOp(Function func, int limit) {
template <>
// GPU specialization of ForRange: applies func(i) for every i in [0, limit_).
// NOTE(review): this span is rendered from a unified diff with the pre- and
// post-change lines interleaved and one hunk suppressed; the pre-change
// (int-truncating) lines are marked below and are superseded by the size_t
// lines that follow each of them.
struct ForRange<CUDADeviceContext> {
ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
// pre-change line (diff residue): truncated limit to int — the overflow bug.
    : dev_ctx_(dev_ctx), limit_(static_cast<int>(limit)) {}
// post-change line: preserves the full size_t range.
    : dev_ctx_(dev_ctx), limit_(static_cast<size_t>(limit)) {}
template <typename Function>
inline void operator()(Function func) const {
constexpr int num_threads = 1024;
// pre-change lines (diff residue): int-typed launch geometry.
int block_size = limit_ <= num_threads ? limit_ : num_threads;
int grid_size = (limit_ + num_threads - 1) / num_threads;
// post-change lines: size_t geometry; grid_size is the ceil-div of
// limit_ by num_threads.
size_t block_size = limit_ <= num_threads ? limit_ : num_threads;
size_t grid_size = (limit_ + num_threads - 1) / num_threads;
if (grid_size == 1) {
// Single-block fast path; the multi-block else-branch launch of
// ForRangeElemwiseOp falls inside the suppressed hunk below.
ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
@ -76,7 +76,7 @@ struct ForRange<CUDADeviceContext> {
}
const CUDADeviceContext& dev_ctx_;
// pre-change member (diff residue): int — superseded by size_t below.
int limit_;
size_t limit_;
};
#endif

Loading…
Cancel
Save