Paddle/paddle/fluid/platform/cuda_helper.h


// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include <mutex>  // NOLINT

#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cublas.h"
#endif

#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/rocblas.h"
#endif

#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"

#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000
enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 };
#endif
namespace paddle {
namespace platform {
/*
 * Summary: Grid-stride looping macro for CUDA kernels
 *
 * [ Why is this macro needed? ]
 *
 * The conventional loop in a CUDA kernel is:
 *
 * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
 *       i += blockDim.x * gridDim.x)`
 *
 * This loop condition is risky. The value of `blockIdx.x * blockDim.x`
 * may be large, e.g. more than one billion. The first iteration is fine,
 * but once `i += blockDim.x * gridDim.x` is executed, the value of `i`
 * can exceed INT_MAX and wrap around to a negative value. At that point
 * the loop condition `i < (n)` is still satisfied, so the loop continues
 * and causes an illegal access to CUDA memory.
 *
 * Here is a real example from ERNIE that triggers the above error.
 * The related data are:
 *   - blockIdx.x = 2172938
 *   - blockDim.x = 512
 *   - blockIdx.x * blockDim.x = 1112544256
 *   - INT_MAX = 2147483647
 * Since gridDim.x must be at least blockIdx.x + 1, the increment
 * blockDim.x * gridDim.x exceeds 1.1e9 here, so after a single
 * increment `i` is already above INT_MAX and overflows.
 *
 * So we rewrite the loop as follows; the int64_t __index__ prevents
 * overflow in the loop increment.
 *
 * Parameters:
 *   - i: loop index
 *   - num: total number of elements
 *
 * Example:
 *   template <typename T>
 *   __global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
 *                         const int d, const int remain) {
 *     CUDA_KERNEL_LOOP(index, num) {
 *       int idx_n = index / d;
 *       int idx_remain = index % remain;
 *       logit_grad[index] *= loss_grad[idx_n * remain + idx_remain];
 *     }
 *   }
 */
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)             \
  int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x;  \
  for (index_type i = __index__; __index__ < (num);           \
       __index__ += blockDim.x * gridDim.x, i = __index__)

#define CUDA_KERNEL_LOOP(i, num) CUDA_KERNEL_LOOP_TYPE(i, num, int)
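
// A minimal usage sketch (illustrative only, not part of the original
// header): an element-wise kernel that stays safe for very large element
// counts because the macro tracks the position in the 64-bit __index__.
// The kernel name `AddOne` and the launch parameters are hypothetical.
//
//   template <typename T>
//   __global__ void AddOne(T* data, int64_t n) {
//     CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) {
//       data[i] += static_cast<T>(1);  // i is refreshed from __index__
//     }
//   }
//
//   // Any grid size works; the grid-stride loop covers all n elements:
//   //   AddOne<float><<<num_blocks, threads_per_block, 0, stream>>>(ptr, n);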

// RAII holder for a cuBLAS (or rocBLAS, under HIP) handle bound to a
// stream: the handle is created in the constructor, destroyed in the
// destructor, and calls through Call() are serialized by a mutex.
class CublasHandleHolder {
 public:
#ifdef PADDLE_WITH_HIP
  explicit CublasHandleHolder(hipStream_t stream) {
    PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_create_handle(&handle_));
    PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_set_stream(handle_, stream));
  }
#else
  CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
    PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
    PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));
#if CUDA_VERSION >= 9000
    if (math_type == CUBLAS_TENSOR_OP_MATH) {
      PADDLE_RETRY_CUDA_SUCCESS(
          dynload::cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH));
#if CUDA_VERSION >= 11000
    } else if (math_type == CUBLAS_TF32_TENSOR_OP_MATH) {
      PADDLE_RETRY_CUDA_SUCCESS(
          dynload::cublasSetMathMode(handle_, CUBLAS_TF32_TENSOR_OP_MATH));
#endif  // CUDA_VERSION >= 11000
    }
#endif  // CUDA_VERSION >= 9000
  }
#endif
#ifdef PADDLE_WITH_HIP
  const rocblas_handle& GetCublasHandle() const { return handle_; }
#else
  const cublasHandle_t& GetCublasHandle() const { return handle_; }
#endif
  ~CublasHandleHolder() PADDLE_MAY_THROW {
#ifdef PADDLE_WITH_HIP
    PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_destroy_handle(handle_));
#else
    PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
#endif
  }

  // Runs the callback with the handle while holding the per-handle mutex,
  // so concurrent BLAS calls through this holder are serialized.
  template <typename Callback>
  inline void Call(Callback&& callback) const {
    std::lock_guard<std::mutex> guard(mtx_);
    callback(handle_);
  }
 private:
  DISABLE_COPY_AND_ASSIGN(CublasHandleHolder);

#ifdef PADDLE_WITH_HIP
  rocblas_handle handle_;
#else
  cublasHandle_t handle_;
#endif
  mutable std::mutex mtx_;
};
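
// A minimal usage sketch from caller code (illustrative only; `stream` and
// the routine invoked inside the lambda are hypothetical): Call() hands the
// raw handle to the callback under the holder's mutex.
//
//   platform::CublasHandleHolder holder(stream, CUBLAS_DEFAULT_MATH);
//   holder.Call([&](cublasHandle_t handle) {
//     // e.g. invoke a cuBLAS routine such as a GEMM with `handle` here;
//     // the lock guarantees no other Call() runs on this handle meanwhile.
//   });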
} // namespace platform
} // namespace paddle