|
|
|
@ -19,12 +19,13 @@ limitations under the License. */
|
|
|
|
|
#endif // __GNUC__
|
|
|
|
|
|
|
|
|
|
#if !defined(_WIN32)
|
|
|
|
|
#include <dlfcn.h> // dladdr
|
|
|
|
|
#else // _WIN32
|
|
|
|
|
#include <dlfcn.h> // dladdr
|
|
|
|
|
#include <unistd.h> // sleep
|
|
|
|
|
#else // _WIN32
|
|
|
|
|
#ifndef NOMINMAX
|
|
|
|
|
#define NOMINMAX // msvc max/min macro conflict with std::min/max
|
|
|
|
|
#endif
|
|
|
|
|
#include <windows.h> // GetModuleFileName
|
|
|
|
|
#include <windows.h> // GetModuleFileName, Sleep
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#ifdef PADDLE_WITH_CUDA
|
|
|
|
@ -80,6 +81,9 @@ class ErrorSummary;
|
|
|
|
|
} // namespace platform
|
|
|
|
|
} // namespace paddle
|
|
|
|
|
|
|
|
|
|
#ifdef PADDLE_WITH_CUDA
|
|
|
|
|
DECLARE_int64(gpu_allocator_retry_time);
|
|
|
|
|
#endif
|
|
|
|
|
DECLARE_int32(call_stack_level);
|
|
|
|
|
|
|
|
|
|
namespace paddle {
|
|
|
|
@ -924,6 +928,14 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
|
|
|
|
|
} \
|
|
|
|
|
} while (0)
|
|
|
|
|
|
|
|
|
|
// Blocks the calling thread for approximately `millisecond` milliseconds.
//
// Windows: forwards directly to Sleep(), whose argument is in milliseconds.
// Other platforms: POSIX sleep() takes whole SECONDS, so forwarding the value
// unchanged would wait 1000x longer than requested. The duration is instead
// split into whole seconds (sleep) plus a sub-second remainder (usleep).
// The remainder passed to usleep() is always below 1,000,000 microseconds,
// which is the portable upper bound for that call.
inline void retry_sleep(unsigned millisecond) {
#ifdef _WIN32
  Sleep(millisecond);
#else
  const unsigned whole_seconds = millisecond / 1000;
  const unsigned remainder_ms = millisecond % 1000;
  if (whole_seconds > 0) {
    sleep(whole_seconds);
  }
  if (remainder_ms > 0) {
    // remainder_ms < 1000, so the product is < 1,000,000 and cannot overflow.
    usleep(remainder_ms * 1000);
  }
#endif
}
|
|
|
|
|
|
|
|
|
|
#define PADDLE_RETRY_CUDA_SUCCESS(COND) \
|
|
|
|
|
do { \
|
|
|
|
|
auto __cond__ = (COND); \
|
|
|
|
@ -933,6 +945,7 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
|
|
|
|
|
::paddle::platform::details::CudaStatusType< \
|
|
|
|
|
__CUDA_STATUS_TYPE__>::kSuccess; \
|
|
|
|
|
while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
|
|
|
|
|
retry_sleep(FLAGS_gpu_allocator_retry_time); \
|
|
|
|
|
__cond__ = (COND); \
|
|
|
|
|
++retry_count; \
|
|
|
|
|
} \
|
|
|
|
|