refine error message of gpu allocator, test=develop (#21008)

6 years ago · f56967c483
parent bc8e600ce5
commit f56967c483
1 changed files with 28 additions and 13 deletions
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@ -105,6 +105,21 @@ bool CPUAllocator::UseGpu() const { return false; }

 #ifdef PADDLE_WITH_CUDA

+static void ClearCUDAOutOfMemoryError(cudaError_t* status) {
+  if (*status == cudaErrorMemoryAllocation) {
+    *status = cudaSuccess;
+  }
+
+  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
+
+  *status = cudaGetLastError();
+  if (*status == cudaErrorMemoryAllocation) {
+    *status = cudaSuccess;
+  }
+
+  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
+}
+
 void* GPUAllocator::Alloc(size_t* index, size_t size) {
  // CUDA documentation doesn't explain if cudaMalloc returns nullptr
  // if size is 0.  We just make sure it does.
@ -120,19 +135,19 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
    gpu_alloc_size_ += size;
    return p;
  } else {
-    if (result == cudaErrorMemoryAllocation) {
-      result = cudaSuccess;
-    }
-    PADDLE_ENFORCE_CUDA_SUCCESS(result);
-
-    result = cudaGetLastError();
-    if (result == cudaErrorMemoryAllocation) {
-      result = cudaSuccess;
-    }
-    PADDLE_ENFORCE_CUDA_SUCCESS(result);
-
-    size_t avail, total;
-    platform::GpuMemoryUsage(&avail, &total);
+    ClearCUDAOutOfMemoryError(&result);
+
+    /**
+     * NOTE(zjl): Sometimes cudaMemGetInfo would raise OOM error
+     * if there is very little GPU memory left. In this case, we
+     * should consider the available GPU memory to be 0, and throw
+     * exception inside this function instead of throwing exception
+     * inside cudaMemGetInfo.
+     */
+    size_t avail = 0, total = 0;
+    result = cudaMemGetInfo(&avail, &total);
+    if (result != cudaSuccess) avail = 0;
+    ClearCUDAOutOfMemoryError(&result);

    PADDLE_THROW_BAD_ALLOC(
        "\n\nOut of memory error on GPU %d. "