[cherry-pick2.0]Optimize the error messages of paddle CUDA API (#23849)

* cherry-pick, Optimize the error messages of paddle CUDA API
* fix the error messages of paddle CUDA API
* Refactoring PADDLE_ENFORCE_CUDA_SUCCESS, and apply to curand/cudnn/cublas/NCCL
* remove build_ex_string
5 years ago · 3f4678c957
parent 30e4cacd7c
commit 3f4678c957
29 changed files with 655 additions and 516 deletions
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@ -135,6 +135,12 @@ copy(inference_lib_dist
        SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h
        DSTS ${dst_dir})

+set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data")
+copy(inference_lib_dist
+        SRCS ${cudaerror_INCLUDE_DIR}
+        DSTS ${dst_dir})
+
+# CMakeCache Info
 copy(inference_lib_dist
        SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
        DSTS ${FLUID_INFERENCE_INSTALL_DIR})
@ -184,7 +190,7 @@ copy(fluid_lib_dist
        )

 set(module "framework")
-set(framework_lib_deps framework_proto)
+set(framework_lib_deps framework_proto data_feed_proto trainer_desc_proto)
 add_dependencies(fluid_lib_dist ${framework_lib_deps})
 copy(fluid_lib_dist
        SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/trainer_desc.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h
@ -204,11 +210,11 @@ copy(fluid_lib_dist
        )

 set(module "platform")
-set(platform_lib_deps profiler_proto)
+set(platform_lib_deps profiler_proto error_codes_proto cuda_error_proto)
 add_dependencies(fluid_lib_dist ${platform_lib_deps})
 copy(fluid_lib_dist
-        SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/profiler.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/error_codes.pb.h
-        DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}
+        SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/*.pb.h
+        DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module}
        )

 set(module "string")
@ -249,6 +255,7 @@ copy(inference_lib_dist
        SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
        DSTS ${dst_dir} ${dst_dir}/lib)

+
 # CMakeCache Info
 copy(fluid_lib_dist
        SRCS ${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+include(ExternalProject)
 # Creat a target named "third_party", which can compile external dependencies on all platform(windows/linux/mac)

 set(THIRD_PARTY_PATH  "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
@ -21,6 +22,7 @@ set(THIRD_PARTY_CACHE_PATH     "${CMAKE_SOURCE_DIR}"    CACHE STRING
    "A path cache third party source code to avoid repeated download.")

 set(THIRD_PARTY_BUILD_TYPE Release)
+set(third_party_deps)

 # cache funciton to avoid repeat download code of third_party.
 # This function has 4 parameters, URL / REPOSITOR / TAG / DIR:
@ -100,6 +102,32 @@ MACRO(UNSET_VAR VAR_NAME)
    UNSET(${VAR_NAME})
 ENDMACRO()

+# Function to download and uncompress a dependency during compilation.
+# This macro has 2 parameters, URL / NAME:
+# 1. URL:           The download URL of the third-party dependency
+# 2. NAME:          The name of the dependency, which determines the directory name
+#
+MACRO(file_download_and_uncompress URL NAME)  # a macro, not a function: variables set below land in the caller's scope
+  MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}")  # configure-time status line naming the dependency and its URL
+  SET(EXTERNAL_PROJECT_NAME "extern_download_${NAME}")  # target name of the external project created below
+  SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data)  # same path that is passed as SOURCE_DIR below
+  ExternalProject_Add(
+      ${EXTERNAL_PROJECT_NAME}
+      ${EXTERNAL_PROJECT_LOG_ARGS}  # defined outside this macro; presumably logging options -- confirm at the definition
+      PREFIX                ${THIRD_PARTY_PATH}/${NAME}
+      URL                   ${URL}
+      DOWNLOAD_DIR          ${THIRD_PARTY_PATH}/${NAME}/data/  # identical to SOURCE_DIR
+      SOURCE_DIR            ${THIRD_PARTY_PATH}/${NAME}/data/
+      DOWNLOAD_NO_PROGRESS  1
+      CONFIGURE_COMMAND     ""  # the configure, build, update and install commands are all set to empty strings
+      BUILD_COMMAND         ""
+      UPDATE_COMMAND        ""
+      INSTALL_COMMAND       ""
+    )
+  list(APPEND third_party_deps ${EXTERNAL_PROJECT_NAME})  # record the download target in the third_party_deps list
+ENDMACRO()
+
+
 # Correction of flags on different Platform(WIN/MAC) and Print Warning Message
 if (APPLE)
    if(WITH_MKL)
@ -178,10 +206,13 @@ include(external/dlpack)    # download dlpack
 include(external/xxhash)    # download, build, install xxhash
 include(external/warpctc)   # download, build, install warpctc

-set(third_party_deps)
 list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash)
 list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool)

+# Download the CUDA error message archive (cudaErrorMessage.tar.gz)
+set(CUDAERROR_URL  "https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE)
+file_download_and_uncompress(${CUDAERROR_URL} "cudaerror")
+
 if(WITH_AMD_GPU)
    include(external/rocprim)   # download, build, install rocprim
    list(APPEND third_party_deps extern_rocprim)
@ -274,4 +305,4 @@ if (WITH_LITE)
    include(external/lite)
 endif (WITH_LITE)

-add_custom_target(third_party DEPENDS ${third_party_deps})
+add_custom_target(third_party ALL DEPENDS ${third_party_deps})
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu
@ -152,9 +152,7 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(

      PADDLE_ENFORCE_CUDA_SUCCESS(
          cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
-                          cudaMemcpyHostToDevice, dev_ctx->stream()),
-          platform::errors::External(
-              "Async cudaMemcpy op_var info to gpu failed."));
+                          cudaMemcpyHostToDevice, dev_ctx->stream()));
    } else {  // get
      auto iter = op_var2gpu_str.find(op_var);
      PADDLE_ENFORCE_EQ(iter != op_var2gpu_str.end(), true,
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
@ -124,12 +124,9 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
  float const* input_ptr = reinterpret_cast<float const*>(inputs[0]);
  float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
  float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]);
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      cudaMemcpyAsync(output_ptrs, h_odatas,
-                      d_output_ptrs_.size() * sizeof(float*),
-                      cudaMemcpyHostToDevice, stream),
-      platform::errors::External(
-          "CUDA Memcpy failed during split plugin run."));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
+      output_ptrs, h_odatas, d_output_ptrs_.size() * sizeof(float*),
+      cudaMemcpyHostToDevice, stream));

  int outer_rows = outer_rows_ * batchSize;

@ -244,12 +241,9 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
    float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
    float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]);

-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaMemcpyAsync(output_ptrs, h_odatas,
-                        d_output_ptrs.size() * sizeof(float*),
-                        cudaMemcpyHostToDevice, stream),
-        platform::errors::External(
-            "CUDA Memcpy failed during split plugin run."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
+        output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(float*),
+        cudaMemcpyHostToDevice, stream));

    split_kernel<<<grid, block, 0, stream>>>(
        d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
@ -263,12 +257,9 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
    half* const* h_odatas = reinterpret_cast<half* const*>(outputs);
    half** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]);

-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaMemcpyAsync(output_ptrs, h_odatas,
-                        d_output_ptrs.size() * sizeof(half*),
-                        cudaMemcpyHostToDevice, stream),
-        platform::errors::External(
-            "CUDA Memcpy failed during split plugin run."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
+        output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(half*),
+        cudaMemcpyHostToDevice, stream));

    split_kernel<<<grid, block, 0, stream>>>(
        d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
--- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
+++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
@ -80,17 +80,13 @@ class CUDADeviceContextAllocator : public Allocator {
      : place_(place), default_stream_(default_stream) {
    platform::CUDADeviceGuard guard(place_.device);
    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaEventCreate(&event_, cudaEventDisableTiming),
-        platform::errors::External(
-            "Create event failed in CUDADeviceContextAllocator"));
+        cudaEventCreate(&event_, cudaEventDisableTiming));
  }

  ~CUDADeviceContextAllocator() {
    if (event_) {
      platform::CUDADeviceGuard guard(place_.device);
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaEventDestroy(event_),
-          "Destory event failed in CUDADeviceContextAllocator destroctor");
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
    }
  }

@ -103,12 +99,9 @@ class CUDADeviceContextAllocator : public Allocator {
    auto allocation =
        new CUDADeviceContextAllocation(memory::Alloc(place_, size));
    // Wait for the event on stream
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_));
    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaEventRecord(event_, default_stream_),
-        "Failed to record event in CUDADeviceContextAllocator");
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaStreamWaitEvent(default_stream_, event_, 0),
-        "Failed to wait event in CUDADeviceContextAllocator");
+        cudaStreamWaitEvent(default_stream_, event_, 0));
    return allocation;
  }

--- a/paddle/fluid/operators/argsort_op.cu
+++ b/paddle/fluid/operators/argsort_op.cu
@ -141,12 +141,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
        num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
        cu_stream);
  }
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      err,
-      "ArgSortOP failed as could not launch "
-      "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate"
-      "temp_storage_bytes, status:%s.",
-      temp_storage_bytes, cudaGetErrorString(err));
+  PADDLE_ENFORCE_CUDA_SUCCESS(err);

  Tensor temp_storage;
  temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
@ -165,12 +160,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
        cu_stream);
  }

-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      err,
-      "ArgSortOP failed as could not launch "
-      "cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, "
-      "temp_storage_bytes:%d status:%s.",
-      temp_storage_bytes, cudaGetErrorString(err));
+  PADDLE_ENFORCE_CUDA_SUCCESS(err);
 }

 template <typename T, typename IndType>
--- a/paddle/fluid/operators/fused/conv_fusion_op.cu
+++ b/paddle/fluid/operators/fused/conv_fusion_op.cu
@ -167,13 +167,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
        conv_desc.descriptor<T>(padding_common, strides, dilations);
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnSetConvolutionGroupCount(cudnn_conv_desc,
-                                                         groups),
-        platform::errors::External(
-            "Call of cudnnSetConvolutionGroupCount(cudnn_conv_desc, groups) "
-            "failed, where cudnn_conv_desc is configured: padding = [%s], "
-            "strides = [%s], dilations = [%s]; groups = %d",
-            framework::make_ddim(padding_common), framework::make_ddim(strides),
-            framework::make_ddim(dilations), groups));
+                                                         groups));

    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
        layout, framework::vectorize<int>(transformed_input.dims()));
@ -204,15 +198,8 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
    auto handle = dev_ctx.cudnn_handle();
    auto workspace_handle = dev_ctx.cudnn_workspace_handle();

-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnSetConvolutionMathType(cudnn_conv_desc,
-                                                       CUDNN_DEFAULT_MATH),
-        platform::errors::External(
-            "Call of cudnnSetConvolutionMathType(cudnn_conv_desc, "
-            "CUDNN_DEFAULT_MATH) failed, where cudnn_conv_desc is configured: "
-            "padding = %d, strides = %d, dilations = %d.",
-            framework::make_ddim(padding_common), framework::make_ddim(strides),
-            framework::make_ddim(dilations)));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
+        cudnn_conv_desc, CUDNN_DEFAULT_MATH));

    auto x_dims = framework::vectorize(transformed_input.dims());
    auto f_dims = framework::vectorize(filter->dims());
@ -221,9 +208,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
              handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
              cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit, &algo),
-          platform::errors::External(
-              "Call of cudnnGetConvolutionForwardAlgorithm failed."));
+              workspace_size_limit, &algo));
      VLOG(3) << "cuDNN forward algo " << algo;
    } else {
      std::function<cudnnConvolutionFwdAlgo_t()> search_func =
@ -237,9 +222,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
                  handle, cudnn_input_desc, input_data, cudnn_filter_desc,
                  filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
                  kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
-                  fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit),
-              platform::errors::External(
-                  "Call of cudnnFindConvolutionForwardAlgorithmEx failed."));
+                  fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit));
        };
        workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
        VLOG(3) << "Perf result: (algo: stat, time, memory)";
@ -273,9 +256,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
            handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-            cudnn_output_desc, algo, &workspace_size_in_bytes),
-        platform::errors::External(
-            "Call of cudnnGetConvolutionForwardWorkspaceSize failed."));
+            cudnn_output_desc, algo, &workspace_size_in_bytes));
    PADDLE_ENFORCE_LE(
        workspace_size_in_bytes, workspace_size_limit,
        platform::errors::InvalidArgument(
@ -292,20 +273,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
      // ------------- cudnn conv forward and bias add ---------------------
      ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
      auto cudnn_func = [&](void* cudnn_workspace) {
-        PADDLE_ENFORCE_CUDA_SUCCESS(
-            platform::dynload::cudnnConvolutionForward(
-                handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
-                filter_data, cudnn_conv_desc, algo, cudnn_workspace,
-                workspace_size_in_bytes, &beta, cudnn_output_desc, output_data),
-            platform::errors::External(
-                "Call of cudnnConvolutionForward failed."));
+        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnConvolutionForward(
+            handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
+            filter_data, cudnn_conv_desc, algo, cudnn_workspace,
+            workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
      };
      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnAddTensor(handle, &alpha, cudnn_bias_desc,
-                                            bias_data, &alpha,
-                                            cudnn_output_desc, output_data),
-          platform::errors::External("Call of cudnnAddTensor failed."));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnAddTensor(
+          handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
+          output_data));
    } else {
      if (activation == "identity") {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
@ -320,9 +296,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
                cudnn_filter_desc, filter_data, cudnn_conv_desc, algo,
                cudnn_workspace, workspace_size_in_bytes, &alpha2,
                cudnn_output_desc, residual_data, cudnn_bias_desc, bias_data,
-                cudnn_act_desc, cudnn_output_desc, output_data),
-            platform::errors::External(
-                "Call of cudnnConvolutionBiasActivationForward failed."));
+                cudnn_act_desc, cudnn_output_desc, output_data));
      };
      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
    }
--- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu
+++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu
@ -108,32 +108,21 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
    cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;

    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnCreateTensorDescriptor(&data_desc_)."));
+        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnCreateTensorDescriptor(&bn_param_desc_)."));
+        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));

    VLOG(3) << "Setting descriptors.";
    std::vector<int> dims = {N, C, H, W, D};
    std::vector<int> strides = {H * W * D * C, 1, W * D * C, D * C, C};

-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnSetTensorNdDescriptor(
-            data_desc_, CudnnDataType<T>::type,
-            x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()),
-        platform::errors::External(
-            "The error has happened when calling cudnnSetTensorNdDescriptor."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
+        data_desc_, CudnnDataType<T>::type,
+        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));

    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
-                                                         data_desc_, mode_),
-        platform::errors::External("The error has happened when calling "
-                                   "cudnnDeriveBNTensorDescriptor."));
+                                                         data_desc_, mode_));

    double this_factor = 1. - momentum;
    cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION;
@ -166,10 +155,7 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
                /*yDesc=*/data_desc_,
                /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
                /*activationDesc=*/activation_desc_,
-                /*sizeInBytes=*/&workspace_size),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize."));
+                /*sizeInBytes=*/&workspace_size));

    // -------------- cudnn batchnorm reserve space --------------
    PADDLE_ENFORCE_CUDA_SUCCESS(
@ -179,10 +165,7 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
            /*bnOps=*/bnOps_,
            /*activationDesc=*/activation_desc_,
            /*xDesc=*/data_desc_,
-            /*sizeInBytes=*/&reserve_space_size),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnGetBatchNormalizationTrainingExReserveSpaceSize."));
+            /*sizeInBytes=*/&reserve_space_size));

    reserve_space_ptr = reserve_space->mutable_data(ctx.GetPlace(), x->type(),
                                                    reserve_space_size);
@ -204,22 +187,13 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
            saved_variance->template mutable_data<BatchNormParamType<T>>(
                ctx.GetPlace()),
            activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr,
-            reserve_space_size),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnBatchNormalizationForwardTrainingEx."));
+            reserve_space_size));

    // clean when exit.
    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnDestroyTensorDescriptor(data_desc_)."));
+        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnDestroyTensorDescriptor(bn_param_desc_)."));
+        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
  }
 };

@ -298,15 +272,9 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
    cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;

    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnCreateTensorDescriptor(&data_desc_)."));
+        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnCreateTensorDescriptor(&bn_param_desc_)."));
+        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
      LOG(ERROR) << "Provided epsilon is smaller than "
                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
@ -314,17 +282,12 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
    }
    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);

-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnSetTensorNdDescriptor(
-            data_desc_, CudnnDataType<T>::type,
-            x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()),
-        platform::errors::External(
-            "The error has happened when calling cudnnSetTensorNdDescriptor."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
+        data_desc_, CudnnDataType<T>::type,
+        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
-                                                         data_desc_, mode_),
-        platform::errors::External("The error has happened when calling "
-                                   "cudnnDeriveBNTensorDescriptor."));
+                                                         data_desc_, mode_));

    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
    const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
@ -354,10 +317,7 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
            /*dxDesc=*/data_desc_,
            /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
            /*activationDesc=*/activation_desc_,
-            /*sizeInBytes=*/&workspace_size),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnGetBatchNormalizationBackwardExWorkspaceSize."));
+            /*sizeInBytes=*/&workspace_size));

    workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(),
                                                  workspace_size);
@ -395,21 +355,13 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
            /*workspace=*/workspace_ptr,
            /*workSpaceSizeInBytes=*/workspace_size,
            /*reserveSpace=*/const_cast<T *>(reserve_space->template data<T>()),
-            /*reserveSpaceSizeInBytes=*/reserve_space_size),
-        platform::errors::External("The error has happened when calling "
-                                   "cudnnBatchNormalizationBackwardEx."));
+            /*reserveSpaceSizeInBytes=*/reserve_space_size));

    // clean when exit.
    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnDestroyTensorDescriptor(data_desc_)."));
+        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnDestroyTensorDescriptor(bn_param_desc_)."));
+        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
  }
 };

--- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
+++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
@ -46,13 +46,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
    cudnnTensorDescriptor_t in_desc;
    cudnnTensorDescriptor_t out_desc;
    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&in_desc),
-        platform::errors::External("Create cudnn tensor descriptor failed in "
-                                   "transpose_flatten_concat_fusion op."));
+        platform::dynload::cudnnCreateTensorDescriptor(&in_desc));
    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&out_desc),
-        platform::errors::External("Create cudnn tensor descriptor failed in "
-                                   "transpose_flatten_concat_fusion op."));
+        platform::dynload::cudnnCreateTensorDescriptor(&out_desc));
    cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;

    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
@ -91,24 +87,15 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
        dims_y[i] = 1;
      }

-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnSetTensorNdDescriptor(
-              in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()),
-          platform::errors::External("Create cudnn tensorNd descriptor failed "
-                                     "in transpose_flatten_concat op."));
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnSetTensorNdDescriptor(
-              out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()),
-          platform::errors::External("Create cudnn tensorNd descriptor failed "
-                                     "in transpose_flatten_concat op."));
-
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnTransformTensor(
-              handle, CudnnDataType<T>::kOne(), in_desc,
-              static_cast<const void*>(ins[k]->data<T>()),
-              CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)),
-          platform::errors::External("Create cudnn transform tensor failed in "
-                                     "transpose_flatten_concat op."));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
+          in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
+          out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()));
+
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor(
+          handle, CudnnDataType<T>::kOne(), in_desc,
+          static_cast<const void*>(ins[k]->data<T>()),
+          CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)));
      if (concat_axis == 0) {
        odata += osize;
      } else {
@ -117,13 +104,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
      }
    }
    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(in_desc),
-        platform::errors::External(
-            "Destory cudnn descriptor failed in transpose_flatten_concat op."));
+        platform::dynload::cudnnDestroyTensorDescriptor(in_desc));
    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(out_desc),
-        platform::errors::External(
-            "Destory cudnn descriptor failed in transpose_flatten_concat op."));
+        platform::dynload::cudnnDestroyTensorDescriptor(out_desc));
  }
 };

--- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
@ -60,13 +60,10 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel<T> {
    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
        DataLayout::kNCHW, framework::vectorize<int>(output->dims()));

-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnSpatialTfSamplerForward(
-            handle, cudnn_st_desc, CudnnDataType<T>::kOne(), cudnn_input_desc,
-            input_data, grid_data, CudnnDataType<T>::kZero(), cudnn_output_desc,
-            output_data),
-        platform::errors::InvalidArgument(
-            "cudnnSpatialTfSamplerForward in Op(grid_sampler) failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSpatialTfSamplerForward(
+        handle, cudnn_st_desc, CudnnDataType<T>::kOne(), cudnn_input_desc,
+        input_data, grid_data, CudnnDataType<T>::kZero(), cudnn_output_desc,
+        output_data));
  }
 };

@ -122,9 +119,7 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel<T> {
            input_data, CudnnDataType<T>::kZero(), cudnn_input_grad_desc,
            input_grad_data, CudnnDataType<T>::kOne(), cudnn_output_grad_desc,
            output_grad_data, grid_data, CudnnDataType<T>::kZero(),
-            grid_grad_data),
-        platform::errors::InvalidArgument(
-            "cudnnSpatialTfSamplerBackward in Op(grid_sampler) failed"));
+            grid_grad_data));
  }
 };

--- a/paddle/fluid/operators/math/blas_impl.cu.h
+++ b/paddle/fluid/operators/math/blas_impl.cu.h
@ -41,16 +41,12 @@ struct CUBlas<float> {

  template <typename... ARGS>
  static void SCAL(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cublasSscal(args...),
-        platform::errors::External("dynload cublasSscal lib failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSscal(args...));
  }

  template <typename... ARGS>
  static void VCOPY(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cublasScopy(args...),
-        platform::errors::External("dynload cublasScopy lib failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasScopy(args...));
  }

  template <typename... ARGS>
@ -108,16 +104,12 @@ struct CUBlas<double> {

  template <typename... ARGS>
  static void SCAL(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cublasDscal(args...),
-        platform::errors::External("dynload cublasDscal lib failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDscal(args...));
  }

  template <typename... ARGS>
  static void VCOPY(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cublasDcopy(args...),
-        platform::errors::External("dynload cublasDcopy lib failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDcopy(args...));
  }

  template <typename... ARGS>
--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
@ -59,20 +59,14 @@ class MeanCUDAKernel : public framework::OpKernel<T> {

    auto err = cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, trans_x,
                                      out_data, size_prob, stream);
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        err, platform::errors::External(
-                 "MeanOP failed to get reduce workspace size %s.",
-                 cudaGetErrorString(err)));
+    PADDLE_ENFORCE_CUDA_SUCCESS(err);
    framework::Tensor tmp;
    auto* temp_storage = tmp.mutable_data<uint8_t>(
        framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
        context.GetPlace());
    err = cub::DeviceReduce::Sum(temp_storage, temp_storage_bytes, trans_x,
                                 out_data, size_prob, stream);
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        err, platform::errors::External(
-                 "MeanOP failed to run CUDA reduce computation: %s.",
-                 cudaGetErrorString(err)));
+    PADDLE_ENFORCE_CUDA_SUCCESS(err);
  }
 };

--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@ -104,13 +104,9 @@ void BufferedReader::ReadAsync(size_t i) {
      // gpu memory immediately without waiting gpu kernel ends
      platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaEventRecord(events_[i].get(), compute_stream_),
-          platform::errors::Fatal(
-              "cudaEventRecord raises unexpected exception"));
+          cudaEventRecord(events_[i].get(), compute_stream_));
      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0),
-          platform::errors::Fatal(
-              "cudaStreamWaitEvent raises unexpected exception"));
+          cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0));

      platform::RecordEvent record_event("BufferedReader:MemoryCopy");
      for (size_t i = 0; i < cpu.size(); ++i) {
@ -138,17 +134,11 @@ void BufferedReader::ReadAsync(size_t i) {
                       size);
          memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                       cuda_pinned_place, cuda_pinned_ptr, size, stream_.get());
-          PADDLE_ENFORCE_CUDA_SUCCESS(
-              cudaStreamSynchronize(stream_.get()),
-              platform::errors::Fatal(
-                  "cudaStreamSynchronize raises unexpected exception"));
+          PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get()));
        }
        gpu[i].set_lod(cpu[i].lod());
      }
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaStreamSynchronize(stream_.get()),
-          platform::errors::Fatal(
-              "cudaStreamSynchronize raises unexpected exception"));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get()));
    }
 #endif
    return i;
--- a/paddle/fluid/operators/sync_batch_norm_op.cu.h
+++ b/paddle/fluid/operators/sync_batch_norm_op.cu.h
@ -191,12 +191,9 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx,
    if (comm) {
      int dtype = platform::ToNCCLDataType(mean_out->type());
      // In-place operation
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::ncclAllReduce(stats, stats, 2 * C + 1,
-                                           static_cast<ncclDataType_t>(dtype),
-                                           ncclSum, comm, stream),
-          platform::errors::InvalidArgument(
-              "ncclAllReduce in Op(sync_batch_norm) failed"));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
+          stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
+          comm, stream));
    }
 #endif

@ -468,12 +465,9 @@ void SyncBatchNormGradFunctor(
  if (comm) {
    int dtype = platform::ToNCCLDataType(scale->type());
    // In-place operation
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::ncclAllReduce(stats, stats, 2 * C + 1,
-                                         static_cast<ncclDataType_t>(dtype),
-                                         ncclSum, comm, stream),
-        platform::errors::InvalidArgument(
-            "ncclAllReduce in Op(sync_batch_norm) failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
+        stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
+        comm, stream));
  }
 #endif

--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@ -1,6 +1,6 @@
 proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool)
 proto_library(error_codes_proto SRCS error_codes.proto)
-
+proto_library(cuda_error_proto SRCS cuda_error.proto)

 if (WITH_PYTHON)
  py_proto_compile(profiler_py_proto SRCS profiler.proto)
@ -28,7 +28,7 @@ cc_library(flags SRCS flags.cc DEPS gflags)
 cc_library(errors SRCS errors.cc DEPS error_codes_proto)
 cc_test(errors_test SRCS errors_test.cc DEPS errors enforce)

-cc_library(enforce INTERFACE SRCS enforce.cc DEPS flags errors)
+cc_library(enforce INTERFACE SRCS enforce.cc DEPS flags errors cuda_error_proto)
 cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce)

 set(CPU_INFO_DEPS gflags glog enforce)
--- a/paddle/fluid/platform/cuda_error.proto
+++ b/paddle/fluid/platform/cuda_error.proto
@ -0,0 +1,35 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax = "proto2";
+package paddle.platform.proto;
+
+message MessageDesc {  // One error code paired with its message text.
+  // Numeric code identifying the error
+  required int32 errorCode = 1;
+  // Message text describing the error
+  required string errorMessage = 2;
+}
+
+message AllMessageDesc {  // The set of error messages recorded for one CUDA API version.
+  // Version of the CUDA API
+  required int32 version = 1;
+  // Error messages for the different error types (one MessageDesc each)
+  repeated MessageDesc Messages = 2;
+}
+
+message cudaerrorDesc {  // Top-level container grouping error messages by CUDA version.
+  // Error messages for different CUDA versions (9.0/10.0/10.2)
+  repeated AllMessageDesc AllMessages = 2;  // NOTE(review): field number 1 is unused in this message - confirm intentional
+}
--- a/paddle/fluid/platform/cuda_helper.h
+++ b/paddle/fluid/platform/cuda_helper.h
@ -29,14 +29,7 @@ namespace platform {
 class CublasHandleHolder {
 public:
  CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cublasCreate(&handle_),
-        platform::errors::External(
-            "The cuBLAS library was not initialized. This is usually caused by "
-            "an error in the CUDA Runtime API called by the cuBLAS routine, or "
-            "an error in the hardware setup.\n"
-            "To correct: check that the hardware, an appropriate version of "
-            "the driver, and the cuBLAS library are correctly installed."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));
 #if CUDA_VERSION >= 9000
    if (math_type == CUBLAS_TENSOR_OP_MATH) {
--- a/paddle/fluid/platform/cuda_resource_pool.cc
+++ b/paddle/fluid/platform/cuda_resource_pool.cc
@ -27,18 +27,13 @@ CudaStreamResourcePool::CudaStreamResourcePool() {
      platform::SetDeviceId(dev_idx);
      cudaStream_t stream;
      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking),
-          platform::errors::Fatal(
-              "cudaStreamCreateWithFlags raises unexpected exception"));
+          cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
      return stream;
    };

    auto deleter = [dev_idx](cudaStream_t stream) {
      platform::SetDeviceId(dev_idx);
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaStreamDestroy(stream),
-          platform::errors::Fatal(
-              "cudaStreamDestroy raises unexpected exception"));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream));
    };

    pool_.emplace_back(
@ -72,18 +67,13 @@ CudaEventResourcePool::CudaEventResourcePool() {
      platform::SetDeviceId(dev_idx);
      cudaEvent_t event;
      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaEventCreateWithFlags(&event, cudaEventDisableTiming),
-          platform::errors::Fatal(
-              "cudaEventCreateWithFlags raises unexpected exception"));
+          cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
      return event;
    };

    auto deleter = [dev_idx](cudaEvent_t event) {
      platform::SetDeviceId(dev_idx);
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaEventDestroy(event),
-          platform::errors::Fatal(
-              "cudaEventDestroy raises unexpected exception"));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event));
    };

    pool_.emplace_back(ResourcePool<CudaEventObject>::Create(creator, deleter));
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@ -278,12 +278,9 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
            << "Please recompile or reinstall Paddle with compatible CUDNN "
               "version.";
      }
+      PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
      PADDLE_ENFORCE_CUDA_SUCCESS(
-          dynload::cudnnCreate(&cudnn_handle_),
-          "Failed to create Cudnn handle in DeviceContext");
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          dynload::cudnnSetStream(cudnn_handle_, stream_),
-          "Failed to set stream for Cudnn handle in DeviceContext");
+          dynload::cudnnSetStream(cudnn_handle_, stream_));
    } else {
      cudnn_handle_ = nullptr;
    }
@ -302,8 +299,7 @@ CUDADeviceContext::~CUDADeviceContext() {
  eigen_device_.reset();
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_));
  if (cudnn_handle_) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_),
-                                "Failed to destory Cudnn handle");
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_));
  }
 #if defined(PADDLE_WITH_NCCL)
  if (nccl_comm_) {
@ -325,10 +321,7 @@ void CUDADeviceContext::Wait() const {
  }
 #endif

-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      e_sync, platform::errors::Fatal(
-                  "cudaStreamSynchronize raises error: %s, errono: %d",
-                  cudaGetErrorString(e_sync), static_cast<int>(e_sync)));
+  PADDLE_ENFORCE_CUDA_SUCCESS(e_sync);
 }

 int CUDADeviceContext::GetComputeCapability() const {
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
--- a/paddle/fluid/platform/enforce_test.cc
+++ b/paddle/fluid/platform/enforce_test.cc
@ -261,15 +261,14 @@ TEST(EOF_EXCEPTION, THROW_EOF) {
 #ifdef PADDLE_WITH_CUDA
 template <typename T>
 bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") {
-  PADDLE_ENFORCE_CUDA_SUCCESS(value, msg);
+  PADDLE_ENFORCE_CUDA_SUCCESS(value);
  return true;
 }

 template <typename T>
-bool CheckCudaStatusFailure(
-    T value, const std::string& msg = "self-defined cuda status failed") {
+bool CheckCudaStatusFailure(T value, const std::string& msg) {
  try {
-    PADDLE_ENFORCE_CUDA_SUCCESS(value, msg);
+    PADDLE_ENFORCE_CUDA_SUCCESS(value);
    return false;
  } catch (paddle::platform::EnforceNotMet& error) {
    std::string ex_msg = error.what();
@ -279,24 +278,31 @@ bool CheckCudaStatusFailure(

 TEST(enforce, cuda_success) {
  EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess));
-  EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue));
-  EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation));
+  EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "Cuda error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation, "Cuda error"));

+  int count;
+  PADDLE_ENFORCE(cudaGetDeviceCount(&count));
  EXPECT_TRUE(CheckCudaStatusSuccess(CURAND_STATUS_SUCCESS));
-  EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH));
-  EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED));
+  EXPECT_TRUE(
+      CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH, "Curand error"));
+  EXPECT_TRUE(
+      CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED, "Curand error"));

  EXPECT_TRUE(CheckCudaStatusSuccess(CUDNN_STATUS_SUCCESS));
-  EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED));
-  EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED));
+  EXPECT_TRUE(
+      CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED, "Cudnn error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED, "Cudnn error"));

  EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS));
-  EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED));
-  EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE));
+  EXPECT_TRUE(
+      CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED, "Cublas error"));
+  EXPECT_TRUE(
+      CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE, "Cublas error"));
 #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
  EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess));
-  EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError));
-  EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError));
+  EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Nccl error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "Nccl error"));
 #endif
 }
 #endif
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
--- a/paddle/fluid/platform/profiler_helper.h
+++ b/paddle/fluid/platform/profiler_helper.h
@ -117,10 +117,7 @@ void SynchronizeAllDevice() {
  int count = GetCUDADeviceCount();
  for (int i = 0; i < count; i++) {
    SetDeviceId(i);
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaDeviceSynchronize(),
-        platform::errors::External(
-            "Device synchronize failed in cudaDeviceSynchronize()"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize());
  }
 #endif
 }
--- a/python/setup.py.in
+++ b/python/setup.py.in
@ -213,12 +213,14 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:

 # the prefix is sys.prefix which should always be usr
 paddle_bins = ''
+
 if not '${WIN32}':
    paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
 package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + ('.so' if os.name != 'nt' else '.pyd')]}
 if '${HAS_NOAVX_CORE}' == 'ON':
    package_data['paddle.fluid'] += ['core_noavx' + ('.so' if os.name != 'nt' else '.pyd')]

+
 package_dir={
    '': '${PADDLE_BINARY_DIR}/python',
    # The paddle.fluid.proto will be generated while compiling.
@ -329,6 +331,7 @@ headers = (
    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/string')) +
    list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/platform')) +
    list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/framework')) +
+    list(find_files('*.pb', '${cudaerror_INCLUDE_DIR}')) + # errorMessage.pb for errormessage
    ['${EIGEN_INCLUDE_DIR}/Eigen/Core'] + # eigen
    list(find_files('*', '${EIGEN_INCLUDE_DIR}/Eigen/src')) + # eigen
    list(find_files('*', '${EIGEN_INCLUDE_DIR}/unsupported/Eigen')) + # eigen
@ -400,7 +403,9 @@ class InstallHeaders(Command):
        return self.copy_file(header, install_dir)

    def run(self):
+        # On Windows and macOS, install only the CUDA error-message data file (cudaErrorMessage.pb) and skip the headers
        if os.name == 'nt' or sys.platform == 'darwin':
+            self.mkdir_and_copy_file('${cudaerror_INCLUDE_DIR}/cudaErrorMessage.pb')
            return
        hdrs = self.distribution.headers
        if not hdrs:
--- a/tools/check_api_approvals.sh
+++ b/tools/check_api_approvals.sh
@ -172,8 +172,8 @@ if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
    check_approval 1 6836917 47554610 22561442
 fi

-ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true`
-VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true`
+ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true`
+VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true`
 INVALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" |grep -vxF "$VALID_PADDLE_CHECK" || true`
 if [ "${INVALID_PADDLE_CHECK}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
    echo_line="The error message you wrote in PADDLE_ENFORCE{_**} or PADDLE_THROW does not meet our error message writing specification. Possible errors include 1. the error message is empty / 2. the error message is too short / 3. the error type is not specified. Please read the specification [ https://github.com/PaddlePaddle/Paddle/wiki/Paddle-Error-Message-Writing-Specification ], then refine the error message. If it is a mismatch, please specify chenwhql (Recommend), luotao1 or lanxianghit review and approve.\nThe PADDLE_ENFORCE{_**} or PADDLE_THROW entries that do not meet the specification are as follows:\n${INVALID_PADDLE_CHECK}\n"
--- a/Show More
+++ b/Show More