From 6c6e2e54783b839cdb93c86119bec9f92f6e3182 Mon Sep 17 00:00:00 2001
From: VectorSL
Date: Fri, 4 Dec 2020 12:17:25 +0800
Subject: [PATCH] add trace for gpu error/exception log

---
 .../gpu/arrays/array_reduce_gpu_kernel.h | 33 +++---
 .../gpu/arrays/concatv2_gpu_kernel.h | 7 +-
 .../gpu/arrays/dynamic_shape_gpu_kernel.h | 2 +
 .../gpu/arrays/gathernd_gpu_kernel.h | 7 +-
 .../gpu/arrays/gatherv2_gpu_kernel.h | 7 +-
 .../gpu/arrays/pack_gpu_kernel.h | 4 +-
 .../gpu/arrays/scatter_add_gpu_kernel.h | 4 +-
 .../gpu/arrays/scatter_nd_gpu_kernel.h | 8 +-
 .../gpu/arrays/scatter_update_gpu_kernel.h | 4 +-
 .../gpu/arrays/split_gpu_kernel.h | 4 +-
 .../gpu/arrays/transpose_gpu_kernel.h | 7 +-
 .../gpu/arrays/unique_gpu_kernel.h | 2 +-
 .../gpu/arrays/unpack_gpu_kernel.h | 4 +-
 .../arrays/unsorted_segment_max_gpu_kernel.h | 4 +-
 .../arrays/unsorted_segment_sum_gpu_kernel.h | 3 +-
 .../gpu/control/recv_gpu_kernel.h | 4 +-
 .../gpu/control/send_gpu_kernel.h | 4 +-
 .../gpu/data/dataset_iterator_kernel.cc | 6 +-
 .../backend/kernel_compiler/gpu/gpu_kernel.h | 5 +-
 .../gpu/math/addn_gpu_kernel.h | 21 ++--
 .../gpu/math/bias_add_gpu_kernel.h | 31 ++++--
 .../gpu/math/broadcast_grad_gpu_kernel.h | 7 +-
 .../gpu/math/cast_all_gpu_kernel.h | 5 +-
 .../gpu/math/cholesky_solve_gpu_kernel.h | 18 ++-
 .../gpu/math/cholesky_trsm_solve_gpu_kernel.h | 20 +++-
 .../gpu/math/matmul_gpu_kernel.h | 2 +
 .../gpu/math/update_thor_gradient.h | 3 +
 .../gpu/nccl/nccl_collective_gpu_kernel.h | 11 +-
 .../gpu/nccl/nccl_recv_gpu_kernel.h | 4 +-
 .../gpu/nccl/nccl_send_gpu_kernel.h | 4 +-
 .../gpu/nn/activation_gpu_kernel.h | 23 ++--
 .../gpu/nn/activation_grad_kernel.h | 21 ++--
 .../gpu/nn/batchnorm_grad_gpu_kernel.h | 30 +++--
 .../gpu/nn/bias_add_grad_gpu_kenel.h | 35 ++++--
 .../gpu/nn/conv2d_gpu_kernel.h | 67 +++++++---
 .../gpu/nn/conv2d_grad_filter_gpu_kernel.h | 63 +++++++---
 .../gpu/nn/conv2d_grad_input_gpu_kernel.h | 64 +++++++---
 .../gpu/nn/ctcloss_gpu_kernel.h | 44 +++++---
 .../gpu/nn/fused_batch_norm_ex_gpu_kernel.h | 48 +++++---
 .../gpu/nn/fused_batch_norm_gpu_kernel.h | 30 +++--
 .../nn/fused_batch_norm_grad_ex_gpu_kernel.h | 51 +++++----
 .../gpu/nn/fused_batchnorm_grad_gpu_kernel.h | 30 +++--
 .../gpu/nn/im2col_gpu_kernel.h | 61 ++++++----
 .../gpu/nn/l2normalize_gpu_kernel.h | 33 +++---
 .../gpu/nn/l2normalize_grad_gpu_kernel.h | 41 ++++---
 .../kernel_compiler/gpu/nn/lstm_gpu_kernel.h | 84 ++++++++------
 .../gpu/nn/lstm_grad_data_gpu_kernel.h | 105 +++++++++++-------
 .../gpu/nn/lstm_grad_weight_gpu_kernel.h | 65 ++++++-----
 .../gpu/nn/pooling_gpu_kernel.h | 31 ++++--
 .../gpu/nn/pooling_grad_gpu_kernel.h | 52 +++++----
 ...max_cross_entropy_with_logits_gpu_kernel.h | 14 ++-
 .../gpu/nn/softmax_gpu_kernel.h | 28 +++--
 .../gpu/nn/softmax_grad_gpu_kernel.h | 21 ++--
 ...max_cross_entropy_with_logits_gpu_kernel.h | 14 ++-
 .../gpu/other/assign_gpu_kernel.h | 3 +
 .../gpu_convert_to_dynamic_shape_gpu_kernel.h | 6 +-
 .../quant/batchnorm_fold2_grad_gpu_kernel.h | 6 +-
 .../gpu/quant/batchnorm_fold_gpu_kernel.h | 27 +++--
 .../quant/batchnorm_fold_grad_gpu_kernel.h | 4 +-
 .../quant/fake_quant_perchannel_gpu_kernel.cc | 4 +-
 .../fake_quant_perchannel_grad_gpu_kernel.cc | 4 +-
 .../quant/fake_quant_perlayer_gpu_kernel.cc | 4 +-
 .../fake_quant_perlayer_grad_gpu_kernel.cc | 4 +-
 .../random/random_categorical_gpu_kernel.h | 10 +-
 .../uniform_candidate_sampler_gpu_kernel.h | 7 +-
 .../runtime/device/gpu/blocking_queue.cc | 2 +-
 .../ccsrc/runtime/device/gpu/gpu_common.h | 76 +++++++----
 .../runtime/device/gpu/gpu_device_manager.cc | 20 ++--
.../runtime/device/gpu/gpu_stream_assign.cc | 3 +- .../ccsrc/runtime/device/gpu/queue_common.h | 42 +++++++ 70 files changed, 1014 insertions(+), 508 deletions(-) create mode 100644 mindspore/ccsrc/runtime/device/gpu/queue_common.h diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/array_reduce_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/array_reduce_gpu_kernel.h index f82ec1f556..10c36bafb2 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/array_reduce_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/array_reduce_gpu_kernel.h @@ -56,11 +56,13 @@ class ArrayReduceGpuKernel : public GpuKernel { if (all_match_) { MS_LOG(DEBUG) << "The corresponding dimensions of the input and output tensors all match. No need to call cuDNN kernel."; - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(output_addr, input_addr, inputs[0]->size, cudaMemcpyDeviceToDevice, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(output_addr, input_addr, inputs[0]->size, cudaMemcpyDeviceToDevice, reinterpret_cast(stream_ptr)), "cudaMemcpyAsync failed in ArrayReduceGpuKernel::Launch."); } else { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnReduceTensor(cudnn_handle_, reduce_tensor_descriptor_, nullptr, 0, workspace_addr, workspace_size_, &alpha, inputA_descriptor_, input_addr, &beta, outputC_descriptor_, output_addr), "cudnnReduceTensor failed."); @@ -68,6 +70,7 @@ class ArrayReduceGpuKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); @@ -139,34 +142,35 @@ class ArrayReduceGpuKernel : public GpuKernel { } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_), "cudnnDestroyReduceTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(inputA_descriptor_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(inputA_descriptor_), "cudnnDestroyTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(outputC_descriptor_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(outputC_descriptor_), "cudnnDestroyTensorDescriptor failed."); } protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateReduceTensorDescriptor(&reduce_tensor_descriptor_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateReduceTensorDescriptor(&reduce_tensor_descriptor_), "cudnnCreateReduceTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&inputA_descriptor_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&inputA_descriptor_), "cudnnCreateTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&outputC_descriptor_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&outputC_descriptor_), "cudnnCreateTensorDescriptor failed."); } void InitSizeLists() override { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(inputA_descriptor_, &input_size_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(inputA_descriptor_, 
&input_size_), "cudnnGetTensorSizeInBytes failed."); input_size_list_.push_back(input_size_); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(outputC_descriptor_, &output_size_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(outputC_descriptor_, &output_size_), "cudnnGetTensorSizeInBytes failed."); output_size_list_.push_back(output_size_); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnGetReductionWorkspaceSize(cudnn_handle_, reduce_tensor_descriptor_, inputA_descriptor_, outputC_descriptor_, &workspace_size_), "cudnnGetReductionWorkspaceSize failed."); @@ -185,6 +189,7 @@ class ArrayReduceGpuKernel : public GpuKernel { } CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetReduceTensorDescriptor(reduce_tensor_descriptor_, reduce_tensor_op_, CUDNN_DATA_FLOAT, nan_prop_, reduce_indices_, CUDNN_32BIT_INDICES), "cudnnSetReduceTensorDescriptor failed"); @@ -198,11 +203,12 @@ class ArrayReduceGpuKernel : public GpuKernel { if (input_shape.size() <= split_dim) { ShapeNdTo4d(input_shape, &inputA); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(inputA_descriptor_, CUDNN_TENSOR_NCHW, data_type_, SizeToInt(inputA[0]), SizeToInt(inputA[1]), SizeToInt(inputA[2]), SizeToInt(inputA[3])), "cudnnSetTensor4dDescriptor failed"); } else { - CudnnSetTensorNdDescriptor(input_shape, inputA_descriptor_, data_type_); + CudnnSetTensorNdDescriptor(input_shape, inputA_descriptor_, data_type_, kernel_node_); for (auto dim : input_shape) { inputA.emplace_back(SizeToInt(dim)); } @@ -212,10 +218,10 @@ class ArrayReduceGpuKernel : public GpuKernel { outputC_shape.resize(input_shape.size(), 1); if (outputC_shape.size() <= split_dim) { CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_, 1, 1, 1, 1), + kernel_node_, cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_, 1, 1, 1, 1), "cudnnSetTensor4dDescriptor failed"); } else { - CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_); + CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_, kernel_node_); } for (auto dim : inputA) { @@ -238,11 +244,12 @@ class ArrayReduceGpuKernel : public GpuKernel { if (outputC_shape.size() <= split_dim) { ShapeNdTo4d(outputC_shape, &outputC); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_, SizeToInt(outputC[0]), SizeToInt(outputC[1]), SizeToInt(outputC[2]), SizeToInt(outputC[3])), "cudnnSetTensor4dDescriptor failed"); } else { - CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_); + CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_, kernel_node_); for (auto dim : outputC_shape) { outputC.emplace_back(SizeToInt(dim)); } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/concatv2_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/concatv2_gpu_kernel.h index 14b900351b..4a2001d589 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/concatv2_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/concatv2_gpu_kernel.h @@ -49,10 +49,12 @@ class ConcatV2GpuFwdKernel : public GpuKernel { for (size_t i = 0; i < inputs.size(); i++) { inputs_host_[i] = GetDeviceAddress(inputs, i); } - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(inputs_device, inputs_host_.get(), sizeof(T *) * input_num_, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(inputs_device, inputs_host_.get(), sizeof(T *) 
* input_num_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "ConcatV2 opt cudaMemcpyAsync inputs failed"); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(len_axis_device, len_axis_.get(), sizeof(int) * input_num_, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(len_axis_device, len_axis_.get(), sizeof(int) * input_num_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "ConcatV2 opt cudaMemcpyAsync length on axis failed"); ConcatKernel(output_size_, input_num_, all_size_before_axis_, all_size_axis_, len_axis_device, inputs_device, @@ -60,6 +62,7 @@ class ConcatV2GpuFwdKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; if (!CheckParam(kernel_node)) { return false; } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/dynamic_shape_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/dynamic_shape_gpu_kernel.h index 3721621444..dd15fa7d22 100755 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/dynamic_shape_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/dynamic_shape_gpu_kernel.h @@ -41,6 +41,7 @@ class DynamicShapeGpuKernel : public GpuKernel { S *output_device_address = GetDeviceAddress(outputs, 0); size_t prev_node_output_shape_size = prev_node_output_shape_.size() * sizeof(S); CHECK_CUDA_RET_WITH_EXCEPT( + kernel_node_, cudaMemcpyAsync(output_device_address, prev_node_output_shape_.data(), prev_node_output_shape_size, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cudaMemcpyAsync prev_node_output_shape failed"); @@ -49,6 +50,7 @@ class DynamicShapeGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; size_t input_count = AnfAlgo::GetInputTensorNum(kernel_node); if (input_count != 1) { MS_LOG(EXCEPTION) << input_count << " arguments were provided, but DynamicShapeGpuKernel expects 1."; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gathernd_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gathernd_gpu_kernel.h index d4e8d3d8ad..d95e546aa2 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gathernd_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gathernd_gpu_kernel.h @@ -51,10 +51,12 @@ class GatherNdGpuFwdKernel : public GpuKernel { if (!memcpy_flag_) { const size_t strides_len = sizeof(S) * batch_strides_.size(); const size_t indices_len = sizeof(S) * batch_indices_.size(); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(dev_batch_strides_, &batch_strides_[0], strides_len, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(dev_batch_strides_, &batch_strides_[0], strides_len, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cudaMemcpyAsync failed in GatherNdGpuFwdKernel::Launch."); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(dev_batch_indices_, &batch_indices_[0], indices_len, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(dev_batch_indices_, &batch_indices_[0], indices_len, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cudaMemcpyAsync failed in GatherNdGpuFwdKernel::Launch."); memcpy_flag_ = true; @@ -65,6 +67,7 @@ class GatherNdGpuFwdKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); memcpy_flag_ = false; size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.h 
b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.h index 45f3fb6580..eb4e86d424 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.h @@ -43,10 +43,12 @@ class GatherV2GpuFwdKernel : public GpuKernel { T *output_addr = GetDeviceAddress(outputs, 0); if (is_dynamic_shape_) { int64_t *axis_device_address = GetDeviceAddress(inputs, 2); // only get this if in dynamic mode - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(&axis_, axis_device_address, sizeof(int64_t), cudaMemcpyDeviceToHost, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(&axis_, axis_device_address, sizeof(int64_t), cudaMemcpyDeviceToHost, reinterpret_cast(stream_ptr)), "cudaMemcpyAsync axis_ failed"); - CHECK_CUDA_RET_WITH_EXCEPT(cudaDeviceSynchronize(), "cudaDeviceSyncFailed - GatherV2 - in dynamic mode"); + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaDeviceSynchronize(), + "cudaDeviceSyncFailed - GatherV2 - in dynamic mode"); Reshape(); } auto input_dim1 = input_shapes_[IntToSize(axis_)]; @@ -55,6 +57,7 @@ class GatherV2GpuFwdKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num == 3) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/pack_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/pack_gpu_kernel.h index b0ad39b355..b8da6902eb 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/pack_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/pack_gpu_kernel.h @@ -41,7 +41,8 @@ class PackGpuFwdKernel : public GpuKernel { for (size_t i = 0; i < inputs.size(); i++) { inputs_host_[i] = GetDeviceAddress(inputs, i); } - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(inputs_array, // NOLINT + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(inputs_array, // NOLINT inputs_host_.get(), sizeof(T *) * input_num_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "Pack opt cudaMemcpyAsync inputs failed"); @@ -50,6 +51,7 @@ class PackGpuFwdKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; if (!CheckParam(kernel_node)) { return false; } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_add_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_add_gpu_kernel.h index e59cded87f..0b00babc0b 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_add_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_add_gpu_kernel.h @@ -41,13 +41,15 @@ class ScatterAddKernel : public GpuKernel { T *updates = GetDeviceAddress(inputs, 2); T *output = GetDeviceAddress(outputs, 0); CalScatterAdd(inner_size_, indices_size_, indices, updates, input, reinterpret_cast(stream_ptr)); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(&output[0], &input[0], input_size_ * sizeof(T), cudaMemcpyDeviceToDevice, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(&output[0], &input[0], input_size_ * sizeof(T), cudaMemcpyDeviceToDevice, reinterpret_cast(stream_ptr)), "cudaMemcpyAsync output failed"); return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 3) { MS_LOG(ERROR) << "Input number is " << input_num << ", but ScatterAdd needs 3 
inputs."; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_nd_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_nd_gpu_kernel.h index eb50372a43..281c8a04ee 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_nd_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_nd_gpu_kernel.h @@ -61,16 +61,19 @@ class ScatterNdGpuFwdKernel : public GpuKernel { if (!memcpy_flag_) { const size_t indices_len = sizeof(S) * vec_indices_stride_.size(); const size_t vec_work_len = sizeof(S) * vec_work_shape_.size(); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(indices_stride_, &vec_indices_stride_[0], indices_len, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(indices_stride_, &vec_indices_stride_[0], indices_len, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cudaMemcpy failed in ScatterNdGpuFwdKernel::Launch."); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(work_shape_, &vec_work_shape_[0], vec_work_len, cudaMemcpyHostToDevice, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(work_shape_, &vec_work_shape_[0], vec_work_len, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cudaMemcpy failed in ScatterNdGpuFwdKernel::Launch."); memcpy_flag_ = true; } CHECK_CUDA_RET_WITH_EXCEPT( + kernel_node_, cudaMemsetAsync(output, static_cast(0.0), output_size_, reinterpret_cast(stream_ptr)), "cudaMemSet failed in ScatterNdGpuFwdKernel::Launch."); @@ -83,6 +86,7 @@ class ScatterNdGpuFwdKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; memcpy_flag_ = false; size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 2) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_update_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_update_gpu_kernel.h index f43ad5935e..e0ac72561f 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_update_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_update_gpu_kernel.h @@ -41,13 +41,15 @@ class ScatterUpdateKernel : public GpuKernel { T *updates = GetDeviceAddress(inputs, 2); T *output = GetDeviceAddress(outputs, 0); CalScatterUpdate(inner_size_, indices_size_, indices, updates, input, reinterpret_cast(stream_ptr)); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(&output[0], &input[0], input_size_ * sizeof(T), cudaMemcpyDeviceToDevice, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(&output[0], &input[0], input_size_ * sizeof(T), cudaMemcpyDeviceToDevice, reinterpret_cast(stream_ptr)), "cudaMemcpyAsync output failed"); return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 3) { MS_LOG(ERROR) << "Input number is " << input_num << ", but ScatterUpdate needs 3 inputs."; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h index a36c75c6b5..1d63dcacb9 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h @@ -41,7 +41,8 @@ class SplitGpuFwdKernel : public GpuKernel { for (size_t i = 0; i < outputs.size(); i++) { outputs_host_[i] = GetDeviceAddress(outputs, i); } - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(outputs_device, outputs_host_.get(), sizeof(T *) * output_num_, + 
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(outputs_device, outputs_host_.get(), sizeof(T *) * output_num_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "Split opt cudaMemcpyAsync outputs failed"); SplitKernel(input_size_, axis_step_, all_size_before_axis_, all_size_axis_, input, outputs_device, @@ -50,6 +51,7 @@ class SplitGpuFwdKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; axis_ = static_cast(GetAttr(kernel_node, "axis")); if (axis_ < 0) { auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/transpose_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/transpose_gpu_kernel.h index 6a04cf5beb..0369bf76ef 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/transpose_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/transpose_gpu_kernel.h @@ -40,10 +40,12 @@ class TransposeGpuFwdKernel : public GpuKernel { T *output = GetDeviceAddress(outputs, 0); size_t *input_shape = GetDeviceAddress(workspace, 0); size_t *input_axis = GetDeviceAddress(workspace, 1); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(input_shape, &input_shape_[0], workspace_size_, cudaMemcpyHostToDevice, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(input_shape, &input_shape_[0], workspace_size_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cudaMemcpyAsync input_shape failed"); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(input_axis, &input_axis_[0], workspace_size_, cudaMemcpyHostToDevice, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(input_axis, &input_axis_[0], workspace_size_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cudaMemcpyAsync input_axis failed"); size_t size = input_size_ / sizeof(T); @@ -52,6 +54,7 @@ class TransposeGpuFwdKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 1) { MS_LOG(ERROR) << "Input number is " << input_num << ", but transpose needs 1 input."; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unique_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unique_gpu_kernel.h index e8a6c853c2..913ba45730 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unique_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unique_gpu_kernel.h @@ -60,7 +60,7 @@ class UniqueGpuKernel : public GpuKernel { } void PostExecute() override { - CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(reinterpret_cast(stream_ptr_)), + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(reinterpret_cast(stream_ptr_)), "cudaStreamSynchronized failed"); std::vector type_ids; std::vector> shapes; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unpack_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unpack_gpu_kernel.h index 980cb619e6..2e7e020b55 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unpack_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unpack_gpu_kernel.h @@ -41,7 +41,8 @@ class UnpackGpuFwdKernel : public GpuKernel { for (size_t i = 0; i < outputs.size(); i++) { outputs_host_[i] = GetDeviceAddress(outputs, i); } - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(outputs_array, // NOLINT + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(outputs_array, // NOLINT 
outputs_host_.get(), sizeof(T *) * output_num_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "Unpack opt cudaMemcpyAsync outputs failed"); @@ -50,6 +51,7 @@ class UnpackGpuFwdKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; if (!CheckParam(kernel_node)) { return false; } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_max_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_max_gpu_kernel.h index 2c6fa1a177..2cda65112a 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_max_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_max_gpu_kernel.h @@ -44,7 +44,8 @@ class UnsortedSegmentMaxGpuKernel : public GpuKernel { int *indices_addr = GetDeviceAddress(inputs, 1); T *output_addr = GetDeviceAddress(outputs, 0); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemsetAsync(output_addr, std::numeric_limits::min(), outputs[0]->size, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemsetAsync(output_addr, std::numeric_limits::min(), outputs[0]->size, reinterpret_cast(stream_ptr)), "cudaMemSet Failed"); CalUnsortedSegmentMax(input_addr, indices_addr, num_segments_, outer_size_, inner_size_, output_addr, @@ -53,6 +54,7 @@ class UnsortedSegmentMaxGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; auto input_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); is_null_input_ = CHECK_NULL_INPUT(input_shapes); if (is_null_input_) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_sum_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_sum_gpu_kernel.h index 5e1fbe0993..90d5fd7625 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_sum_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_sum_gpu_kernel.h @@ -44,7 +44,7 @@ class UnsortedSegmentSumGpuKernel : public GpuKernel { T *output_addr = GetDeviceAddress(outputs, 0); CHECK_CUDA_RET_WITH_EXCEPT( - cudaMemsetAsync(output_addr, 0, outputs[0]->size, reinterpret_cast(stream_ptr)), + kernel_node_, cudaMemsetAsync(output_addr, 0, outputs[0]->size, reinterpret_cast(stream_ptr)), "cudaMemSet Failed"); UnsortedSegmentSum(input_dim0_, input_dim1_, output_dim0_, output_dim1_, input_addr, indices_addr, output_addr, reinterpret_cast(stream_ptr)); @@ -52,6 +52,7 @@ class UnsortedSegmentSumGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; auto input_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); is_null_input_ = CHECK_NULL_INPUT(input_shapes); if (is_null_input_) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/control/recv_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/control/recv_gpu_kernel.h index 9d4d5c8cd0..dda155aa57 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/control/recv_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/control/recv_gpu_kernel.h @@ -34,10 +34,12 @@ class RecvGpuKernel : public GpuKernel { bool Launch(const std::vector &, const std::vector &, const std::vector &, void *) override { - CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamWaitEvent(wait_stream_, wait_event_, 0), "Waiting cuda event failed."); + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamWaitEvent(wait_stream_, wait_event_, 0), + "Waiting cuda event failed."); return 
true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; wait_stream_ = reinterpret_cast(GetAttr(kernel_node, "wait_event_stream")); wait_event_ = reinterpret_cast(GetAttr(kernel_node, "wait_event")); InitSizeLists(); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/control/send_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/control/send_gpu_kernel.h index d9b70e5629..cbdc9ca48e 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/control/send_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/control/send_gpu_kernel.h @@ -34,10 +34,12 @@ class SendGpuKernel : public GpuKernel { bool Launch(const std::vector &, const std::vector &, const std::vector &, void *) override { - CHECK_CUDA_RET_WITH_EXCEPT(cudaEventRecord(record_event_, record_stream_), "Recording cuda event failed."); + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaEventRecord(record_event_, record_stream_), + "Recording cuda event failed."); return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; record_stream_ = reinterpret_cast(GetAttr(kernel_node, "record_event_stream")); record_event_ = reinterpret_cast(GetAttr(kernel_node, "record_event")); InitSizeLists(); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.cc index 59269b7703..998ebffede 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.cc @@ -44,6 +44,7 @@ const std::vector &DatasetIteratorKernel::GetOutputSizeList() const { re const std::vector &DatasetIteratorKernel::GetWorkspaceSizeList() const { return workspace_size_list_; } bool DatasetIteratorKernel::Init(const CNodePtr &kernel_node) { + kernel_node_ = kernel_node; queue_name_ = GetAttr(kernel_node, "shared_name"); std::vector> shapes; std::vector> shapes_me = GetAttr>>(kernel_node, "shapes"); @@ -143,13 +144,14 @@ bool DatasetIteratorKernel::Launch(const std::vector &, const std::v for (size_t i = 0; i < output_size_list_.size(); i++) { void *output_addr = GetDeviceAddress(outputs, i); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(output_addr, addr, output_size_list_[i], cudaMemcpyDeviceToDevice, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(output_addr, addr, output_size_list_[i], cudaMemcpyDeviceToDevice, reinterpret_cast(stream)), "Cuda Memcpy Failed"); addr = reinterpret_cast(addr) + output_size_list_[i]; } - CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(reinterpret_cast(stream)), + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(reinterpret_cast(stream)), "cudaStreamSynchronize failed"); (void)GpuBufferMgr::GetInstance().Pop(handle_); return true; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h index 863829e55b..a9d195d106 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h @@ -73,6 +73,7 @@ class GpuKernel : public KernelMod { protected: virtual void InitResource() {} virtual void InitSizeLists() = 0; + CNodePtr kernel_node_; template inline T *GetDeviceAddress(const std::vector &addr_list, size_t index) { @@ -201,7 +202,7 @@ class GpuKernel : public KernelMod { // set the tensor descriptor for cudnn/cublas void CudnnSetTensorNdDescriptor(const std::vector &shape, cudnnTensorDescriptor_t 
descriptor, - cudnnDataType_t data_type) { + cudnnDataType_t data_type, const CNodePtr &node) { if (shape.size() < 3) { MS_EXCEPTION(ValueError) << "cudnnSetTensorNdDescriptor don't support" << shape.size() << "D."; } @@ -224,7 +225,7 @@ class GpuKernel : public KernelMod { stride[i] = stride[i + 1] * SizeToInt(shape[i + 1]); } - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(descriptor, data_type, nbDims, dim, stride), + CHECK_CUDNN_RET_WITH_EXCEPT(node, cudnnSetTensorNdDescriptor(descriptor, data_type, nbDims, dim, stride), "cudnnSetTensorNdDescriptor failed"); delete[] dim; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h index a5f484ce94..b017c741c5 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h @@ -69,19 +69,22 @@ class AddNGpuFwdKernel : public GpuKernel { ElewiseArith(outputs[0]->size / sizeof(T), BROADCAST_TYPE_ADD, input_addr, work_addr, work_addr, reinterpret_cast(stream_ptr)); } else { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnAddTensor(cudnn_handle_, &alpha, input_descriptor_, input_addr, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnAddTensor(cudnn_handle_, &alpha, input_descriptor_, input_addr, &(i > 0 ? alpha : beta), input_descriptor_, work_addr), "cudnnAddTensor failed"); } } if (work_addr != output_addr) { - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(output_addr, work_addr, outputs[0]->size, cudaMemcpyDeviceToDevice, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(output_addr, work_addr, outputs[0]->size, cudaMemcpyDeviceToDevice, reinterpret_cast(stream_ptr)), "Addn cudaMemcpyAsync outputs failed"); } return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); @@ -111,11 +114,13 @@ class AddNGpuFwdKernel : public GpuKernel { } auto input_format = AnfAlgo::GetInputFormat(kernel_node, 0); if (input_format == kOpFormat_NHWC) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NHWC, cudnn_data_type_, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NHWC, cudnn_data_type_, SizeToInt(input_shape.size()), dimA), "cudnnSetTensorNdDescriptor failed"); } else { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, SizeToInt(input_shape.size()), dimA), "cudnnSetTensorNdDescriptor failed"); } @@ -124,17 +129,19 @@ class AddNGpuFwdKernel : public GpuKernel { } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(input_descriptor_), + "cudnnDestroyTensorDescriptor failed"); } protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&input_descriptor_), "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, 
cudnnCreateTensorDescriptor(&input_descriptor_), + "cudnnCreateTensorDescriptor failed"); } void InitSizeLists() override { if (!is_null_input_) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(input_descriptor_, &input_size_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(input_descriptor_, &input_size_), "cudnnGetTensorSizeInBytes failed"); } for (size_t i = 0; i < num_input_; i++) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/bias_add_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/bias_add_gpu_kernel.h index d43d07970d..00d7be7037 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/bias_add_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/bias_add_gpu_kernel.h @@ -57,7 +57,8 @@ class BiasAddGpuKernel : public GpuKernel { try { const float alpha = 1; const float beta = 0; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnOpTensor(cudnn_handle_, op_desc_, &alpha, x_desc_, x_addr, &alpha, b_desc_, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnOpTensor(cudnn_handle_, op_desc_, &alpha, x_desc_, x_addr, &alpha, b_desc_, b_addr, &beta, x_desc_, output_addr), "cudnnOpTensor failed"); } catch (const std::exception &e) { @@ -66,6 +67,7 @@ class BiasAddGpuKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); auto x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); @@ -99,12 +101,15 @@ class BiasAddGpuKernel : public GpuKernel { auto input_device_format = AnfAlgo::GetInputFormat(kernel_node, 0); auto cudnn_cal_format = (input_device_format == kOpFormat_NHWC) ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW; CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensorNdDescriptorEx(x_desc_, cudnn_cal_format, cudnn_data_type_, SizeToInt(cudnn_dims), x_dims.get()), "cudnnSetTensorNdDescriptor failed"); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensorNdDescriptorEx(b_desc_, cudnn_cal_format, cudnn_data_type_, SizeToInt(cudnn_dims), b_dims.get()), "cudnnSetTensorNdDescriptor failed"); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetOpTensorDescriptor(op_desc_, CUDNN_OP_TENSOR_ADD, CUDNN_DATA_FLOAT, CUDNN_NOT_PROPAGATE_NAN), "cudnnSetOpTensorDescriptor failed"); @@ -113,22 +118,30 @@ class BiasAddGpuKernel : public GpuKernel { } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyOpTensorDescriptor(op_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(b_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyOpTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyOpTensorDescriptor(op_desc_), + "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(b_desc_), + "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_), + "cudnnDestroyOpTensorDescriptor failed"); } protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&b_desc_), "cudnnCreateTensorDescriptor failed"); - 
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateOpTensorDescriptor(&op_desc_), "cudnnCreateOpTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&b_desc_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateOpTensorDescriptor(&op_desc_), + "cudnnCreateOpTensorDescriptor failed"); } void InitSizeLists() override { size_t x_size, b_size; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(x_desc_, &x_size), "cudnnGetTensorSizeInBytes failed."); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(b_desc_, &b_size), "cudnnGetTensorSizeInBytes failed."); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(x_desc_, &x_size), + "cudnnGetTensorSizeInBytes failed."); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(b_desc_, &b_size), + "cudnnGetTensorSizeInBytes failed."); input_size_list_.push_back(x_size); input_size_list_.push_back(b_size); output_size_list_.push_back(x_size); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_grad_gpu_kernel.h index 0b1d262a84..b2a173c909 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_grad_gpu_kernel.h @@ -45,9 +45,11 @@ class BroadcastOpGradGpuKernel : public GpuKernel { T *dx1 = GetDeviceAddress(outputs, 0); T *dx2 = GetDeviceAddress(outputs, 1); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemsetAsync(dx1, 0, outputs[0]->size, reinterpret_cast(stream_ptr)), + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemsetAsync(dx1, 0, outputs[0]->size, reinterpret_cast(stream_ptr)), "cudaMemSet Failed"); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemsetAsync(dx2, 0, outputs[1]->size, reinterpret_cast(stream_ptr)), + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemsetAsync(dx2, 0, outputs[1]->size, reinterpret_cast(stream_ptr)), "cudaMemSet Failed"); if (need_broadcast_) { BroadcastGrad(x1_shape_[0], x1_shape_[1], x1_shape_[2], x1_shape_[3], x2_shape_[0], x2_shape_[1], x2_shape_[2], @@ -61,6 +63,7 @@ class BroadcastOpGradGpuKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; GetOpType(kernel_node); auto shape1 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); auto shape2 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cast_all_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cast_all_gpu_kernel.h index 6d9ac9ef33..4e0f59b444 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cast_all_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cast_all_gpu_kernel.h @@ -48,19 +48,22 @@ class CastAllGpuFwdKernel : public GpuKernel { S **outputs_dev = GetDeviceAddress(workspace, 1); size_t *size_dev = GetDeviceAddress(workspace, 2); CHECK_CUDA_RET_WITH_EXCEPT( + kernel_node_, cudaMemcpyAsync(inputs_dev, in_addr.get(), sizeof(T *) * num_input_, cudaMemcpyHostToDevice, stream), "cudaMemCPY failed") CHECK_CUDA_RET_WITH_EXCEPT( + kernel_node_, cudaMemcpyAsync(outputs_dev, out_addr.get(), sizeof(S *) * num_input_, cudaMemcpyHostToDevice, stream), "cudaMemCPY failed") CHECK_CUDA_RET_WITH_EXCEPT( - cudaMemcpyAsync(size_dev, size_.get(), sizeof(size_t) * num_input_, 
cudaMemcpyHostToDevice, stream), + kernel_node_, cudaMemcpyAsync(size_dev, size_.get(), sizeof(size_t) * num_input_, cudaMemcpyHostToDevice, stream), "cudaMemCPY failed") CastAllKernel(inputs_dev, outputs_dev, max_, num_input_, size_dev, stream); return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; num_input_ = GetAttr(kernel_node, "n"); size_ = std::make_unique(num_input_); for (size_t i = 0; i < num_input_; i++) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cholesky_solve_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cholesky_solve_gpu_kernel.h index efc9b976ee..a0356ffac4 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cholesky_solve_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cholesky_solve_gpu_kernel.h @@ -54,18 +54,21 @@ class CholeskyGpuKernel : public GpuKernel { h_array[i] = input1_addr + i * lda_ * m_; h_identity[i] = output_addr + i * ldb_ * m_; CHECK_CUDA_RET_WITH_ERROR( + kernel_node_, cudaMemcpyAsync(output_addr + i * ldb_ * m_, h_identity_data.data(), sizeof(T) * ldb_ * m_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cuda memcopy Fail"); } - CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_, + CHECK_CUDA_RET_WITH_ERROR(kernel_node_, + cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cuda memcopy Fail"); - CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_, + CHECK_CUDA_RET_WITH_ERROR(kernel_node_, + cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cuda memcopy Fail"); CHECK_CUSOLVER_RET_WITH_EXCEPT( - cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_), + kernel_node_, cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_), "cusolver cholesky batched Fail"); TriangleMatrixCopy(input1_addr, output_addr, uplo, outputs[0]->size / sizeof(T), ldb_, m_, reinterpret_cast(stream_ptr)); @@ -79,14 +82,16 @@ class CholeskyGpuKernel : public GpuKernel { Identity(batch_ * split_dim * split_dim, split_dim, output_addr, reinterpret_cast(stream_ptr)); MatrixSplit(batch_ * split_dim * split_dim, split_dim, width, input1_addr, d_batch_input_addr, reinterpret_cast(stream_ptr)); - CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_, + CHECK_CUDA_RET_WITH_ERROR(kernel_node_, + cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cuda memcopy Fail"); - CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_, + CHECK_CUDA_RET_WITH_ERROR(kernel_node_, + cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cuda memcopy Fail"); CHECK_CUSOLVER_RET_WITH_EXCEPT( - cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_), + kernel_node_, cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_), "cusolver cholesky batched Fail"); TriangleMatrixCopy(d_batch_input_addr, output_addr, uplo, outputs[0]->size / sizeof(T), ldb_, m_, reinterpret_cast(stream_ptr)); @@ -95,6 +100,7 @@ class CholeskyGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { + 
kernel_node_ = kernel_node; handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCusolverDnHandle(); blas_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle(); auto in_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cholesky_trsm_solve_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cholesky_trsm_solve_gpu_kernel.h index 8a6883d0de..323c4befeb 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cholesky_trsm_solve_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cholesky_trsm_solve_gpu_kernel.h @@ -51,6 +51,7 @@ class CholeskyTrsmGpuKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCusolverDnHandle(); blas_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle(); auto in_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); @@ -113,21 +114,25 @@ class CholeskyTrsmGpuKernel : public GpuKernel { h_array[i] = input1_addr + i * lda_ * m_; h_identity[i] = output_addr + i * ldb_ * m_; CHECK_CUDA_RET_WITH_ERROR( + kernel_node_, cudaMemcpyAsync(output_addr + i * ldb_ * m_, h_identity_data.data(), sizeof(T) * ldb_ * m_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cuda memcopy Fail"); } - CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_, + CHECK_CUDA_RET_WITH_ERROR(kernel_node_, + cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cuda memcopy Fail"); - CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_, + CHECK_CUDA_RET_WITH_ERROR(kernel_node_, + cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cuda memcopy Fail"); CHECK_CUSOLVER_RET_WITH_EXCEPT( - cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_), + kernel_node_, cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_), "cusolver cholesky batched Fail"); float alpha = 1; CHECK_CUBLAS_RET_WITH_EXCEPT( + kernel_node_, cublasStrsmBatched(blas_handle_, CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, m_, m_, &alpha, d_array_addr, lda_, d_identity_addr, ldb_, batch_), "cublas trsm batched Fail"); @@ -147,17 +152,20 @@ class CholeskyTrsmGpuKernel : public GpuKernel { Identity(batch_ * split_dim * split_dim, split_dim, output_addr, reinterpret_cast(stream_ptr)); MatrixSplit(batch_ * split_dim * split_dim, split_dim, width, input1_addr, d_batch_input_addr, reinterpret_cast(stream_ptr)); - CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_, + CHECK_CUDA_RET_WITH_ERROR(kernel_node_, + cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cuda memcopy Fail"); - CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_, + CHECK_CUDA_RET_WITH_ERROR(kernel_node_, + cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cuda memcopy Fail"); CHECK_CUSOLVER_RET_WITH_EXCEPT( - cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_), + kernel_node_, 
cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_), "cusolver cholesky batched Fail"); float alpha = 1; CHECK_CUBLAS_RET_WITH_EXCEPT( + kernel_node_, cublasStrsmBatched(blas_handle_, CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, m_, m_, &alpha, d_array_addr, lda_, d_identity_addr, ldb_, batch_), "cublas trsm batched Fail"); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/matmul_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/matmul_gpu_kernel.h index f65b6fd10e..66bcbf9149 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/matmul_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/matmul_gpu_kernel.h @@ -71,6 +71,7 @@ class MatMulGpuKernel : public GpuKernel { try { CHECK_CUBLAS_RET_WITH_EXCEPT( + kernel_node_, cublasGemmStridedBatchedEx(handle_, transpose_x2_, transpose_x1_, SizeToInt(n_), SizeToInt(m_), SizeToInt(k_), &alpha, input2_addr, dtype_b_, ldb, stride_b, input1_addr, dtype_a_, lda, stride_a, &beta, output_addr, dtype_c_, ldc, stride_c, batch_, CUDA_R_32F, algo_), @@ -81,6 +82,7 @@ class MatMulGpuKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle(); dtype_a_ = GetCudaDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); dtype_b_ = GetCudaDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 1))); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/update_thor_gradient.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/update_thor_gradient.h index 30ce884e10..4ec8fc70cd 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/update_thor_gradient.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/update_thor_gradient.h @@ -85,6 +85,7 @@ class UpdateThorGradientGpuKernel : public GpuKernel { try { CHECK_CUBLAS_RET_WITH_EXCEPT( + kernel_node_, cublasGemmStridedBatchedEx(handle_, CUBLAS_OP_N, CUBLAS_OP_N, SizeToInt(gradient_size.ori_w), SizeToInt(gradient_size.h), SizeToInt(gradient_size.h), &alpha, input2_addr, gradient_size.dtype, ldb, stride_b, input1_addr, gradient_size.dtype, lda, stride_a, @@ -116,6 +117,7 @@ class UpdateThorGradientGpuKernel : public GpuKernel { r_output_addr = workspace3_addr; } CHECK_CUBLAS_RET_WITH_EXCEPT( + kernel_node_, cublasGemmStridedBatchedEx(handle_, CUBLAS_OP_N, CUBLAS_OP_N, SizeToInt(gradient_size.w), SizeToInt(gradient_size.h), SizeToInt(gradient_size.w), &alpha, input3_addr, gradient_size.dtype, ldb_r, stride_b, r_input_addr, gradient_size.dtype, lda_r, @@ -138,6 +140,7 @@ class UpdateThorGradientGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle(); SetProperty(kernel_node); InitSizeLists(); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_collective_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_collective_gpu_kernel.h index 559fe59ea2..9bd230d134 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_collective_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_collective_gpu_kernel.h @@ -83,6 +83,7 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; nccl_data_type_ = nccl_dtype(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)); 
InferCommType(kernel_node); @@ -133,7 +134,8 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel { cudaStream_t stream = comm_stream_ ? comm_stream_ : reinterpret_cast(stream_ptr); auto all_reduce_funcptr = reinterpret_cast(dlsym(const_cast(collective_handle_), "AllReduce")); MS_EXCEPTION_IF_NULL(all_reduce_funcptr); - CHECK_NCCL_RET_WITH_EXCEPT((*all_reduce_funcptr)(input_addr, output_addr, output_size_ / sizeof(T), nccl_data_type_, + CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_, + (*all_reduce_funcptr)(input_addr, output_addr, output_size_ / sizeof(T), nccl_data_type_, nccl_reduce_type_, stream, group_name_), "ncclAllReduce failed"); } @@ -146,6 +148,7 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel { auto all_gather_funcptr = reinterpret_cast(dlsym(const_cast(collective_handle_), "AllGather")); MS_EXCEPTION_IF_NULL(all_gather_funcptr); CHECK_NCCL_RET_WITH_EXCEPT( + kernel_node_, (*all_gather_funcptr)(input_addr, output_addr, input_size_ / sizeof(T), nccl_data_type_, stream, group_name_), "ncclAllGather failed"); } @@ -158,7 +161,8 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel { auto reduce_scatter_funcptr = reinterpret_cast(dlsym(const_cast(collective_handle_), "ReduceScatter")); MS_EXCEPTION_IF_NULL(reduce_scatter_funcptr); - CHECK_NCCL_RET_WITH_EXCEPT((*reduce_scatter_funcptr)(input_addr, output_addr, output_size_ / sizeof(T), + CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_, + (*reduce_scatter_funcptr)(input_addr, output_addr, output_size_ / sizeof(T), nccl_data_type_, nccl_reduce_type_, stream, group_name_), "ncclReduceScatter failed"); } @@ -173,7 +177,8 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel { for (int i = 0; i < SizeToInt(input_size_list_.size()); ++i) { input_addr = GetDeviceAddress(inputs, i); output_addr = GetDeviceAddress(outputs, i); - CHECK_NCCL_RET_WITH_EXCEPT((*broadcast_funcptr)(input_addr, output_addr, output_size_list_[i] / sizeof(T), + CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_, + (*broadcast_funcptr)(input_addr, output_addr, output_size_list_[i] / sizeof(T), nccl_data_type_, root_, stream, group_name_), "ncclBroadcast failed"); } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_recv_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_recv_gpu_kernel.h index 6c16705dfb..c4fbf42ef7 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_recv_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_recv_gpu_kernel.h @@ -39,7 +39,8 @@ class NcclRecvGpuKernel : public NcclGpuKernel { T *output_addr = GetDeviceAddress(outputs, 0); auto nccl_recv_func = reinterpret_cast(dlsym(const_cast(collective_handle_), "Recv")); MS_EXCEPTION_IF_NULL(nccl_recv_func); - CHECK_NCCL_RET_WITH_EXCEPT((*nccl_recv_func)(output_addr, output_size_list_[0] / sizeof(T), nccl_data_type_, + CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_, + (*nccl_recv_func)(output_addr, output_size_list_[0] / sizeof(T), nccl_data_type_, src_rank_, reinterpret_cast(stream_ptr), group_name_), "ncclRecv failed"); return true; @@ -47,6 +48,7 @@ class NcclRecvGpuKernel : public NcclGpuKernel { bool Init(const CNodePtr &kernel_node) override { MS_EXCEPTION_IF_NULL(kernel_node); + kernel_node_ = kernel_node; size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 0) { MS_LOG(ERROR) << "Input number is " << input_num << ", but NCCL receive needs 0 input."; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_send_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_send_gpu_kernel.h index 
06de4c5001..768cefd8a2 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_send_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_send_gpu_kernel.h @@ -39,7 +39,8 @@ class NcclSendGpuKernel : public NcclGpuKernel { T *input_addr = GetDeviceAddress(inputs, 0); auto nccl_send_func = reinterpret_cast(dlsym(const_cast(collective_handle_), "Send")); MS_EXCEPTION_IF_NULL(nccl_send_func); - CHECK_NCCL_RET_WITH_EXCEPT((*nccl_send_func)(input_addr, input_size_list_[0] / sizeof(T), nccl_data_type_, + CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_, + (*nccl_send_func)(input_addr, input_size_list_[0] / sizeof(T), nccl_data_type_, dest_rank_, reinterpret_cast(stream_ptr), group_name_), "ncclSend failed"); return true; @@ -47,6 +48,7 @@ class NcclSendGpuKernel : public NcclGpuKernel { bool Init(const CNodePtr &kernel_node) override { MS_EXCEPTION_IF_NULL(kernel_node); + kernel_node_ = kernel_node; size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 1) { MS_LOG(ERROR) << "Input number is " << input_num << ", but NCCL send needs 1 input."; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_gpu_kernel.h index 8779ed0a66..78cfbe0f6e 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_gpu_kernel.h @@ -50,7 +50,8 @@ class ActivationGpuFwdKernel : public GpuKernel { } else { const float alpha = 1; const float beta = 0; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnActivationForward(cudnn_handle_, activation_desc_, &alpha, data_descriptor_, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnActivationForward(cudnn_handle_, activation_desc_, &alpha, data_descriptor_, input, &beta, data_descriptor_, output), "cudnnActivationForward failed"); } @@ -58,6 +59,7 @@ class ActivationGpuFwdKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; auto node_name = AnfAlgo::GetCNodeName(kernel_node); auto iter = kernel_map.find(node_name); if (iter == kernel_map.end()) { @@ -85,7 +87,8 @@ class ActivationGpuFwdKernel : public GpuKernel { float alpha = GetAttr(kernel_node, "alpha"); coef = static_cast(alpha); } - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetActivationDescriptor(activation_desc_, mode_, CUDNN_NOT_PROPAGATE_NAN, coef), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetActivationDescriptor(activation_desc_, mode_, CUDNN_NOT_PROPAGATE_NAN, coef), "cudnnSetActivationDescriptor failed"); const int split_dim = 4; @@ -93,17 +96,19 @@ class ActivationGpuFwdKernel : public GpuKernel { ShapeNdTo4d(input_shape, &shape); if (AnfAlgo::GetInputFormat(kernel_node, 0) == kOpFormat_NHWC) { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(data_descriptor_, CUDNN_TENSOR_NHWC, cudnn_data_type_, SizeToInt(shape[0]), SizeToInt(shape[3]), SizeToInt(shape[1]), SizeToInt(shape[2])), "cudnnSetTensor4dDescriptor failed"); } else { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(data_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, SizeToInt(shape[0]), SizeToInt(shape[1]), SizeToInt(shape[2]), SizeToInt(shape[3])), "cudnnSetTensor4dDescriptor failed"); } } else { - CudnnSetTensorNdDescriptor(input_shape, data_descriptor_, cudnn_data_type_); + CudnnSetTensorNdDescriptor(input_shape, data_descriptor_, cudnn_data_type_, kernel_node_); } InitSizeLists(); @@ -111,9 +116,10 @@ class ActivationGpuFwdKernel 
: public GpuKernel { } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyActivationDescriptor(activation_desc_), "cudnnDestroyActivationDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(data_descriptor_), + "cudnnDestroyTensorDescriptor failed"); } void ResetResource() noexcept override { @@ -134,14 +140,15 @@ class ActivationGpuFwdKernel : public GpuKernel { protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&data_descriptor_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateActivationDescriptor(&activation_desc_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&data_descriptor_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateActivationDescriptor(&activation_desc_), "cudnnCreateActivationDescriptor failed"); } void InitSizeLists() override { if (!is_null_input_) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(data_descriptor_, &input_size_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(data_descriptor_, &input_size_), "cudnnGetTensorSizeInBytes failed"); output_size_ = input_size_; } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h index b3d0c55af5..bcfb61c58d 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h @@ -54,6 +54,7 @@ class ActivationGradGpuKernel : public GpuKernel { const float alpha = 1; const float beta = 0; CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnActivationBackward(cudnn_handle_, activation_desc_, &alpha, data_descriptor_, y, data_descriptor_, dy, data_descriptor_, y, &beta, data_descriptor_, dx), "cudnnActivationBackward failed"); @@ -61,6 +62,7 @@ class ActivationGradGpuKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; auto node_name = AnfAlgo::GetCNodeName(kernel_node); auto iter = kernel_map.find(node_name); if (iter == kernel_map.end()) { @@ -85,7 +87,8 @@ class ActivationGradGpuKernel : public GpuKernel { std::vector shape; double coef = (mode_ == CUDNN_ACTIVATION_CLIPPED_RELU) ? 
5.999999 : 0.0; if (mode_ == CUDNN_ACTIVATION_ELU) coef = 1.0; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetActivationDescriptor(activation_desc_, mode_, CUDNN_PROPAGATE_NAN, coef), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetActivationDescriptor(activation_desc_, mode_, CUDNN_PROPAGATE_NAN, coef), "SetActivationDescriptor failed"); const int split_dim = 4; @@ -93,17 +96,19 @@ class ActivationGradGpuKernel : public GpuKernel { ShapeNdTo4d(input_shape, &shape); if (AnfAlgo::GetInputFormat(kernel_node, 0) == kOpFormat_NHWC) { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(data_descriptor_, CUDNN_TENSOR_NHWC, cudnn_data_type_, SizeToInt(shape[0]), SizeToInt(shape[3]), SizeToInt(shape[1]), SizeToInt(shape[2])), "cudnnSetTensor4dDescriptor failed"); } else { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(data_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, SizeToInt(shape[0]), SizeToInt(shape[1]), SizeToInt(shape[2]), SizeToInt(shape[3])), "cudnnSetTensor4dDescriptor failed"); } } else { - CudnnSetTensorNdDescriptor(input_shape, data_descriptor_, cudnn_data_type_); + CudnnSetTensorNdDescriptor(input_shape, data_descriptor_, cudnn_data_type_, kernel_node_); } InitSizeLists(); @@ -111,9 +116,10 @@ class ActivationGradGpuKernel : public GpuKernel { } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyActivationDescriptor(activation_desc_), "cudnnDestroyActivationDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(data_descriptor_), + "cudnnDestroyTensorDescriptor failed"); } void ResetResource() noexcept override { @@ -132,13 +138,14 @@ class ActivationGradGpuKernel : public GpuKernel { protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&data_descriptor_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateActivationDescriptor(&activation_desc_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&data_descriptor_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateActivationDescriptor(&activation_desc_), "cudnnCreateActivationDescriptor failed"); } void InitSizeLists() override { if (!is_null_input_) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(data_descriptor_, &input_size_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(data_descriptor_, &input_size_), "cudnnGetTensorSizeInBytes failed"); } input_size_list_.push_back(input_size_); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/batchnorm_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/batchnorm_grad_gpu_kernel.h index 5185109a90..8ba2678ff3 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/batchnorm_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/batchnorm_grad_gpu_kernel.h @@ -71,6 +71,7 @@ class BatchNormGradGpuKernel : public GpuKernel { const float alpha_param_diff = 1; const float beta_param_diff = 0; CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnBatchNormalizationBackward(handle_, mode_, &alpha_data_diff, &beta_data_diff, &alpha_param_diff, &beta_param_diff, x_desc_, x, dy_desc_, 
dy, dx_desc_, dx, scale_bias_desc_, scale, bn_scale, bn_bias, epsilon_, save_mean, save_variance), @@ -78,6 +79,7 @@ return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); @@ -105,15 +107,19 @@ epsilon_ = GetAttr<float>(kernel_node, "epsilon"); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, batch_, channel_, height_, width_), "Set x desc failed"); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(dy_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, batch_, channel_, height_, width_), "Set dy desc failed"); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(dx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, batch_, channel_, height_, width_), "Set dx desc failed"); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(scale_bias_desc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, channel_, 1, 1), "Set para desc failed"); @@ -122,27 +128,31 @@ } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_desc_), "Destroy para desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(scale_bias_desc_), + "Destroy para desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); } protected: void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dy_desc_), "Create dy desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dx_desc_), "Create dx desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&scale_bias_desc_), "Create para desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dy_desc_), "Create dy desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dx_desc_), "Create dx desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&scale_bias_desc_), + "Create para desc failed"); } void InitSizeLists() override { size_t input_size = 0; size_t para_size = 0; if (!is_null_input_) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(x_desc_, &input_size), "Get input size failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(scale_bias_desc_, &para_size), "Get input size failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetTensorSizeInBytes(x_desc_, &input_size), + "Get input size failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(scale_bias_desc_, &para_size), + "Get input size failed"); } input_size_list_.push_back(input_size); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/bias_add_grad_gpu_kenel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/bias_add_grad_gpu_kenel.h index aa1be7b7b1..935ade363b 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/bias_add_grad_gpu_kenel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/bias_add_grad_gpu_kenel.h @@ -54,11 +54,13 @@ class BiasAddGradGpuKernel : public GpuKernel { const float alpha = 1; const float beta = 0; if (same_dims_) { - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(db_addr, dy_addr, output_size_list_[0], cudaMemcpyDeviceToDevice, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(db_addr, dy_addr, output_size_list_[0], cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)), "cudaMemcpyAsync failed."); } else { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnReduceTensor(cudnn_handle_, op_desc_, indices_addr, workspace_size_list_[0], workspace_addr, workspace_size_list_[1], &alpha, dy_desc_, dy_addr, &beta, db_desc_, db_addr), "cudnnReduceTensor failed"); @@ -67,6 +69,7 @@ return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); auto dy_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); @@ -97,12 +100,15 @@ auto input_device_format = AnfAlgo::GetInputFormat(kernel_node, 0); auto cudnn_cal_format = (input_device_format == kOpFormat_NHWC) ?
CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW; CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensorNdDescriptorEx(dy_desc_, cudnn_cal_format, cudnn_data_type_, SizeToInt(cudnn_dims), dy_dims.get()), "cudnnSetTensorNdDescriptor failed"); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensorNdDescriptorEx(db_desc_, cudnn_cal_format, cudnn_data_type_, SizeToInt(cudnn_dims), db_dims.get()), "cudnnSetTensorNdDescriptor failed"); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetReduceTensorDescriptor(op_desc_, CUDNN_REDUCE_TENSOR_ADD, CUDNN_DATA_FLOAT, CUDNN_NOT_PROPAGATE_NAN, CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES), "cudnnSetReduceTensorDescriptor failed"); @@ -112,32 +118,39 @@ class BiasAddGradGpuKernel : public GpuKernel { } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnDestroyReduceTensorDescriptor(op_desc_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnDestroyReduceTensorDescriptor(op_desc_), "cudnnDestroyReduceTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(db_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyOpTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(db_desc_), + "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dy_desc_), + "cudnnDestroyOpTensorDescriptor failed"); } protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dy_desc_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&db_desc_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateReduceTensorDescriptor(&op_desc_), "cudnnCreateOpTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dy_desc_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&db_desc_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateReduceTensorDescriptor(&op_desc_), + "cudnnCreateOpTensorDescriptor failed"); } void InitSizeLists() override { size_t dy_size, db_size; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(dy_desc_, &dy_size), "cudnnGetTensorSizeInBytes failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(db_desc_, &db_size), "cudnnGetTensorSizeInBytes failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(dy_desc_, &dy_size), + "cudnnGetTensorSizeInBytes failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(db_desc_, &db_size), + "cudnnGetTensorSizeInBytes failed"); input_size_list_.push_back(dy_size); output_size_list_.push_back(db_size); size_t indices_size, workspace_size; CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnGetReductionIndicesSize(cudnn_handle_, op_desc_, dy_desc_, db_desc_, &indices_size), + kernel_node_, cudnnGetReductionIndicesSize(cudnn_handle_, op_desc_, dy_desc_, db_desc_, &indices_size), "cudnnGetReductionIndicesSize failed") CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnGetReductionWorkspaceSize(cudnn_handle_, op_desc_, dy_desc_, db_desc_, &workspace_size), + kernel_node_, cudnnGetReductionWorkspaceSize(cudnn_handle_, op_desc_, dy_desc_, db_desc_, &workspace_size), "cudnnGetReductionWorkspaceSize failed") 
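// [Editor's note] The rewrite in this hunk, and in every hunk of this patch, is mechanical: each
// CHECK_* macro gains the kernel's CNode as a new first argument so a cuDNN/CUDA/NCCL failure can
// be traced back to the graph node that launched it. A minimal sketch of the throwing cuDNN
// variant, assuming the definition in mindspore/ccsrc/runtime/device/gpu/gpu_common.h (which this
// patch touches); the message layout and the trace::DumpSourceLines helper are assumptions, not a
// quote of the implementation:
#define CHECK_CUDNN_RET_WITH_EXCEPT(node, expression, message)                       \
  do {                                                                               \
    cudnnStatus_t status = (expression);                                             \
    if (status != CUDNN_STATUS_SUCCESS) {                                            \
      MS_LOG(EXCEPTION) << "cuDNN Error: " << message << " | Error Number: "         \
                        << status << " " << cudnnGetErrorString(status)              \
                        << trace::DumpSourceLines(node); /* new: graph-node trace */ \
    }                                                                                \
  } while (0);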
workspace_size_list_.push_back(indices_size); workspace_size_list_.push_back(workspace_size); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_gpu_kernel.h index 173a84aecb..ecbb5461e2 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_gpu_kernel.h @@ -88,11 +88,13 @@ class Conv2dGpuFwdKernel : public GpuKernel { reinterpret_cast<cudaStream_t>(stream_ptr)); } CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnConvolutionForward(cudnn_handle_, &alpha, padded_desc_, padded_addr, filter_desc_, filter_addr, conv_desc_, conv_algorithm_, workspace_addr, workspace_size_, &beta, output_desc_, output_addr), "cudnnConvolutionForward failed"); } else { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnConvolutionForward(cudnn_handle_, &alpha, input_desc_, input_addr, filter_desc_, filter_addr, conv_desc_, conv_algorithm_, workspace_addr, workspace_size_, &beta, output_desc_, output_addr), "cudnnConvolutionForward failed"); @@ -101,6 +103,7 @@ return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); if (!CheckParam(kernel_node)) { return false; } @@ -126,7 +129,8 @@ } Set4DDesc(in_shape, filter_shape, output_shape); group_ = static_cast<int>(GetAttr<int64_t>(kernel_node, "group")); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionGroupCount(conv_desc_, group_), "cudnnSetConvGroupCount failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetConvolutionGroupCount(conv_desc_, group_), + "cudnnSetConvGroupCount failed"); std::vector<int> pad_list; std::vector<int64_t> pad_list_me = GetAttr<std::vector<int64_t>>(kernel_node, "pad_list"); (void)std::transform(pad_list_me.begin(), pad_list_me.end(), std::back_inserter(pad_list), @@ -158,11 +162,13 @@ SetDimA(padded_shape, dimA, 4, data_format_); SetStrideA(padded_shape, strideApadded, 4, data_format_); } - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(padded_desc_, cudnn_data_type_, 4, dimA, strideApadded), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensorNdDescriptor(padded_desc_, cudnn_data_type_, 4, dimA, strideApadded), "cudnnSetTensor4dDescriptor failed"); padA[0] = 0; padA[1] = 0; CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT), "cudnnSetConvolutionNdDescriptor failed"); input_descriptor_real = padded_desc_; @@ -174,12 +180,13 @@ padA[0] = pad_height_; padA[1] = pad_width_; CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT), "cudnnSetConvolution2dDescriptor failed"); input_descriptor_real = input_desc_; } if (cudnn_data_type_ == CUDNN_DATA_HALF) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH), "cudnnSetConvolutionMathType failed.") } SelectAlgorithm(input_descriptor_real); @@ -188,34 +195,46 @@ void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_,
cudnnDestroyConvolutionDescriptor(conv_desc_), "cudnnDestroyConvolutionDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(filter_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyFilterDescriptor(filter_desc_), + "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(padded_desc_), + "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(output_desc_), + "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(input_desc_), + "cudnnDestroyTensorDescriptor failed"); } protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&input_desc_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&output_desc_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&padded_desc_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateFilterDescriptor(&filter_desc_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateConvolutionDescriptor(&conv_desc_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&input_desc_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&output_desc_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&padded_desc_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateFilterDescriptor(&filter_desc_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateConvolutionDescriptor(&conv_desc_), "cudnnCreateConvolutionDescriptor failed"); } void InitSizeLists() override { if (!is_null_input_) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(input_desc_, reinterpret_cast(&input_size_)), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetTensorSizeInBytes(input_desc_, reinterpret_cast(&input_size_)), "cudnnGetTensorSizeInBytes failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetFilterSizeInBytes(filter_desc_, reinterpret_cast(&filter_size_)), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetFilterSizeInBytes(filter_desc_, reinterpret_cast(&filter_size_)), "cudnnGetFilterSizeInBytes failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(output_desc_, reinterpret_cast(&output_size_)), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetTensorSizeInBytes(output_desc_, reinterpret_cast(&output_size_)), "cudnnGetTensorSizeInBytes failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(padded_desc_, reinterpret_cast(&padded_size_)), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetTensorSizeInBytes(padded_desc_, reinterpret_cast(&padded_size_)), "cudnnGetTensorSizeInBytes failed"); } input_size_list_.push_back(input_size_); @@ -223,6 +242,7 @@ class Conv2dGpuFwdKernel : public 
GpuKernel { output_size_list_.push_back(output_size_); if (use_pad_ && !is_null_input_) { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle_, padded_desc_, filter_desc_, conv_desc_, output_desc_, conv_algorithm_, &workspace_size_), "cudnnGetConvolutionForwardWorkspaceSize failed"); @@ -230,6 +250,7 @@ class Conv2dGpuFwdKernel : public GpuKernel { } else { if (!is_null_input_) { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle_, input_desc_, filter_desc_, conv_desc_, output_desc_, conv_algorithm_, &workspace_size_), "cudnnGetConvolutionForwardWorkspaceSize failed"); @@ -269,18 +290,21 @@ class Conv2dGpuFwdKernel : public GpuKernel { int filterDimA[4]; // OHWI for NHWC; OIHW for NCHW SetDimA(filter_shape, filterDimA, 4, data_format_); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(input_desc_, cudnn_data_type_, nbDims, dimA, strideAin), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensorNdDescriptor(input_desc_, cudnn_data_type_, nbDims, dimA, strideAin), "cudnnSetTensor4dDescriptor failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetFilterNdDescriptor(filter_desc_, cudnn_data_type_, compute_format_, nbDims, filterDimA), + kernel_node_, cudnnSetFilterNdDescriptor(filter_desc_, cudnn_data_type_, compute_format_, nbDims, filterDimA), "cudnnSetFilter4dDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(output_desc_, cudnn_data_type_, nbDims, dimAout, strideAout), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensorNdDescriptor(output_desc_, cudnn_data_type_, nbDims, dimAout, strideAout), "cudnnSetTensor4dDescriptor failed"); } void SelectAlgorithm(cudnnTensorDescriptor_t input_descriptor_real) { if (group_ > 1 || CUDNN_MAJOR < 7) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetConvolutionForwardAlgorithm( + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetConvolutionForwardAlgorithm( cudnn_handle_, input_descriptor_real, filter_desc_, conv_desc_, output_desc_, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, 0, &conv_algorithm_), "cudnnGetConvolutionForwardAlgorithm failed"); @@ -289,6 +313,7 @@ class Conv2dGpuFwdKernel : public GpuKernel { int returned_algo_count; cudnnConvolutionFwdAlgoPerf_t perf_results; CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnGetConvolutionForwardAlgorithm_v7(cudnn_handle_, input_descriptor_real, filter_desc_, conv_desc_, output_desc_, requested_algo_count, &returned_algo_count, &perf_results), "cudnnGetConvolutionForwardAlgorithm_v7 failed"); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_filter_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_filter_gpu_kernel.h index f9b916dd2f..8423b0a0b7 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_filter_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_filter_gpu_kernel.h @@ -90,18 +90,21 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { reinterpret_cast(stream_ptr)); } CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnConvolutionBackwardFilter(cudnn_handle_, &alpha, padded_descriptor_, padded, dy_desc_, dy, conv_desc_, algo_, work_space, workspace_size_, &beta, dw_desc_, dw), "ConvolutionBackwardFilter failed"); return true; } CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnConvolutionBackwardFilter(cudnn_handle_, &alpha, x_desc_, x, dy_desc_, dy, conv_desc_, algo_, work_space, workspace_size_, &beta, dw_desc_, dw), "ConvolutionBackwardFilter failed"); return true; } bool 
Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); if (!CheckParam(kernel_node)) { return false; @@ -128,7 +131,8 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { SetNCHW(in_shape, &n_, &c_, &old_height_, &old_width_, data_format_); Set4DDesc(dy_shape, filter_shape, in_shape); group_ = static_cast(GetAttr(kernel_node, "group")); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionGroupCount(conv_desc_, group_), "cudnnSetConvGroupCount failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetConvolutionGroupCount(conv_desc_, group_), + "cudnnSetConvGroupCount failed"); std::vector pad_list; std::vector pad_list_me = GetAttr>(kernel_node, "pad_list"); @@ -165,11 +169,12 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { SetStrideA(padded_shape, strideApadded, 4, data_format_); } CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensorNdDescriptor(padded_descriptor_, cudnn_data_type_, 4, dimA, strideApadded), + kernel_node_, cudnnSetTensorNdDescriptor(padded_descriptor_, cudnn_data_type_, 4, dimA, strideApadded), "cudnnSetTensor4dDescriptor failed"); padA[0] = 0; padA[1] = 0; CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT), "cudnnSetConvolutionNdDescriptor failed"); x_desc_real = padded_descriptor_; @@ -181,12 +186,13 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { padA[0] = pad_height_; padA[1] = pad_width_; CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT), "cudnnSetConvolution2dDescriptor failed"); x_desc_real = x_desc_; } if (cudnn_data_type_ == CUDNN_DATA_HALF) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH), "cudnnSetConvolutionMathType failed.") } SelectAlgorithm(x_desc_real); @@ -195,31 +201,42 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyConvolutionDescriptor(conv_desc_), "cudnnDestroyConvolutionDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(dw_desc_), "cudnnDestroyFilterDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyFilterDescriptor(dw_desc_), + "cudnnDestroyFilterDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(padded_descriptor_), + "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dy_desc_), + "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_), + "cudnnDestroyTensorDescriptor failed"); } protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_), "cudnnCreateTensorDescriptor failed"); - 
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dy_desc_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&padded_descriptor_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateFilterDescriptor(&dw_desc_), "cudnnCreateFilterDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateConvolutionDescriptor(&conv_desc_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dy_desc_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&padded_descriptor_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateFilterDescriptor(&dw_desc_), + "cudnnCreateFilterDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateConvolutionDescriptor(&conv_desc_), "cudnnCreateConvolutionDescriptor failed"); } void InitSizeLists() override { if (!is_null_input_) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(dy_desc_, reinterpret_cast(&dy_size_)), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetTensorSizeInBytes(dy_desc_, reinterpret_cast(&dy_size_)), "cudnnGetTensorSizeInBytes failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(x_desc_, reinterpret_cast(&input_size_)), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetTensorSizeInBytes(x_desc_, reinterpret_cast(&input_size_)), "cudnnGetTensorSizeInBytes failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetFilterSizeInBytes(dw_desc_, reinterpret_cast(&output_size_)), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetFilterSizeInBytes(dw_desc_, reinterpret_cast(&output_size_)), "cudnnGetFilterSizeInBytes failed"); } input_size_list_.push_back(dy_size_); @@ -228,9 +245,10 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { if (use_pad_ && !is_null_input_) { CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnGetTensorSizeInBytes(padded_descriptor_, reinterpret_cast(&padded_size_)), + kernel_node_, cudnnGetTensorSizeInBytes(padded_descriptor_, reinterpret_cast(&padded_size_)), "cudnnGetTensorSizeInBytes failed"); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle_, padded_descriptor_, dy_desc_, conv_desc_, dw_desc_, algo_, reinterpret_cast(&workspace_size_)), "cudnnGetConvolutionBackwardFilterWorkspaceSize failed"); @@ -238,6 +256,7 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { } else { if (!is_null_input_) { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle_, x_desc_, dy_desc_, conv_desc_, dw_desc_, algo_, reinterpret_cast(&workspace_size_)), "cudnnGetConvolutionBackwardFilterWorkspaceSize failed"); @@ -263,6 +282,7 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { void SelectAlgorithm(cudnnTensorDescriptor_t x_desc_real) { if (group_ > 1 || CUDNN_MAJOR < 7) { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle_, x_desc_real, dy_desc_, conv_desc_, dw_desc_, CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, 0, &algo_), "GetConvolutionBackwardFilterAlgorithm failed"); @@ -271,6 +291,7 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { int returned_algo_count; cudnnConvolutionBwdFilterAlgoPerf_t perf_results; CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, 
cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnn_handle_, x_desc_real, dy_desc_, conv_desc_, dw_desc_, requested_algo_count, &returned_algo_count, &perf_results), "GetConvolutionBackwardFilterAlgorithm failed"); @@ -299,12 +320,14 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { // filter shape relued by format_attr_. In native mode it's OHWI. In transpose mode it's OIHW. int filterDimA[4]; SetDimA(filter_shape, filterDimA, 4, format_attr_); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(dy_desc_, cudnn_data_type_, nbDims, dimAdy, strideAdy), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensorNdDescriptor(dy_desc_, cudnn_data_type_, nbDims, dimAdy, strideAdy), "cudnnSetTensorNdDescriptor failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetFilterNdDescriptor(dw_desc_, cudnn_data_type_, compute_format_, nbDims, filterDimA), + kernel_node_, cudnnSetFilterNdDescriptor(dw_desc_, cudnn_data_type_, compute_format_, nbDims, filterDimA), "cudnnSetFilterNdDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(x_desc_, cudnn_data_type_, nbDims, dimA, strideAin), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensorNdDescriptor(x_desc_, cudnn_data_type_, nbDims, dimA, strideAin), "cudnnSetTensorNdDescriptor failed"); } void SetStrideAndDilation(const CNodePtr &kernel_node) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_input_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_input_gpu_kernel.h index 1504312b92..6057fb9e0a 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_input_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_input_gpu_kernel.h @@ -81,6 +81,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { T *padded = GetDeviceAddress(workspace, 1); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnConvolutionBackwardData(cudnn_handle_, &alpha, w_desc_, w, dy_desc_, dy, conv_desc_, algo_, work_space, workspace_size_, &beta_, padded_descriptor_, padded), "ConvolutionBackwardData failed"); @@ -93,6 +94,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { } } else { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnConvolutionBackwardData(cudnn_handle_, &alpha, w_desc_, w, dy_desc_, dy, conv_desc_, algo_, work_space, workspace_size_, &beta_, dx_desc_, dx), "ConvolutionBackwardData failed"); @@ -100,6 +102,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); if (!CheckParam(kernel_node)) { return false; @@ -131,7 +134,8 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { Set4DDesc(dy_shape, input_shape, filter_shape); group_ = static_cast(GetAttr(kernel_node, "group")); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionGroupCount(conv_desc_, group_), "cudnnSetConvGroupCount failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetConvolutionGroupCount(conv_desc_, group_), + "cudnnSetConvGroupCount failed"); std::vector pad_list; std::vector pad_list_me = GetAttr>(kernel_node, "pad_list"); @@ -168,11 +172,12 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { SetStrideA(padded_shape, strideApadded, 4, data_format_); } CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensorNdDescriptor(padded_descriptor_, cudnn_data_type_, 4, dimA, strideApadded), + kernel_node_, cudnnSetTensorNdDescriptor(padded_descriptor_, cudnn_data_type_, 4, dimA, strideApadded), "cudnnSetTensor4dDescriptor failed"); padA[0] = 0; padA[1] = 0; 
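// [Editor's note] Teardown paths are handled differently: DestroyResource() is declared noexcept,
// so the patch threads kernel_node_ into CHECK_CUDNN_RET_WITH_ERROR, which logs and continues
// instead of throwing. A sketch under the same assumptions as the EXCEPT sketch above:
#define CHECK_CUDNN_RET_WITH_ERROR(node, expression, message)                      \
  do {                                                                             \
    cudnnStatus_t status = (expression);                                           \
    if (status != CUDNN_STATUS_SUCCESS) {                                          \
      MS_LOG(ERROR) << "cuDNN Error: " << message << " | Error Number: " << status \
                    << " " << cudnnGetErrorString(status)                          \
                    << trace::DumpSourceLines(node);                               \
    }                                                                              \
  } while (0);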
CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT), "cudnnSetConvolutionNdDescriptor failed"); dx_desc_real = padded_descriptor_; @@ -184,12 +189,13 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { padA[0] = pad_height_; padA[1] = pad_width_; CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT), "cudnnSetConvolution2dDescriptor failed"); dx_desc_real = dx_desc_; } if (cudnn_data_type_ == CUDNN_DATA_HALF) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH), "cudnnSetConvolutionMathType failed.") } SelectAlgorithm(dx_desc_real); @@ -199,29 +205,39 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyConvolutionDescriptor(conv_desc_), "cudnnDestroyConvolutionDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "cudnnDestroyFilterDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyFilterDescriptor(w_desc_), + "cudnnDestroyFilterDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(padded_descriptor_), + "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dy_desc_), + "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dx_desc_), + "cudnnDestroyTensorDescriptor failed"); } protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dx_desc_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dy_desc_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&padded_descriptor_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateFilterDescriptor(&w_desc_), "cudnnCreateFilterDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateConvolutionDescriptor(&conv_desc_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dx_desc_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dy_desc_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&padded_descriptor_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateFilterDescriptor(&w_desc_), + "cudnnCreateFilterDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateConvolutionDescriptor(&conv_desc_), "cudnnCreateConvolutionDescriptor failed"); } void InitSizeLists() override { if (!is_null_input_) { - 
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(dy_desc_, &dy_size_), "cudnnGetTensorSizeInBytes failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetFilterSizeInBytes(w_desc_, &w_size_), "cudnnGetTensorSizeInBytes failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(dx_desc_, &output_size_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(dy_desc_, &dy_size_), + "cudnnGetTensorSizeInBytes failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetFilterSizeInBytes(w_desc_, &w_size_), + "cudnnGetTensorSizeInBytes failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(dx_desc_, &output_size_), "cudnnGetTensorSizeInBytes failed"); } input_size_list_.push_back(dy_size_); @@ -229,17 +245,19 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { output_size_list_.push_back(output_size_); if (use_pad_ && !is_null_input_) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(padded_descriptor_, &padded_size_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(padded_descriptor_, &padded_size_), "cudnnGetTensorSizeInBytes failed"); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle_, w_desc_, dy_desc_, conv_desc_, padded_descriptor_, algo_, &workspace_size_), "cudnnGetConvolutionBackwardDataWorkspaceSize failed"); workspace_size_list_.push_back(padded_size_); } else { if (!is_null_input_) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetConvolutionBackwardDataWorkspaceSize( + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetConvolutionBackwardDataWorkspaceSize( cudnn_handle_, w_desc_, dy_desc_, conv_desc_, dx_desc_, algo_, &workspace_size_), "cudnnGetConvolutionBackwardDataWorkspaceSize failed"); } @@ -270,6 +288,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { void SelectAlgorithm(cudnnTensorDescriptor_t dx_desc_real) { if (group_ > 1 || CUDNN_MAJOR < 7) { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle_, w_desc_, dy_desc_, conv_desc_, dx_desc_real, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, 0, &algo_), "cudnnGetConvolutionBackwardDataAlgorithm failed"); @@ -278,6 +297,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { int returned_algo_count; cudnnConvolutionBwdDataAlgoPerf_t perf_results; CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnn_handle_, w_desc_, dy_desc_, conv_desc_, dx_desc_real, requested_algo_count, &returned_algo_count, &perf_results), "cudnnGetConvolutionBackwardDataAlgorithm_v7 failed"); @@ -306,12 +326,14 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { SetStrideA(dy_shape, strideAdy, 4, data_format_); SetDimA(filter_shape, filterDimA, 4, data_format_); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(dy_desc_, cudnn_data_type_, nbDims, dimAdy, strideAdy), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensorNdDescriptor(dy_desc_, cudnn_data_type_, nbDims, dimAdy, strideAdy), "cudnnSetTensorNdDescriptor failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetFilterNdDescriptor(w_desc_, cudnn_data_type_, compute_format_, nbDims, filterDimA), + kernel_node_, cudnnSetFilterNdDescriptor(w_desc_, cudnn_data_type_, compute_format_, nbDims, filterDimA), "cudnnSetFilterNdDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(dx_desc_, cudnn_data_type_, nbDims, dimA, strideAin), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensorNdDescriptor(dx_desc_, cudnn_data_type_, nbDims, dimA, 
strideAin), "cudnnSetTensorNdDescriptor failed"); } void SetStrideAndDilation(const CNodePtr &kernel_node) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/ctcloss_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/ctcloss_gpu_kernel.h index 4b6f4bf60b..15b7d7538a 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/ctcloss_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/ctcloss_gpu_kernel.h @@ -50,6 +50,7 @@ class CtcLossGpuKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); auto probs_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); if (probs_shape.size() != 3) { @@ -116,9 +117,9 @@ class CtcLossGpuKernel : public GpuKernel { cudaStream_t stream = reinterpret_cast(stream_ptr); CalculateMaxSequence(sequence_length, max_labels_length, batch, stream); CHECK_CUDA_RET_WITH_EXCEPT( - cudaMemcpyAsync(&max_sequence, max_labels_length, sizeof(int), cudaMemcpyDeviceToHost, stream), + kernel_node_, cudaMemcpyAsync(&max_sequence, max_labels_length, sizeof(int), cudaMemcpyDeviceToHost, stream), "cudaMemcpyAsync failed."); - CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed."); + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(stream), "cudaStreamSynchronize failed."); if (max_time < max_sequence) { MS_LOG(EXCEPTION) << "max_time should be greater than sequence length."; } @@ -128,9 +129,9 @@ class CtcLossGpuKernel : public GpuKernel { CalculatePreLength(label_squence_length, precum_labels_length, cum_labels_length, max_labels_length, label_indices, batch, label_size_ / sizeof(int), stream); CHECK_CUDA_RET_WITH_EXCEPT( - cudaMemcpyAsync(&batch_label, max_labels_length, sizeof(int), cudaMemcpyDeviceToHost, stream), + kernel_node_, cudaMemcpyAsync(&batch_label, max_labels_length, sizeof(int), cudaMemcpyDeviceToHost, stream), "cudaMemcpyAsync failed."); - CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed."); + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(stream), "cudaStreamSynchronize failed."); if (batch != batch_label + 1) { MS_LOG(EXCEPTION) << "label batch should be equal to input batch."; } @@ -141,9 +142,10 @@ class CtcLossGpuKernel : public GpuKernel { batch, stream); } CHECK_CUDA_RET_WITH_EXCEPT( + kernel_node_, cudaMemcpyAsync(&max_labels_length_host, max_labels_length, sizeof(int), cudaMemcpyDeviceToHost, stream), "cudaMemcpyAsync failed."); - CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed."); + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(stream), "cudaStreamSynchronize failed."); } void LaunchSecondHalf(const std::vector &inputs, const std::vector &workspace, @@ -175,7 +177,7 @@ class CtcLossGpuKernel : public GpuKernel { CTCLoss(log_alpha_b, log_beta_b, softmax_probs, label_value_with_blank, batch, SOffSet, max_time, numclass, sequence_length, label_squence_length, cum_labels_length, costs, grads, prob_num, ignore_longer_outputs_than_inputs_, stream); - CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed."); + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(stream), "cudaStreamSynchronize failed."); FreeMem(label_value_with_blank, log_alpha_b, log_beta_b); } @@ -197,39 +199,45 @@ class CtcLossGpuKernel : public GpuKernel { } void MemsetForWS(int *label_value_pcr, int *cum_labels_length, int *label_squence_length, T *costs, T 
*grads, cudaStream_t stream) { - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemsetAsync(label_value_pcr, static_cast<int>(0), label_size_, stream), + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaMemsetAsync(label_value_pcr, static_cast<int>(0), label_size_, stream), "cudaMemSet failed in CtcLossGpuKernel::Launch."); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemsetAsync(cum_labels_length, static_cast<int>(0), squence_lengths_size_, stream), + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemsetAsync(cum_labels_length, static_cast<int>(0), squence_lengths_size_, stream), "cudaMemSet failed in CtcLossGpuKernel::Launch."); CHECK_CUDA_RET_WITH_EXCEPT( - cudaMemsetAsync(label_squence_length, static_cast<int>(0), squence_lengths_size_, stream), + kernel_node_, cudaMemsetAsync(label_squence_length, static_cast<int>(0), squence_lengths_size_, stream), "cudaMemSet failed in CtcLossGpuKernel::Launch."); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemsetAsync(costs, static_cast<int>(0), probs_dims_[1] * sizeof(T), stream), + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemsetAsync(costs, static_cast<int>(0), probs_dims_[1] * sizeof(T), stream), "cudaMemSet failed in CtcLossGpuKernel::Launch."); CHECK_CUDA_RET_WITH_EXCEPT( + kernel_node_, cudaMemsetAsync(grads, static_cast<int>(0), probs_dims_[0] * probs_dims_[1] * probs_dims_[2] * sizeof(T), stream), "cudaMemSet failed in CtcLossGpuKernel::Launch."); } void MemManageForCus(T **log_alpha_b, T **log_beta_b, int **label_value_with_blank, int *cum_labels_length, int log_prob_size, int batch, cudaStream_t stream) { int total_labels_size_host = 0; - CHECK_CUDA_RET_WITH_EXCEPT(cudaMalloc(reinterpret_cast<void **>(log_alpha_b), sizeof(T) * log_prob_size), + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMalloc(reinterpret_cast<void **>(log_alpha_b), sizeof(T) * log_prob_size), "cudaMalloc failed."); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMalloc(reinterpret_cast<void **>(log_beta_b), sizeof(T) * log_prob_size), - "cudaMalloc failed."); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(&total_labels_size_host, cum_labels_length + batch - 1, sizeof(int), + CHECK_CUDA_RET_WITH_EXCEPT( + kernel_node_, cudaMalloc(reinterpret_cast<void **>(log_beta_b), sizeof(T) * log_prob_size), "cudaMalloc failed."); + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(&total_labels_size_host, cum_labels_length + batch - 1, sizeof(int), cudaMemcpyDeviceToHost, stream), "cudaMemcpyAsync failed."); - CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed."); + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(stream), "cudaStreamSynchronize failed."); CHECK_CUDA_RET_WITH_EXCEPT( + kernel_node_, cudaMalloc(reinterpret_cast<void **>(label_value_with_blank), sizeof(int) * (2 * total_labels_size_host + batch)), "cudaMalloc failed."); } void FreeMem(int *label_value_with_blank, T *log_alpha_b, T *log_beta_b) { - CHECK_CUDA_RET_WITH_EXCEPT(cudaFree(label_value_with_blank), "cudaFree failed."); - CHECK_CUDA_RET_WITH_EXCEPT(cudaFree(log_alpha_b), "cudaFree failed."); - CHECK_CUDA_RET_WITH_EXCEPT(cudaFree(log_beta_b), "cudaFree failed."); + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaFree(label_value_with_blank), "cudaFree failed."); + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaFree(log_alpha_b), "cudaFree failed."); + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaFree(log_beta_b), "cudaFree failed."); } std::vector<size_t> input_size_list_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_ex_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_ex_gpu_kernel.h index e8564d354c..07f7eb7a14 100644 ---
a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_ex_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_ex_gpu_kernel.h @@ -82,6 +82,7 @@ class FusedBatchNormExGpuKernel : public GpuKernel { const float alpha = 1; const float beta = 0; CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnBatchNormalizationForwardTrainingEx(handle_, mode_, bn_ops_, &alpha, &beta, x_desc_, x, z_desc_, z, y_desc_, y, scale_bias_mean_var_desc_, scale, bias, exp_avg_factor_, runing_mean, runnig_variance, epsilon_, save_mean, save_variance, activation_desc_, @@ -91,6 +92,7 @@ class FusedBatchNormExGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; MS_EXCEPTION_IF_NULL(kernel_node); std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); if (kernel_name == kFusedBatchNormEx) { @@ -141,15 +143,16 @@ class FusedBatchNormExGpuKernel : public GpuKernel { } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), + "Destroy para desc failed"); if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(z_desc_), "Destroy z desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(z_desc_), "Destroy z desc failed"); } if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyActivationDescriptor(activation_desc_), "Destroy activation descriptor failed"); } } @@ -157,35 +160,41 @@ class FusedBatchNormExGpuKernel : public GpuKernel { protected: void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&y_desc_), "Create y desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&y_desc_), "Create y desc failed"); if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&z_desc_), "Create z desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&z_desc_), "Create z desc failed"); } - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&scale_bias_mean_var_desc_), "Create para desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&scale_bias_mean_var_desc_), + "Create para desc failed"); if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateActivationDescriptor(&activation_desc_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateActivationDescriptor(&activation_desc_), "Create activation descriptor failed"); } } void InitSizeLists() 
override { if (!is_null_input_) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(x_desc_, &input_x_size_), "Get input x size failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(scale_bias_mean_var_desc_, &para_size_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(x_desc_, &input_x_size_), + "Get input x size failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(scale_bias_mean_var_desc_, &para_size_), "Get para size failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(y_desc_, &output_size_), "Get output size failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(y_desc_, &output_size_), + "Get output size failed"); if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(z_desc_, &input_z_size_), "Get input z size failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(z_desc_, &input_z_size_), + "Get input z size failed"); } - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( handle_, mode_, bn_ops_, x_desc_, z_desc_, y_desc_, scale_bias_mean_var_desc_, activation_desc_, &workspace_size_), "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetBatchNormalizationTrainingExReserveSpaceSize( handle_, mode_, bn_ops_, activation_desc_, x_desc_, &reserve_size_), "Get reserve size failed"); } @@ -228,25 +237,28 @@ class FusedBatchNormExGpuKernel : public GpuKernel { } CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensor4dDescriptor(x_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width), + kernel_node_, cudnnSetTensor4dDescriptor(x_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width), "Set x desc failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensor4dDescriptor(y_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width), + kernel_node_, cudnnSetTensor4dDescriptor(y_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width), "Set y desc failed"); if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(z_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width), "Set z desc failed"); } CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(scale_bias_mean_var_desc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, channel, 1, 1), "Set para desc failed"); if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetActivationDescriptor(activation_desc_, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0), "cudnnSetActivationDescriptor failed"); } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.h index 695c453e64..8fb9a93139 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.h @@ -69,12 +69,14 @@ class FusedBatchNormGpuKernel : public GpuKernel { auto save_mean = GetDeviceAddress<float>(outputs, 3); auto save_variance = GetDeviceAddress<float>(outputs, 4); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_,
cudnnBatchNormalizationForwardTraining(handle_, mode_, &alpha, &beta, x_desc_, x, y_desc_, y, scale_bias_mean_var_desc_, scale, bias, exp_avg_factor_, runing_mean, runnig_variance, epsilon_, save_mean, save_variance), "Kernel launch failed"); } else { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnBatchNormalizationForwardInference(handle_, mode_, &alpha, &beta, x_desc_, x, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnBatchNormalizationForwardInference(handle_, mode_, &alpha, &beta, x_desc_, x, y_desc_, y, scale_bias_mean_var_desc_, scale, bias, runing_mean, runnig_variance, epsilon_), "Kernel launch failed"); @@ -82,6 +84,7 @@ class FusedBatchNormGpuKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); @@ -117,14 +120,17 @@ } CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(x_desc_, cudnn_format, cudnn_data_type_, batch_, channel_, height_, width_), "Set x desc failed"); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(y_desc_, cudnn_format, cudnn_data_type_, batch_, channel_, height_, width_), "Set y desc failed"); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(scale_bias_mean_var_desc_, cudnn_format, CUDNN_DATA_FLOAT, 1, channel_, 1, 1), "Set para desc failed"); @@ -134,27 +140,31 @@ } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), + "Destroy para desc failed"); } protected: void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&y_desc_), "Create y desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&scale_bias_mean_var_desc_), "Create para desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&y_desc_), "Create y desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&scale_bias_mean_var_desc_), + "Create para desc failed"); } void InitSizeLists() override { size_t input_size = 0; size_t para_size = 0; size_t output_size = 0; if (!is_null_input_) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(x_desc_, &input_size), "Get input size failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(scale_bias_mean_var_desc_, &para_size), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(x_desc_, &input_size), + "Get input size failed"); + 
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(scale_bias_mean_var_desc_, &para_size), + "Get para size failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(y_desc_, &output_size), "Get para size failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(y_desc_, &output_size), "Get para size failed"); } input_size_list_.push_back(input_size); input_size_list_.push_back(para_size); // scale diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_grad_ex_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_grad_ex_gpu_kernel.h index 7704dccec0..de06ba7d96 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_grad_ex_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_grad_ex_gpu_kernel.h @@ -92,7 +92,8 @@ class FusedBatchNormGradExGpuKernel : public GpuKernel { const float alpha_data_diff = 1; const float alpha_param_diff = 1; const float beta_param_diff = 0; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnBatchNormalizationBackwardEx( + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnBatchNormalizationBackwardEx( handle_, mode_, bn_ops_, &alpha_data_diff, &beta_data_diff_, &alpha_param_diff, &beta_param_diff, x_desc_, x, y_desc_, y, dy_desc_, dy, dz_desc_, dz, dx_desc_, dx, scale_bias_diff_desc_, scale, bias, dscale, dbias, epsilon_, save_mean, save_variance, @@ -102,6 +103,7 @@ class FusedBatchNormGradExGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; MS_EXCEPTION_IF_NULL(kernel_node); std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); if (kernel_name == kFusedBatchNormGradEx) { @@ -154,28 +156,30 @@ class FusedBatchNormGradExGpuKernel : public GpuKernel { protected: void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed"); if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&y_desc_), "Create y desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateActivationDescriptor(&activation_desc_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&y_desc_), "Create y desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateActivationDescriptor(&activation_desc_), "Create activation descriptor failed"); } - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dy_desc_), "Create dy desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dy_desc_), "Create dy desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dx_desc_), "Create dx desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dx_desc_), "Create dx desc failed"); if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dz_desc_), "Create dz desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dz_desc_), "Create dz desc failed"); } - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&scale_bias_diff_desc_), "Create para desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&scale_bias_diff_desc_), + "Create para desc failed"); } void InitSizeLists() override { if (!is_null_input_) { - 
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(x_desc_, &x_size_), "Get x size failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(scale_bias_diff_desc_, &para_size_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(x_desc_, &x_size_), "Get x size failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(scale_bias_diff_desc_, &para_size_), "Get para size failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetBatchNormalizationBackwardExWorkspaceSize( + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetBatchNormalizationBackwardExWorkspaceSize( handle_, mode_, bn_ops_, x_desc_, y_desc_, dy_desc_, dz_desc_, dx_desc_, scale_bias_diff_desc_, activation_desc_, &workspace_size_), "cudnnGetBatchNormalizationBackwardExWorkspaceSize failed"); @@ -202,19 +206,20 @@ class FusedBatchNormGradExGpuKernel : public GpuKernel { workspace_size_list_.push_back(workspace_size_); } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyActivationDescriptor(activation_desc_), "Destroy activation descriptor failed"); } - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dz_desc_), "Destroy z desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dz_desc_), "Destroy z desc failed"); } - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_diff_desc_), "Destroy para desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(scale_bias_diff_desc_), + "Destroy para desc failed"); } private: @@ -236,35 +241,39 @@ class FusedBatchNormGradExGpuKernel : public GpuKernel { } CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensor4dDescriptor(x_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width), + kernel_node_, cudnnSetTensor4dDescriptor(x_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width), "Set x desc failed"); if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(y_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width), "Set z desc failed"); } CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensor4dDescriptor(dy_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width), + kernel_node_, cudnnSetTensor4dDescriptor(dy_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width), "Set dy desc failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensor4dDescriptor(dx_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width), + kernel_node_, 
cudnnSetTensor4dDescriptor(dx_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width), "Set dx desc failed"); if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(dz_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width), "Set z desc failed"); } CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(scale_bias_diff_desc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, channel, 1, 1), "Set para desc failed"); if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetActivationDescriptor(activation_desc_, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0), "cudnnSetActivationDescriptor failed"); } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.h index c459df03cf..c348747085 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.h @@ -67,6 +67,7 @@ class FusedBatchNormGradGpuKernel : public GpuKernel { const float alpha_param_diff = 1; const float beta_param_diff = 0; CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnBatchNormalizationBackward(handle_, mode_, &alpha_data_diff, &beta_data_diff, &alpha_param_diff, &beta_param_diff, x_desc_, x, dy_desc_, dy, dx_desc_, dx, scale_bias_desc_, scale, bn_scale, bn_bias, epsilon_, save_mean, save_variance), @@ -74,6 +75,7 @@ class FusedBatchNormGradGpuKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); @@ -101,15 +103,19 @@ class FusedBatchNormGradGpuKernel : public GpuKernel { epsilon_ = GetAttr(kernel_node, "epsilon"); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, batch_, channel_, height_, width_), "Set x desc failed"); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(dy_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, batch_, channel_, height_, width_), "Set dy desc failed"); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(dx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, batch_, channel_, height_, width_), "Set dx desc failed"); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(scale_bias_desc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, channel_, 1, 1), "Set para desc failed"); @@ -118,27 +124,31 @@ class FusedBatchNormGradGpuKernel : public GpuKernel { } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_desc_), "Destroy para desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(scale_bias_desc_), + "Destroy para desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, 
cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); } protected: void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dy_desc_), "Create dy desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dx_desc_), "Create dx desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&scale_bias_desc_), "Create para desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dy_desc_), "Create dy desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dx_desc_), "Create dx desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&scale_bias_desc_), + "Create para desc failed"); } void InitSizeLists() override { size_t input_size = 0; size_t para_size = 0; if (!is_null_input_) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(x_desc_, &input_size), "Get input size failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(scale_bias_desc_, &para_size), "Get input size failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(x_desc_, &input_size), + "Get input size failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(scale_bias_desc_, &para_size), + "Get input size failed"); } input_size_list_.push_back(input_size); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/im2col_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/im2col_gpu_kernel.h index fa27e77d8e..95fc16cf95 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/im2col_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/im2col_gpu_kernel.h @@ -70,17 +70,18 @@ class Im2ColGpuFwdKernel : public GpuKernel { old_width_ + pad_width_, pad_top_, pad_left_, pad_value_, padded_addr, reinterpret_cast<cudaStream_t>(stream_ptr)); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnIm2Col(cudnn_handle_, padded_desc_, padded_addr, filter_desc_, conv_desc_, output_addr), + kernel_node_, cudnnIm2Col(cudnn_handle_, padded_desc_, padded_addr, filter_desc_, conv_desc_, output_addr), "cudnnIm2ColForward failed"); } else { CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnIm2Col(cudnn_handle_, input_desc_, input_addr, filter_desc_, conv_desc_, output_addr), + kernel_node_, cudnnIm2Col(cudnn_handle_, input_desc_, input_addr, filter_desc_, conv_desc_, output_addr), "cudnnIm2ColForward failed"); } return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); if (!CheckParam(kernel_node)) { return false; @@ -98,7 +99,8 @@ class Im2ColGpuFwdKernel : public GpuKernel { return true; } Set4DDesc(in_shape, filter_shape, output_shape); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionGroupCount(conv_desc_, 1), "cudnnSetConvGroupCount failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetConvolutionGroupCount(conv_desc_, 1), + "cudnnSetConvGroupCount failed"); pad_height_ = static_cast<int>(GetAttr<int64_t>(kernel_node, "pad")); pad_width_ = pad_height_; pad_mode_ = GetAttr<std::string>(kernel_node, "pad_mode"); @@ -111,12 +113,13 @@ class Im2ColGpuFwdKernel : public GpuKernel { pad_width_ = 0; } CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, 
cudnnSetConvolution2dDescriptor(conv_desc_, pad_height_, pad_width_, stride_[2], stride_[3], dilation_[2], dilation_[3], CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT), "cudnnSetConvolution2dDescriptor failed"); } if (cudnn_data_type_ == CUDNN_DATA_HALF) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH), "cudnnSetConvolutionMathType failed.") } InitSizeLists(); @@ -124,32 +127,43 @@ class Im2ColGpuFwdKernel : public GpuKernel { } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyConvolutionDescriptor(conv_desc_), "cudnnDestroyConvolutionDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(filter_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyFilterDescriptor(filter_desc_), + "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(padded_desc_), + "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(output_desc_), + "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(input_desc_), + "cudnnDestroyTensorDescriptor failed"); } protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&input_desc_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&output_desc_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&padded_desc_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateFilterDescriptor(&filter_desc_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateConvolutionDescriptor(&conv_desc_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&input_desc_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&output_desc_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&padded_desc_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateFilterDescriptor(&filter_desc_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateConvolutionDescriptor(&conv_desc_), "cudnnCreateConvolutionDescriptor failed"); } void InitSizeLists() override { if (!is_null_input_) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(input_desc_, reinterpret_cast<size_t *>(&input_size_)), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetTensorSizeInBytes(input_desc_, reinterpret_cast<size_t *>(&input_size_)), "cudnnGetTensorSizeInBytes failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(output_desc_, reinterpret_cast<size_t *>(&output_size_)), + 
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetTensorSizeInBytes(output_desc_, reinterpret_cast<size_t *>(&output_size_)), "cudnnGetTensorSizeInBytes failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(padded_desc_, reinterpret_cast<size_t *>(&padded_size_)), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetTensorSizeInBytes(padded_desc_, reinterpret_cast<size_t *>(&padded_size_)), "cudnnGetTensorSizeInBytes failed"); } input_size_list_.push_back(input_size_); @@ -196,10 +210,12 @@ class Im2ColGpuFwdKernel : public GpuKernel { use_pad_ = false; } - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(padded_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, n_, c_, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensor4dDescriptor(padded_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, n_, c_, old_height_ + pad_height_, old_width_ + pad_width_), "cudnnSetTensor4dDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolution2dDescriptor( + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetConvolution2dDescriptor( conv_desc_, use_pad_ ? 0 : pad_top_, use_pad_ ? 0 : pad_left_, stride_[2], stride_[3], dilation_[2], dilation_[3], CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT), "cudnnSetConvolution2dDescriptor failed"); @@ -208,17 +224,20 @@ class Im2ColGpuFwdKernel : public GpuKernel { void Set4DDesc(const std::vector<size_t> &in_shape, const std::vector<size_t> &filter_shape, const std::vector<size_t> &output_shape) { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(input_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, SizeToInt(in_shape[0]), SizeToInt(in_shape[1]), SizeToInt(in_shape[2]), SizeToInt(in_shape[3])), "cudnnSetTensor4dDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetFilter4dDescriptor(filter_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, 1, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetFilter4dDescriptor(filter_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, 1, SizeToInt(in_shape[1]), filter_shape[0], filter_shape[1]), "cudnnSetFilter4dDescriptor failed"); auto out_H = output_shape[0] * output_shape[1] * output_shape[2]; auto out_W = output_shape[3] * output_shape[4] * output_shape[5]; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(output_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensor4dDescriptor(output_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, SizeToInt(out_H), SizeToInt(out_W), 1, 1), "cudnnSetTensor4dDescriptor failed"); } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/l2normalize_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/l2normalize_gpu_kernel.h index 65875f9f04..78ec17b743 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/l2normalize_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/l2normalize_gpu_kernel.h @@ -66,11 +66,13 @@ class L2NormalizeGpuKernel : public GpuKernel { const float beta = 0; if (all_match_) { - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(reduce_workspace_addr, input_addr, input_size_list_[0], + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(reduce_workspace_addr, input_addr, input_size_list_[0], cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)), "cudaMemcpyAsync failed in L2Normalize::Launch."); } else { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnReduceTensor(cudnn_handle_, reduce_tensor_descriptor_, nullptr, 0, workspace_addr, workspace_size_, &alpha, inputA_descriptor_, input_addr, &beta, outputC_descriptor_, reduce_workspace_addr), "cudnnReduceTensor failed."); @@ -84,6 +86,7 @@ class 
L2NormalizeGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); if (!CheckIONumber(kernel_node)) { @@ -142,25 +145,26 @@ class L2NormalizeGpuKernel : public GpuKernel { protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateReduceTensorDescriptor(&reduce_tensor_descriptor_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateReduceTensorDescriptor(&reduce_tensor_descriptor_), "cudnnCreateReduceTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&inputA_descriptor_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&inputA_descriptor_), "cudnnCreateTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&outputC_descriptor_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&outputC_descriptor_), "cudnnCreateTensorDescriptor failed."); } void InitSizeLists() override { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(inputA_descriptor_, &input_size_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(inputA_descriptor_, &input_size_), "cudnnGetTensorSizeInBytes failed."); input_size_list_.push_back(input_size_); output_size_list_.push_back(output_size_); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(outputC_descriptor_, &workspace_size_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(outputC_descriptor_, &workspace_size_), "cudnnGetTensorSizeInBytes failed."); workspace_size_list_.push_back(workspace_size_); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnGetReductionWorkspaceSize(cudnn_handle_, reduce_tensor_descriptor_, inputA_descriptor_, outputC_descriptor_, &workspace_size_), "cudnnGetReductionWorkspaceSize failed."); @@ -184,15 +188,16 @@ class L2NormalizeGpuKernel : public GpuKernel { return true; } void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_), "cudnnDestroyReduceTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(inputA_descriptor_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(inputA_descriptor_), "cudnnDestroyTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(outputC_descriptor_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(outputC_descriptor_), "cudnnDestroyTensorDescriptor failed."); } void InferArrayReduceType(const CNodePtr &kernel_node) { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetReduceTensorDescriptor(reduce_tensor_descriptor_, CUDNN_REDUCE_TENSOR_NORM2, CUDNN_DATA_FLOAT, nan_prop_, reduce_indices_, CUDNN_32BIT_INDICES), "cudnnSetReduceTensorDescriptor failed"); @@ -205,11 +210,12 @@ class L2NormalizeGpuKernel : public GpuKernel { if (input_shape.size() <= split_dim) { ShapeNdTo4d(input_shape, &inputA); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(inputA_descriptor_, CUDNN_TENSOR_NCHW, data_type_, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensor4dDescriptor(inputA_descriptor_, CUDNN_TENSOR_NCHW, data_type_, inputA[0], inputA[1], inputA[2], inputA[3]), "cudnnSetTensor4dDescriptor failed"); } 
else { - CudnnSetTensorNdDescriptor(input_shape, inputA_descriptor_, data_type_); + CudnnSetTensorNdDescriptor(input_shape, inputA_descriptor_, data_type_, kernel_node_); for (auto dim : input_shape) { inputA.emplace_back(dim); } @@ -219,11 +225,12 @@ class L2NormalizeGpuKernel : public GpuKernel { if (outputC_shape.size() <= split_dim) { ShapeNdTo4d(outputC_shape, &outputC); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_, outputC[0], outputC[1], outputC[2], outputC[3]), "cudnnSetTensor4dDescriptor failed"); } else { - CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_); + CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_, kernel_node_); for (auto dim : outputC_shape) { outputC.emplace_back(dim); } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/l2normalize_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/l2normalize_grad_gpu_kernel.h index 39e334f3b3..451303d79b 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/l2normalize_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/l2normalize_grad_gpu_kernel.h @@ -71,11 +71,13 @@ class L2NormalizeGradGpuKernel : public GpuKernel { const float beta = 0; if (all_match_) { - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(reduce_workspace_addr, x_addr, input_size_list_[0], + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(reduce_workspace_addr, x_addr, input_size_list_[0], cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)), "cudaMemcpyAsync failed in L2Normalize::Launch."); } else { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnReduceTensor(cudnn_handle_, reduce_tensor_descriptor_, nullptr, 0, workspace_addr, workspace_size_list_[2], &alpha, inputA_descriptor_, x_addr, &beta, outputC_descriptor_, reduce_workspace_addr), "cudnnReduceTensor failed."); @@ -85,11 +87,13 @@ class L2NormalizeGradGpuKernel : public GpuKernel { BroadcastArith(output_shape_, output_shape_, output_shape_, BROADCAST_TYPE_MUL, y_addr, dy_addr, dx_addr, reinterpret_cast<cudaStream_t>(stream_ptr)); if (all_match_) { - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(reduce_y_dy_workspace_addr, dx_addr, output_size_list_[0], + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(reduce_y_dy_workspace_addr, dx_addr, output_size_list_[0], cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)), "cudaMemcpyAsync failed in L2Normalize::Launch."); } else { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnReduceTensor(cudnn_handle_, reduce_sum_tensor_descriptor_, nullptr, 0, workspace_y_dy_addr, workspace_size_list_[3], &alpha, inputA_descriptor_, dx_addr, &beta, outputC_descriptor_, reduce_y_dy_workspace_addr), @@ -124,6 +128,7 @@ class L2NormalizeGradGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); if (!CheckIONumber(kernel_node)) { @@ -187,13 +192,13 @@ class L2NormalizeGradGpuKernel : public GpuKernel { } void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateReduceTensorDescriptor(&reduce_tensor_descriptor_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateReduceTensorDescriptor(&reduce_tensor_descriptor_), 
"cudnnCreateReduceTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateReduceTensorDescriptor(&reduce_sum_tensor_descriptor_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateReduceTensorDescriptor(&reduce_sum_tensor_descriptor_), "cudnnCreateReduceTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&inputA_descriptor_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&inputA_descriptor_), "cudnnCreateTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&outputC_descriptor_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&outputC_descriptor_), "cudnnCreateTensorDescriptor failed."); } void InitSizeLists() override { @@ -207,18 +212,20 @@ class L2NormalizeGradGpuKernel : public GpuKernel { output_size_list_.push_back(output_size_); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(outputC_descriptor_, &workspace_size_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(outputC_descriptor_, &workspace_size_), "cudnnGetTensorSizeInBytes failed."); workspace_size_list_.push_back(workspace_size_); workspace_size_list_.push_back(workspace_size_); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnGetReductionWorkspaceSize(cudnn_handle_, reduce_tensor_descriptor_, inputA_descriptor_, outputC_descriptor_, &workspace_size_), "cudnnGetReductionWorkspaceSize failed."); workspace_size_list_.push_back(workspace_size_); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnGetReductionWorkspaceSize(cudnn_handle_, reduce_sum_tensor_descriptor_, inputA_descriptor_, outputC_descriptor_, &workspace_size_), "cudnnGetReductionWorkspaceSize failed."); @@ -229,21 +236,23 @@ class L2NormalizeGradGpuKernel : public GpuKernel { private: void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_), "cudnnDestroyReduceTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyReduceTensorDescriptor(reduce_sum_tensor_descriptor_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyReduceTensorDescriptor(reduce_sum_tensor_descriptor_), "cudnnDestroyReduceTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(inputA_descriptor_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(inputA_descriptor_), "cudnnDestroyTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(outputC_descriptor_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(outputC_descriptor_), "cudnnDestroyTensorDescriptor failed."); } void InferArrayReduceType(const CNodePtr &kernel_node) { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetReduceTensorDescriptor(reduce_tensor_descriptor_, CUDNN_REDUCE_TENSOR_NORM2, CUDNN_DATA_FLOAT, nan_prop_, reduce_indices_, CUDNN_32BIT_INDICES), "cudnnSetReduceTensorDescriptor failed"); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetReduceTensorDescriptor(reduce_sum_tensor_descriptor_, CUDNN_REDUCE_TENSOR_ADD, CUDNN_DATA_FLOAT, nan_prop_, reduce_indices_, CUDNN_32BIT_INDICES), "cudnnSetReduceTensorDescriptor failed"); @@ -256,11 +265,12 @@ class L2NormalizeGradGpuKernel : public GpuKernel { if (input_shape.size() <= split_dim) { ShapeNdTo4d(input_shape, &inputA); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(inputA_descriptor_, CUDNN_TENSOR_NCHW, 
data_type_, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensor4dDescriptor(inputA_descriptor_, CUDNN_TENSOR_NCHW, data_type_, inputA[0], inputA[1], inputA[2], inputA[3]), "cudnnSetTensor4dDescriptor failed"); } else { - CudnnSetTensorNdDescriptor(input_shape, inputA_descriptor_, data_type_); + CudnnSetTensorNdDescriptor(input_shape, inputA_descriptor_, data_type_, kernel_node_); for (auto dim : input_shape) { inputA.emplace_back(dim); } @@ -270,11 +280,12 @@ class L2NormalizeGradGpuKernel : public GpuKernel { if (outputC_shape.size() <= split_dim) { ShapeNdTo4d(outputC_shape, &outputC); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_, outputC[0], outputC[1], outputC[2], outputC[3]), "cudnnSetTensor4dDescriptor failed"); } else { - CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_); + CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_, kernel_node_); for (auto dim : outputC_shape) { outputC.emplace_back(dim); } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_gpu_kernel.h index 60cb4e43dd..605b4c6f97 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_gpu_kernel.h @@ -74,12 +74,13 @@ class LstmGpuKernel : public GpuKernel { if (!states_init_) { CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, states_addr, output_size_list_[4], 0), + kernel_node_, cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, states_addr, output_size_list_[4], 0), "set dropout_desc failed"); states_init_ = true; } CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnRNNForwardTraining(handle_, rnn_desc_, seq_len_, x_desc_.get(), x_addr, hx_desc_, hx_addr, cx_desc_, cx_addr, w_desc_, w_addr, y_desc_.get(), y_addr, hy_desc_, hy_addr, cy_desc_, cy_addr, workspace_addr, workspace_size_list_[0], reserved_addr, reserved_size_), @@ -88,6 +89,7 @@ class LstmGpuKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); @@ -108,33 +110,41 @@ class LstmGpuKernel : public GpuKernel { cudnnRNNAlgo_t algo = CUDNN_RNN_ALGO_STANDARD; CreateTensorDescGrp(); int hx_dims[3]{num_layers_ * (bidirectional_ ? 
2 : 1), batch_size_, hidden_size_}; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(hx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensorNdDescriptorEx(hx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), "set hx_desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(cx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensorNdDescriptorEx(cx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), "set cx_desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(hy_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensorNdDescriptorEx(hy_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), "set hy_desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(cy_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensorNdDescriptorEx(cy_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), "set cy_desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, nullptr, 0, 0), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, nullptr, 0, 0), "set dropout_desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetRNNDescriptor(handle_, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetRNNDescriptor(handle_, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, input_mode, direction, rnn_mode, algo, cudnn_data_type_), "set rnn_desc failed"); cudnnRNNBiasMode_t bias_mode = has_bias_ ? CUDNN_RNN_DOUBLE_BIAS : CUDNN_RNN_NO_BIAS; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetRNNBiasMode(rnn_desc_, bias_mode), "set bias_mode failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetRNNBiasMode(rnn_desc_, bias_mode), "set bias_mode failed"); auto weight_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3); size_t weight_size = weight_shape[0] * weight_shape[1] * weight_shape[2] * sizeof(T); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetRNNParamsSize(handle_, rnn_desc_, x_desc_[0], &weight_size_, cudnn_data_type_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetRNNParamsSize(handle_, rnn_desc_, x_desc_[0], &weight_size_, cudnn_data_type_), "get weight_size_ failed"); if (weight_size != weight_size_) { MS_LOG(EXCEPTION) << "weight size: " << weight_size << " error, expect: " << weight_size_ << " ."; } int w_dims[3] = {SizeToInt(weight_size_ / 4), 1, 1}; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetFilterNdDescriptor(w_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, 3, w_dims), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetFilterNdDescriptor(w_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, 3, w_dims), "set w_desc failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnGetRNNTrainingReserveSize(handle_, rnn_desc_, seq_len_, x_desc_.get(), &reserved_size_), + kernel_node_, cudnnGetRNNTrainingReserveSize(handle_, rnn_desc_, seq_len_, x_desc_.get(), &reserved_size_), "get reserve size failed"); InitSizeLists(); return true; @@ -147,47 +157,51 @@ class LstmGpuKernel : public GpuKernel { y_desc_ = std::make_unique(seq_len_); for (size_t i = 0; i < IntToSize(seq_len_); ++i) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_[i]), "create x_desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_[i]), "create 
x_desc failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensorNdDescriptorEx(x_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, x_dims), "set x_desc failed"); + kernel_node_, cudnnSetTensorNdDescriptorEx(x_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, x_dims), + "set x_desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&y_desc_[i]), "create y_desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&y_desc_[i]), "create y_desc failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensorNdDescriptorEx(y_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, y_dims), "set y_desc failed"); + kernel_node_, cudnnSetTensorNdDescriptorEx(y_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, y_dims), + "set y_desc failed"); } } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cy_desc_), "destroy cy_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hy_desc_), "destroy hy_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyDropoutDescriptor(dropout_desc_), + "destroy dropout_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(cy_desc_), "destroy cy_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(hy_desc_), "destroy hy_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc failed"); for (size_t i = 0; i < IntToSize(seq_len_); ++i) { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_[i]), "destroy y_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(y_desc_[i]), "destroy y_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed"); } } protected: void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&hx_desc_), "create hx_desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&cx_desc_), "create cx_desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateFilterDescriptor(&w_desc_), "create w_desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&hy_desc_), "create hy_desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&cy_desc_), "create cy_desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateDropoutDescriptor(&dropout_desc_), "create dropout_desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateRNNDescriptor(&rnn_desc_), "create rnn_desc failed"); + 
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&hx_desc_), "create hx_desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&cx_desc_), "create cx_desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateFilterDescriptor(&w_desc_), "create w_desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&hy_desc_), "create hy_desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&cy_desc_), "create cy_desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateDropoutDescriptor(&dropout_desc_), + "create dropout_desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateRNNDescriptor(&rnn_desc_), "create rnn_desc failed"); } void InitSizeLists() override { size_t x_size = IntToSize(seq_len_ * batch_size_ * input_size_) * sizeof(T); size_t h_size = 0; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(hx_desc_, &h_size), "get h size failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(hx_desc_, &h_size), "get h size failed"); input_size_list_.push_back(x_size); input_size_list_.push_back(h_size); @@ -200,11 +214,13 @@ class LstmGpuKernel : public GpuKernel { output_size_list_.push_back(h_size); output_size_list_.push_back(reserved_size_); size_t state_size = 0; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnDropoutGetStatesSize(handle_, &state_size), "get dropout states size failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnDropoutGetStatesSize(handle_, &state_size), + "get dropout states size failed"); output_size_list_.push_back(state_size); size_t workspace_size = 0; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetRNNWorkspaceSize(handle_, rnn_desc_, seq_len_, x_desc_.get(), &workspace_size), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetRNNWorkspaceSize(handle_, rnn_desc_, seq_len_, x_desc_.get(), &workspace_size), "get workspace size failed"); workspace_size_list_.push_back(workspace_size); } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_data_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_data_gpu_kernel.h index dee8d56cd7..f5a007c2cd 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_data_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_data_gpu_kernel.h @@ -79,19 +79,21 @@ class LstmGradDataGpuKernel : public GpuKernel { if (!states_init_) { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnRestoreDropoutDescriptor(dropout_desc_, handle_, dropout_, states_addr, input_size_list_[8], 0), "restore dropout state failed"); states_init_ = true; } CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnRNNBackwardData(handle_, rnn_desc_, seq_len_, y_desc_.get(), y_addr, dy_desc_.get(), dy_addr, dhy_desc_, dhy_addr, dcy_desc_, dcy_addr, w_desc_, w_addr, hx_desc_, hx_addr, cx_desc_, cx_addr, dx_desc_.get(), dx_addr, dhx_desc_, dhx_addr, dcx_desc_, dcx_addr, workspace_addr, workspace_size_list_[0], reserved_addr, reserved_size_), "launch lstm back data kernel failed"); - CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream_ptr)), + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream_ptr)), "stream synchronize failed."); return true; } @@ -104,6 +106,7 @@ class LstmGradDataGpuKernel : public GpuKernel { dropout_ = GetAttr<float>(kernel_node, "dropout"); } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); cudnn_data_type_ = 
GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); auto input_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); @@ -117,71 +120,84 @@ class LstmGradDataGpuKernel : public GpuKernel { CreateTensorDescGrp(); int hx_dims[3]{num_layers_ * (bidirectional_ ? 2 : 1), batch_size_, hidden_size_}; CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensorNdDescriptorEx(dhy_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), "set dhy_desc_ failed"); + kernel_node_, cudnnSetTensorNdDescriptorEx(dhy_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), + "set dhy_desc_ failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensorNdDescriptorEx(dcy_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), "set dcy_desc_ failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(hx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), + kernel_node_, cudnnSetTensorNdDescriptorEx(dcy_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), + "set dcy_desc_ failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensorNdDescriptorEx(hx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), "set hx_desc_ failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(cx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensorNdDescriptorEx(cx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), "set cx_desc_ failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensorNdDescriptorEx(dhx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), "set dhx_desc_ failed"); + kernel_node_, cudnnSetTensorNdDescriptorEx(dhx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), + "set dhx_desc_ failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensorNdDescriptorEx(dcx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), "set dcx_desc_ failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, nullptr, 0, 0), + kernel_node_, cudnnSetTensorNdDescriptorEx(dcx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), + "set dcx_desc_ failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, nullptr, 0, 0), "set dropout_desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetRNNDescriptor(handle_, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetRNNDescriptor(handle_, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, input_mode, direction, rnn_mode, algo, cudnn_data_type_), "set rnn_desc failed"); cudnnRNNBiasMode_t bias_mode = has_bias_ ? 
CUDNN_RNN_DOUBLE_BIAS : CUDNN_RNN_NO_BIAS; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetRNNBiasMode(rnn_desc_, bias_mode), "set bias_mode failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetRNNBiasMode(rnn_desc_, bias_mode), "set bias_mode failed"); auto weight_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4); size_t weight_size = weight_shape[0] * weight_shape[1] * weight_shape[2] * sizeof(T); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetRNNParamsSize(handle_, rnn_desc_, dx_desc_[0], &weight_size_, cudnn_data_type_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetRNNParamsSize(handle_, rnn_desc_, dx_desc_[0], &weight_size_, cudnn_data_type_), "get weight_size_ failed"); if (weight_size != weight_size_) { MS_LOG(EXCEPTION) << "weight size: " << weight_size << " error, expect: " << weight_size_ << " ."; } int w_dims[3] = {SizeToInt(weight_size_ / 4), 1, 1}; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetFilterNdDescriptor(w_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, 3, w_dims), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetFilterNdDescriptor(w_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, 3, w_dims), "set w_desc failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnGetRNNTrainingReserveSize(handle_, rnn_desc_, seq_len_, dx_desc_.get(), &reserved_size_), "get size failed"); + kernel_node_, cudnnGetRNNTrainingReserveSize(handle_, rnn_desc_, seq_len_, dx_desc_.get(), &reserved_size_), + "get size failed"); InitSizeLists(); return true; } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dcx_desc_), "destroy dcx_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dhx_desc_), "destroy dhx_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dcy_desc_), "destroy dcy_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dhy_desc_), "destroy dhy_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyDropoutDescriptor(dropout_desc_), + "destroy dropout_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dcx_desc_), "destroy dcx_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dhx_desc_), "destroy dhx_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dcy_desc_), "destroy dcy_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dhy_desc_), "destroy dhy_desc_ failed"); DestroyTensorDescGrp(); } protected: void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - 
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dhy_desc_), "create dhy_desc_ failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dcy_desc_), "create dcy_desc_ failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&hx_desc_), "create hx_desc_ failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&cx_desc_), "create cx_desc_ failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateFilterDescriptor(&w_desc_), "create w_desc_ failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dhx_desc_), "create dhx_desc_ failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dcx_desc_), "create dcx_desc_ failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateDropoutDescriptor(&dropout_desc_), "create dropout_desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateRNNDescriptor(&rnn_desc_), "create rnn_desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dhy_desc_), "create dhy_desc_ failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dcy_desc_), "create dcy_desc_ failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&hx_desc_), "create hx_desc_ failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&cx_desc_), "create cx_desc_ failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateFilterDescriptor(&w_desc_), "create w_desc_ failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dhx_desc_), "create dhx_desc_ failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dcx_desc_), "create dcx_desc_ failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateDropoutDescriptor(&dropout_desc_), + "create dropout_desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateRNNDescriptor(&rnn_desc_), "create rnn_desc failed"); } void InitSizeLists() override { size_t y_size = IntToSize(seq_len_ * batch_size_ * hidden_size_ * (bidirectional_ ? 
2 : 1)) * sizeof(T); size_t h_size = 0; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(hx_desc_, &h_size), "get h size failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(hx_desc_, &h_size), "get h size failed"); input_size_list_.push_back(y_size); input_size_list_.push_back(y_size); @@ -192,7 +208,8 @@ class LstmGradDataGpuKernel : public GpuKernel { input_size_list_.push_back(h_size); input_size_list_.push_back(reserved_size_); size_t state_size = 0; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnDropoutGetStatesSize(handle_, &state_size), "get dropout states size failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnDropoutGetStatesSize(handle_, &state_size), + "get dropout states size failed"); input_size_list_.push_back(state_size); size_t x_size = IntToSize(seq_len_ * batch_size_ * input_size_) * sizeof(T); @@ -201,7 +218,8 @@ class LstmGradDataGpuKernel : public GpuKernel { output_size_list_.push_back(h_size); size_t workspace_size = 0; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetRNNWorkspaceSize(handle_, rnn_desc_, seq_len_, dx_desc_.get(), &workspace_size), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetRNNWorkspaceSize(handle_, rnn_desc_, seq_len_, dx_desc_.get(), &workspace_size), "get workspace size failed"); workspace_size_list_.push_back(workspace_size); } @@ -216,27 +234,28 @@ class LstmGradDataGpuKernel : public GpuKernel { dy_desc_ = std::make_unique(seq_len_); for (size_t i = 0; i < IntToSize(seq_len_); ++i) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dx_desc_[i]), "create x_desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dx_desc_[i]), "create x_desc failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensorNdDescriptorEx(dx_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, x_dims), + kernel_node_, cudnnSetTensorNdDescriptorEx(dx_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, x_dims), "set dx_desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&y_desc_[i]), "create y_desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&y_desc_[i]), "create y_desc failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensorNdDescriptorEx(y_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, y_dims), "set y_desc failed"); + kernel_node_, cudnnSetTensorNdDescriptorEx(y_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, y_dims), + "set y_desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dy_desc_[i]), "create dy_desc_ failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dy_desc_[i]), "create dy_desc_ failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensorNdDescriptorEx(dy_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, y_dims), + kernel_node_, cudnnSetTensorNdDescriptorEx(dy_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, y_dims), "set dy_desc_ failed"); } } void DestroyTensorDescGrp() { for (size_t i = 0; i < IntToSize(seq_len_); ++i) { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_[i]), "destroy dy_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_[i]), "destroy y_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_[i]), "destroy x_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dy_desc_[i]), "destroy dy_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(y_desc_[i]), "destroy y_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, 
cudnnDestroyTensorDescriptor(dx_desc_[i]), "destroy x_desc failed"); } } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_weight_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_weight_gpu_kernel.h index 81f620c1ad..bcb3b2365b 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_weight_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_weight_gpu_kernel.h @@ -66,15 +66,18 @@ class LstmGradWeightGpuKernel : public GpuKernel { if (!states_init_) { CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnRestoreDropoutDescriptor(dropout_desc_, handle_, dropout_, states_addr, input_size_list_[4], 0), "restore dropout state failed"); states_init_ = true; } CHECK_CUDA_RET_WITH_EXCEPT( - cudaMemsetAsync(dw_addr, 0, outputs[0]->size, reinterpret_cast(stream_ptr)), "cudaMemSet Failed"); + kernel_node_, cudaMemsetAsync(dw_addr, 0, outputs[0]->size, reinterpret_cast(stream_ptr)), + "cudaMemSet Failed"); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnRNNBackwardWeights(handle_, rnn_desc_, seq_len_, x_desc_.get(), x_addr, hx_desc_, hx_addr, y_desc_.get(), y_addr, workspace_addr, workspace_size_list_[0], dw_desc_, dw_addr, reserved_addr, reserved_size_), @@ -83,6 +86,7 @@ class LstmGradWeightGpuKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); @@ -104,29 +108,34 @@ class LstmGradWeightGpuKernel : public GpuKernel { CreateTensorDescGrp(); int hx_dims[3]{num_layers_ * (bidirectional_ ? 2 : 1), batch_size_, hidden_size_}; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(hx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensorNdDescriptorEx(hx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), "set hx_desc_ failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, nullptr, 0, 0), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, nullptr, 0, 0), "set dropout_desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetRNNDescriptor(handle_, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetRNNDescriptor(handle_, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, input_mode, direction, rnn_mode, algo, cudnn_data_type_), "set rnn_desc failed"); cudnnRNNBiasMode_t bias_mode = has_bias_ ? 
CUDNN_RNN_DOUBLE_BIAS : CUDNN_RNN_NO_BIAS; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetRNNBiasMode(rnn_desc_, bias_mode), "set bias_mode failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetRNNBiasMode(rnn_desc_, bias_mode), "set bias_mode failed"); auto weight_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); size_t weight_size = weight_shape[0] * weight_shape[1] * weight_shape[2] * sizeof(T); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetRNNParamsSize(handle_, rnn_desc_, x_desc_[0], &weight_size_, cudnn_data_type_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetRNNParamsSize(handle_, rnn_desc_, x_desc_[0], &weight_size_, cudnn_data_type_), "get weight_size_ failed"); if (weight_size != weight_size_) { MS_LOG(EXCEPTION) << "weight size: " << weight_size << " error, expect: " << weight_size_ << " ."; } int w_dims[3] = {SizeToInt(weight_size_ / 4), 1, 1}; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetFilterNdDescriptor(dw_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, 3, w_dims), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetFilterNdDescriptor(dw_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, 3, w_dims), "set dw_desc failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnGetRNNTrainingReserveSize(handle_, rnn_desc_, seq_len_, x_desc_.get(), &reserved_size_), + kernel_node_, cudnnGetRNNTrainingReserveSize(handle_, rnn_desc_, seq_len_, x_desc_.get(), &reserved_size_), "get reserve size failed"); InitSizeLists(); return true; @@ -135,16 +144,17 @@ class LstmGradWeightGpuKernel : public GpuKernel { protected: void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&hx_desc_), "create hx_desc_ failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateFilterDescriptor(&dw_desc_), "create dw_desc_ failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateDropoutDescriptor(&dropout_desc_), "create dropout_desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateRNNDescriptor(&rnn_desc_), "create rnn_desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&hx_desc_), "create hx_desc_ failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateFilterDescriptor(&dw_desc_), "create dw_desc_ failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateDropoutDescriptor(&dropout_desc_), + "create dropout_desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateRNNDescriptor(&rnn_desc_), "create rnn_desc failed"); } void InitSizeLists() override { size_t x_size = IntToSize(seq_len_ * batch_size_ * input_size_) * sizeof(T); size_t h_size = 0; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(hx_desc_, &h_size), "get h size failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(hx_desc_, &h_size), "get h size failed"); size_t y_size = IntToSize(seq_len_ * batch_size_ * hidden_size_ * (bidirectional_ ? 
2 : 1)) * sizeof(T); input_size_list_.push_back(x_size); @@ -152,21 +162,24 @@ class LstmGradWeightGpuKernel : public GpuKernel { input_size_list_.push_back(y_size); input_size_list_.push_back(reserved_size_); size_t state_size = 0; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnDropoutGetStatesSize(handle_, &state_size), "get dropout states size failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnDropoutGetStatesSize(handle_, &state_size), + "get dropout states size failed"); input_size_list_.push_back(state_size); output_size_list_.push_back(weight_size_); size_t workspace_size = 0; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetRNNWorkspaceSize(handle_, rnn_desc_, seq_len_, x_desc_.get(), &workspace_size), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnGetRNNWorkspaceSize(handle_, rnn_desc_, seq_len_, x_desc_.get(), &workspace_size), "get workspace size failed"); workspace_size_list_.push_back(workspace_size); } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(dw_desc_), "destroy dw_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyDropoutDescriptor(dropout_desc_), + "destroy dropout_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyFilterDescriptor(dw_desc_), "destroy dw_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed"); DestroyTensorDescGrp(); } @@ -179,19 +192,21 @@ class LstmGradWeightGpuKernel : public GpuKernel { y_desc_ = std::make_unique(seq_len_); for (size_t i = 0; i < IntToSize(seq_len_); ++i) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_[i]), "create x_desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_[i]), "create x_desc failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensorNdDescriptorEx(x_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, x_dims), "set x_desc failed"); + kernel_node_, cudnnSetTensorNdDescriptorEx(x_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, x_dims), + "set x_desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&y_desc_[i]), "create y_desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&y_desc_[i]), "create y_desc failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensorNdDescriptorEx(y_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, y_dims), "set y_desc failed"); + kernel_node_, cudnnSetTensorNdDescriptorEx(y_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, y_dims), + "set y_desc failed"); } } void DestroyTensorDescGrp() { for (size_t i = 0; i < IntToSize(seq_len_); ++i) { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_[i]), "destroy y_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(y_desc_[i]), "destroy y_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed"); } } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_gpu_kernel.h 
b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_gpu_kernel.h index 6b3791a5ba..caeecafce4 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_gpu_kernel.h @@ -66,12 +66,14 @@ class PoolingGpuFwdKernel : public GpuKernel { const float alpha = 1; const float beta = 0; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnPoolingForward(cudnn_handle_, pooling_descriptor_, &alpha, input_descriptor_, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnPoolingForward(cudnn_handle_, pooling_descriptor_, &alpha, input_descriptor_, input_addr, &beta, output_descriptor_, output_addr), "cudnnPoolingForward failed"); return true; } bool Init(const CNodePtr &kernel_node) { + kernel_node_ = kernel_node; InitResource(); if (!CheckParam(kernel_node)) { return false; @@ -102,10 +104,10 @@ class PoolingGpuFwdKernel : public GpuKernel { SetDimA(output_shape, dimAout, 4, data_format_); SetStrideA(output_shape, strideAout, 4, data_format_); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensorNdDescriptor(input_descriptor_, cudnn_data_type_, nbDims, dimA, strideAin), + kernel_node_, cudnnSetTensorNdDescriptor(input_descriptor_, cudnn_data_type_, nbDims, dimA, strideAin), "cudnnSetTensor4dDescriptor failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensorNdDescriptor(output_descriptor_, cudnn_data_type_, nbDims, dimAout, strideAout), + kernel_node_, cudnnSetTensorNdDescriptor(output_descriptor_, cudnn_data_type_, nbDims, dimAout, strideAout), "cudnnSetTensor4dDescriptor failed"); SetPoolingMode(kernel_node); SetPad(kernel_node); @@ -114,27 +116,31 @@ class PoolingGpuFwdKernel : public GpuKernel { } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyPoolingDescriptor(pooling_descriptor_), "cudnnDestroyPoolingDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_descriptor_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(output_descriptor_), + "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(input_descriptor_), + "cudnnDestroyTensorDescriptor failed"); } protected: void InitResource() { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&input_descriptor_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&output_descriptor_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreatePoolingDescriptor(&pooling_descriptor_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&input_descriptor_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&output_descriptor_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreatePoolingDescriptor(&pooling_descriptor_), "cudnnCreatePoolingDescriptor failed"); } void InitSizeLists() { if (!is_null_input_) { CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnGetTensorSizeInBytes(input_descriptor_, reinterpret_cast(&input_size_)), + kernel_node_, cudnnGetTensorSizeInBytes(input_descriptor_, reinterpret_cast(&input_size_)), 
"cudnnGetTensorSizeInBytes failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnGetTensorSizeInBytes(output_descriptor_, reinterpret_cast(&output_size_)), + kernel_node_, cudnnGetTensorSizeInBytes(output_descriptor_, reinterpret_cast(&output_size_)), "cudnnGetTensorSizeInBytes failed"); } input_size_list_.push_back(input_size_); @@ -199,7 +205,8 @@ class PoolingGpuFwdKernel : public GpuKernel { pad_height_ = 0; pad_width_ = 0; } - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetPoolingNdDescriptor(pooling_descriptor_, pooling_mode_, CUDNN_NOT_PROPAGATE_NAN, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetPoolingNdDescriptor(pooling_descriptor_, pooling_mode_, CUDNN_NOT_PROPAGATE_NAN, 2, windowDimA, paddingA, strideA), "cudnnSetPoolingNdDescriptor failed"); } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_grad_gpu_kernel.h index 49a4142943..c599a2bee1 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_grad_gpu_kernel.h @@ -71,6 +71,7 @@ class PoolingGradGpuKernel : public GpuKernel { const float alpha = 1; const float beta = 0; CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnPoolingBackward(cudnn_handle_, pooling_descriptor_, &alpha, y_descriptor_, y, dy_descriptor_, dy, x_descriptor_, x_data, &beta, dx_descriptor_, dx), "cudnnPoolingBackward failed"); @@ -108,6 +109,7 @@ class PoolingGradGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); if (!CheckParam(kernel_node)) { return false; @@ -124,14 +126,17 @@ class PoolingGradGpuKernel : public GpuKernel { if (!InitShape(kernel_node, dimA, strideAin, dimAy, strideAiny, dimAdy, strideAdy, dimAout, strideAout)) { return true; } - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(y_descriptor_, cudnn_data_type_, nbDims, dimAy, strideAiny), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensorNdDescriptor(y_descriptor_, cudnn_data_type_, nbDims, dimAy, strideAiny), "cudnnSetTensor4dDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(dy_descriptor_, cudnn_data_type_, nbDims, dimAdy, strideAdy), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensorNdDescriptor(dy_descriptor_, cudnn_data_type_, nbDims, dimAdy, strideAdy), "cudnnSetTensor4dDescriptor failed"); CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetTensorNdDescriptor(dx_descriptor_, cudnn_data_type_, nbDims, dimAout, strideAout), + kernel_node_, cudnnSetTensorNdDescriptor(dx_descriptor_, cudnn_data_type_, nbDims, dimAout, strideAout), "cudnnSetTensor4dDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(x_descriptor_, cudnn_data_type_, nbDims, dimA, strideAin), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensorNdDescriptor(x_descriptor_, cudnn_data_type_, nbDims, dimA, strideAin), "cudnnSetTensor4dDescriptor failed"); SetPoolingMode(kernel_node); SetPad(kernel_node); @@ -140,41 +145,49 @@ class PoolingGradGpuKernel : public GpuKernel { } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyPoolingDescriptor(pooling_descriptor_), "cudnnDestroyPoolingDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_descriptor_), "cudnnDestroyTensorDescriptor failed"); - 
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_descriptor_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_descriptor_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_descriptor_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dx_descriptor_), + "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_descriptor_), + "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dy_descriptor_), + "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(y_descriptor_), + "cudnnDestroyTensorDescriptor failed"); } protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&y_descriptor_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dy_descriptor_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_descriptor_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dx_descriptor_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreatePoolingDescriptor(&pooling_descriptor_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&y_descriptor_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dy_descriptor_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_descriptor_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dx_descriptor_), + "cudnnCreateTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreatePoolingDescriptor(&pooling_descriptor_), "cudnnCreatePoolingDescriptor failed"); } void InitSizeLists() override { if (!is_null_input_) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(y_descriptor_, &input_size_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(y_descriptor_, &input_size_), "cudnnGetTensorSizeInBytes failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(dx_descriptor_, &output_size_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(dx_descriptor_, &output_size_), "cudnnGetTensorSizeInBytes failed"); } input_size_list_.push_back(input_size_); output_size_list_.push_back(output_size_); if (!is_null_input_) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(dy_descriptor_, &input_size_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(dy_descriptor_, &input_size_), "cudnnGetTensorSizeInBytes failed"); } input_size_list_.push_back(input_size_); if (!is_null_input_) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(x_descriptor_, &input_size_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(x_descriptor_, &input_size_), "cudnnGetTensorSizeInBytes failed"); } input_size_list_.push_back(input_size_); @@ -234,7 +247,8 @@ class PoolingGradGpuKernel : public GpuKernel { pad_width_ = 0; } } - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetPoolingNdDescriptor(pooling_descriptor_, pooling_mode_, 
CUDNN_NOT_PROPAGATE_NAN, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetPoolingNdDescriptor(pooling_descriptor_, pooling_mode_, CUDNN_NOT_PROPAGATE_NAN, 2, windowDimA, paddingA, strideA), "cudnnSetPoolingNdDescriptor failed"); } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_cross_entropy_with_logits_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_cross_entropy_with_logits_gpu_kernel.h index f9246022f4..472dc5a8b0 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_cross_entropy_with_logits_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_cross_entropy_with_logits_gpu_kernel.h @@ -65,6 +65,7 @@ class SoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { const float alpha = 1; const float beta = 0; CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSoftmaxForward(cudnn_handle_, algo_, mode_, &alpha, logits_descriptor_, logits_addr, &beta, softmax_output_descriptor_, softmax_output_logits), "cudnnSoftmaxForward failed."); @@ -74,6 +75,7 @@ class SoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 2) { @@ -90,10 +92,12 @@ class SoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); InferInputOutputSize(kernel_node); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(logits_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensor4dDescriptor(logits_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, batch_size_, channel_size_, height_, width_), "cudnnSetTensor4dDescriptor failed."); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(softmax_output_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, batch_size_, channel_size_, height_, width_), "cudnnSetTensor4dDescriptor failed."); @@ -102,18 +106,18 @@ class SoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(softmax_output_descriptor_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(softmax_output_descriptor_), "cudnnDestroyTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(logits_descriptor_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(logits_descriptor_), "cudnnDestroyTensorDescriptor failed."); } protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&logits_descriptor_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&logits_descriptor_), "cudnnCreateTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&softmax_output_descriptor_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&softmax_output_descriptor_), "cudnnCreateTensorDescriptor failed."); } void InitSizeLists() override { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_gpu_kernel.h index 597cc49352..6fb8a55179 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_gpu_kernel.h +++ 
b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_gpu_kernel.h @@ -63,7 +63,8 @@ class SoftmaxGpuKernel : public GpuKernel { const float beta = 0; if (axis_ == 1) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSoftmaxForward(cudnn_handle_, algo_, mode_, &alpha, input_descriptor_, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSoftmaxForward(cudnn_handle_, algo_, mode_, &alpha, input_descriptor_, input_addr, &beta, output_descriptor_, output_addr), "cudnnSoftmaxForward failed"); } else { @@ -72,19 +73,23 @@ class SoftmaxGpuKernel : public GpuKernel { size_t *input_shape = GetDeviceAddress(workspace, 2); size_t *transpose_shape = GetDeviceAddress(workspace, 3); size_t *transpose_axis = GetDeviceAddress(workspace, 4); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(input_shape, &input_shape_[0], workspace_size_, cudaMemcpyHostToDevice, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(input_shape, &input_shape_[0], workspace_size_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cudaMemcpyAsync input_shape failed"); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(transpose_shape, &transpose_shape_[0], workspace_size_, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(transpose_shape, &transpose_shape_[0], workspace_size_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cudaMemcpyAsync input_shape failed"); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(transpose_axis, &transpose_axis_[0], workspace_size_, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(transpose_axis, &transpose_axis_[0], workspace_size_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cudaMemcpyAsync input_axis failed"); size_t size = input_size_ / sizeof(T); CalTranspose(size, input_addr, input_shape, transpose_axis, shape_size_, transpose_input_addr, reinterpret_cast(stream_ptr)); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSoftmaxForward(cudnn_handle_, algo_, mode_, &alpha, input_descriptor_, transpose_input_addr, &beta, output_descriptor_, transpose_output_addr), "cudnnSoftmaxForward failed"); @@ -95,6 +100,7 @@ class SoftmaxGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); @@ -129,10 +135,12 @@ class SoftmaxGpuKernel : public GpuKernel { InitSizeByAxis(input_shape, axis[0]); } CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(input_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, SizeToInt(batch_size_), SizeToInt(channel_size_), SizeToInt(height_), SizeToInt(width_)), "set input_descriptor failed"); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(output_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, SizeToInt(batch_size_), SizeToInt(channel_size_), SizeToInt(height_), SizeToInt(width_)), "set output_descriptor failed"); @@ -141,15 +149,19 @@ class SoftmaxGpuKernel : public GpuKernel { } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_descriptor_), "destroy output_descriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "destroy input_descriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(output_descriptor_), + "destroy output_descriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(input_descriptor_), + 
"destroy input_descriptor failed"); } protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&input_descriptor_), "create input_descriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&output_descriptor_), "create output_descriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&input_descriptor_), + "create input_descriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&output_descriptor_), + "create output_descriptor failed"); } void InitSizeLists() override { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_grad_gpu_kernel.h index 1ea5f5fa79..0900a30833 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_grad_gpu_kernel.h @@ -70,17 +70,21 @@ class SoftmaxGradGpuKernel : public GpuKernel { const float beta = 0; if (axis_ == 1) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSoftmaxBackward(cudnn_handle_, algo_, mode_, &alpha, y_desc_, y_addr, y_desc_, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSoftmaxBackward(cudnn_handle_, algo_, mode_, &alpha, y_desc_, y_addr, y_desc_, dy_addr, &beta, y_desc_, dx_addr), "cudnnSoftmaxBackward failed"); } else { - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(input_shape, &input_shape_[0], workspace_size_, cudaMemcpyHostToDevice, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(input_shape, &input_shape_[0], workspace_size_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cudaMemcpyAsync input_shape failed"); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(transpose_shape, &transpose_shape_[0], workspace_size_, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(transpose_shape, &transpose_shape_[0], workspace_size_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cudaMemcpyAsync input_shape failed"); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(transpose_axis, &transpose_axis_[0], workspace_size_, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(transpose_axis, &transpose_axis_[0], workspace_size_, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cudaMemcpyAsync input_axis failed"); size_t size = input_size_ / sizeof(T); @@ -88,7 +92,8 @@ class SoftmaxGradGpuKernel : public GpuKernel { reinterpret_cast(stream_ptr)); CalTranspose(size, dy_addr, input_shape, transpose_axis, shape_size_, transpose_dy_addr, reinterpret_cast(stream_ptr)); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSoftmaxBackward(cudnn_handle_, algo_, mode_, &alpha, y_desc_, transpose_y_addr, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSoftmaxBackward(cudnn_handle_, algo_, mode_, &alpha, y_desc_, transpose_y_addr, y_desc_, transpose_dy_addr, &beta, y_desc_, transpose_dx_addr), "cudnnSoftmaxBackward failed"); CalTranspose(size, transpose_dx_addr, transpose_shape, transpose_axis, shape_size_, dx_addr, @@ -98,6 +103,7 @@ class SoftmaxGradGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); @@ -135,6 +141,7 @@ class SoftmaxGradGpuKernel : public GpuKernel { InitSizeByAxis(input_shape, axis[0]); } 
CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(y_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, SizeToInt(batch_size_), SizeToInt(channel_size_), SizeToInt(height_), SizeToInt(width_)), "set input_descriptor failed"); @@ -143,13 +150,13 @@ class SoftmaxGradGpuKernel : public GpuKernel { } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "destroy output_descriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(y_desc_), "destroy output_descriptor failed"); } protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&y_desc_), "create input_descriptor failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&y_desc_), "create input_descriptor failed"); } void InitSizeLists() override { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h index 74b3cbe24b..3e2410ad14 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h @@ -64,6 +64,7 @@ class SparseSoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { const float alpha = 1; const float beta = 0; CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSoftmaxForward(cudnn_handle_, algo_, mode_, &alpha, logits_descriptor_, logits_addr, &beta, softmax_output_descriptor_, softmax_output_logits), "cudnnSoftmaxForward failed."); @@ -75,6 +76,7 @@ class SparseSoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 2) { @@ -92,10 +94,12 @@ class SparseSoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); InferInputOutputSize(kernel_node); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(logits_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnSetTensor4dDescriptor(logits_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, batch_size_, channel_size_, height_, width_), "cudnnSetTensor4dDescriptor failed."); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(softmax_output_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, batch_size_, channel_size_, height_, width_), "cudnnSetTensor4dDescriptor failed."); @@ -104,18 +108,18 @@ class SparseSoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(softmax_output_descriptor_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(softmax_output_descriptor_), "cudnnDestroyTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(logits_descriptor_), + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(logits_descriptor_), "cudnnDestroyTensorDescriptor failed."); } protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - 
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&logits_descriptor_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&logits_descriptor_), "cudnnCreateTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&softmax_output_descriptor_), + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&softmax_output_descriptor_), "cudnnCreateTensorDescriptor failed."); } void InitSizeLists() override { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/other/assign_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/other/assign_gpu_kernel.h index 2be341f50a..1333513e29 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/other/assign_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/other/assign_gpu_kernel.h @@ -38,15 +38,18 @@ class AssignGpuKernel : public GpuKernel { T *value = GetDeviceAddress<T>(inputs, 1); T *output = GetDeviceAddress<T>(outputs, 0); CHECK_CUDA_RET_WITH_EXCEPT( + kernel_node_, cudaMemcpyAsync(var, value, input_size_, cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)), "cudaMemcpyAsync failed."); CHECK_CUDA_RET_WITH_EXCEPT( + kernel_node_, cudaMemcpyAsync(output, value, input_size_, cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)), "cudaMemcpyAsync failed."); return true; } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; if (!CheckParam(kernel_node)) { return false; } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/other/gpu_convert_to_dynamic_shape_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/other/gpu_convert_to_dynamic_shape_gpu_kernel.h index 4ef06b67e8..eb06e33511 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/other/gpu_convert_to_dynamic_shape_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/other/gpu_convert_to_dynamic_shape_gpu_kernel.h @@ -41,7 +41,8 @@ class GpuConvertToDynamicShapeGpuKernel : public GpuKernel { T *output_device_address = GetDeviceAddress<T>(outputs, 0); cuda_stream_ptr_ = stream_ptr; - CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(output_device_address, input_device_address, input_size_ * sizeof(T), + CHECK_CUDA_RET_WITH_ERROR(kernel_node_, + cudaMemcpyAsync(output_device_address, input_device_address, input_size_ * sizeof(T), cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)), "Failed to copy gpu memory."); @@ -49,7 +50,7 @@ } void PostExecute() override { - CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(cuda_stream_ptr_)), + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(cuda_stream_ptr_)), "cudaStreamSynchronize failed"); std::vector<TypeId> output_types = {AnfAlgo::GetOutputInferDataType(c_node_ptr_, 0)}; @@ -58,6 +59,7 @@ } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; size_t input_count = AnfAlgo::GetInputTensorNum(kernel_node); if (input_count != 1) { MS_LOG(ERROR) << input_count << " inputs were provided, but GpuConvertToDynamicShapeGpuKernel expects 1."; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold2_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold2_grad_gpu_kernel.h index 6f060ddf4c..ded52d83b1 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold2_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold2_grad_gpu_kernel.h @@
-68,10 +68,12 @@ class BatchNormFold2GradGpuKernel : public GpuKernel { int32_t current_step_host[1]; size_t x_size = batch_size_ * channel_ * height_ * width_ * sizeof(T); - CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(current_step_host, global_step, sizeof(int32_t), cudaMemcpyDeviceToHost, + CHECK_CUDA_RET_WITH_ERROR(kernel_node_, + cudaMemcpyAsync(current_step_host, global_step, sizeof(int32_t), cudaMemcpyDeviceToHost, reinterpret_cast(stream_ptr)), "Failed to copy gpu memory."); CHECK_CUDA_RET_WITH_ERROR( + kernel_node_, cudaMemcpyAsync(d_x, dout, x_size, cudaMemcpyDeviceToDevice, reinterpret_cast(stream_ptr)), "Failed to copy gpu memory."); @@ -90,8 +92,8 @@ class BatchNormFold2GradGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; InitResource(); - size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 8) { MS_LOG(ERROR) << "Argument number is " << input_num << ", but BatchNormFold2GradGpuKernel needs 8."; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_gpu_kernel.h index ab79aebd02..680f34f30e 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_gpu_kernel.h @@ -58,7 +58,8 @@ class BatchNormFoldGpuKernel : public GpuKernel { auto variance = GetDeviceAddress(inputs, 2); int *current_step = GetDeviceAddress(inputs, 3); int current_step_host[1]; - CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(current_step_host, current_step, sizeof(int), cudaMemcpyDeviceToHost, + CHECK_CUDA_RET_WITH_ERROR(kernel_node_, + cudaMemcpyAsync(current_step_host, current_step, sizeof(int), cudaMemcpyDeviceToHost, reinterpret_cast(stream_ptr)), "Copy gpu memoy failed."); if (x == nullptr) { @@ -83,21 +84,24 @@ class BatchNormFoldGpuKernel : public GpuKernel { auto running_std = GetDeviceAddress(outputs, 3); auto y = GetDeviceAddress(workspace, 0); - CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(running_mean, mean, output_size_, cudaMemcpyDeviceToDevice, + CHECK_CUDA_RET_WITH_ERROR(kernel_node_, + cudaMemcpyAsync(running_mean, mean, output_size_, cudaMemcpyDeviceToDevice, reinterpret_cast(stream_ptr)), "Failed to copy gpu memory."); - CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(running_std, variance, output_size_, cudaMemcpyDeviceToDevice, + CHECK_CUDA_RET_WITH_ERROR(kernel_node_, + cudaMemcpyAsync(running_std, variance, output_size_, cudaMemcpyDeviceToDevice, reinterpret_cast(stream_ptr)), "Failed to copy gpu memory."); CalUpdateRunningStd(channel_, epsilon_, running_std, reinterpret_cast(stream_ptr)); if (!is_training_ || current_step_host[0] >= freeze_bn_) { - CHECK_CUDA_RET_WITH_ERROR(cudaMemset(batch_mean, 0, output_size_), "Failed to set gpu memory."); + CHECK_CUDA_RET_WITH_ERROR(kernel_node_, cudaMemset(batch_mean, 0, output_size_), "Failed to set gpu memory."); ThrustFillWith(batch_std, channel_, 1.f, reinterpret_cast(stream_ptr)); return true; } const T alpha = 1; const T beta = 0; - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnBatchNormalizationForwardTraining( + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, + cudnnBatchNormalizationForwardTraining( handle_, mode_, &alpha, &beta, x_desc_, x, x_desc_, y, scale_bias_mean_var_desc_, mean, mean, exp_avg_factor_, mean, variance, epsilon_, batch_mean, batch_std), "Failed to launch kernel.") @@ -106,6 +110,7 @@ class BatchNormFoldGpuKernel : public GpuKernel { } bool Init(const CNodePtr 
&kernel_node) override { + kernel_node_ = kernel_node; InitResource(); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 4) { @@ -141,10 +146,12 @@ class BatchNormFoldGpuKernel : public GpuKernel { cudnnDataType_t cudnnDataType = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW, cudnnDataType, batch_, channel_, height_, width_), "Set x desc failed"); CHECK_CUDNN_RET_WITH_EXCEPT( + kernel_node_, cudnnSetTensor4dDescriptor(scale_bias_mean_var_desc_, CUDNN_TENSOR_NCHW, cudnnDataType, 1, channel_, 1, 1), "Set para desc failed"); @@ -153,8 +160,9 @@ class BatchNormFoldGpuKernel : public GpuKernel { } void DestroyResource() noexcept override { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), + "Destroy para desc failed"); } protected: @@ -177,8 +185,9 @@ class BatchNormFoldGpuKernel : public GpuKernel { void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&scale_bias_mean_var_desc_), "Create para desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed"); + CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&scale_bias_mean_var_desc_), + "Create para desc failed"); } private: diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_grad_gpu_kernel.h index 8dd19ff453..7b27f88635 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_grad_gpu_kernel.h @@ -56,7 +56,8 @@ class BatchNormFoldGradGpuKernel : public GpuKernel { T *batch_std = GetDeviceAddress(inputs, 4); int *current_step = GetDeviceAddress(inputs, 5); int current_step_host[1]; - CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(current_step_host, current_step, sizeof(int), cudaMemcpyDeviceToHost, + CHECK_CUDA_RET_WITH_ERROR(kernel_node_, + cudaMemcpyAsync(current_step_host, current_step, sizeof(int), cudaMemcpyDeviceToHost, reinterpret_cast(stream_ptr)), "Copy gpu memoy failed."); if (d_batch_mean == nullptr) { @@ -95,6 +96,7 @@ class BatchNormFoldGradGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 6) { MS_LOG(ERROR) << "Input number is " << input_num << ", but BatchNormFoldGrad GpuKernel OP needs 6 input."; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_quant_perchannel_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_quant_perchannel_gpu_kernel.cc index 261ab9ffad..8c3634ab62 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_quant_perchannel_gpu_kernel.cc +++ 
b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_quant_perchannel_gpu_kernel.cc @@ -42,6 +42,7 @@ const std::vector &FakeQuantPerChannelGpuKernel::GetOutputSizeList() con const std::vector &FakeQuantPerChannelGpuKernel::GetWorkspaceSizeList() const { return workspace_size_list_; } bool FakeQuantPerChannelGpuKernel::Init(const CNodePtr &kernel_node) { + kernel_node_ = kernel_node; size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 3) { MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but FakeQuant GpuKernel OP needs 3 input."; @@ -130,7 +131,8 @@ bool FakeQuantPerChannelGpuKernel::Launch(const std::vector &inputs, if (global_step_ >= quant_delay_) { CalFakeQuantize(input, output, input_min, input_max, nudge_min, nudge_max, scale, stream_ptr); } else { - CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(output, input, input_size_, cudaMemcpyDeviceToDevice, + CHECK_CUDA_RET_WITH_ERROR(kernel_node_, + cudaMemcpyAsync(output, input, input_size_, cudaMemcpyDeviceToDevice, reinterpret_cast(stream_ptr)), "Copy gpu memory failed."); } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_quant_perchannel_grad_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_quant_perchannel_grad_gpu_kernel.cc index b713184224..11bcfd55d0 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_quant_perchannel_grad_gpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_quant_perchannel_grad_gpu_kernel.cc @@ -39,6 +39,7 @@ const std::vector &FakeQuantPerChannelGradGpuKernel::GetWorkspaceSizeLis } bool FakeQuantPerChannelGradGpuKernel::Init(const CNodePtr &kernel_node) { + kernel_node_ = kernel_node; size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 4) { MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but FakeQuantGrad GpuKernel OP needs 4 output."; @@ -123,7 +124,8 @@ bool FakeQuantPerChannelGradGpuKernel::Launch(const std::vector &inp CalFakeQuantPerChannelGrad(input, gradient, output, total_size, num_channels_, nudge_min, nudge_max, reinterpret_cast(stream_ptr)); } else { - CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(output, gradient, input_size_, cudaMemcpyDeviceToDevice, + CHECK_CUDA_RET_WITH_ERROR(kernel_node_, + cudaMemcpyAsync(output, gradient, input_size_, cudaMemcpyDeviceToDevice, reinterpret_cast(stream_ptr)), "Copy gpu memory failed."); } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_quant_perlayer_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_quant_perlayer_gpu_kernel.cc index 6fbf6a9895..d2be5369cd 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_quant_perlayer_gpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_quant_perlayer_gpu_kernel.cc @@ -42,6 +42,7 @@ const std::vector &FakeQuantPerLayerGpuKernel::GetOutputSizeList() const const std::vector &FakeQuantPerLayerGpuKernel::GetWorkspaceSizeList() const { return workspace_size_list_; } bool FakeQuantPerLayerGpuKernel::Init(const CNodePtr &kernel_node) { + kernel_node_ = kernel_node; size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 3) { MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but FakeQuant GpuKernel OP needs 3 output."; @@ -122,7 +123,8 @@ bool FakeQuantPerLayerGpuKernel::Launch(const std::vector &inputs, c CalFakeQuantPerLayer(input, output, quant_num_, nudge_min, nudge_max, scale, reinterpret_cast(stream_ptr)); } else { - CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(output, input, 
input_size_, cudaMemcpyDeviceToDevice, + CHECK_CUDA_RET_WITH_ERROR(kernel_node_, + cudaMemcpyAsync(output, input, input_size_, cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)), "Copy gpu memory failed"); } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_quant_perlayer_grad_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_quant_perlayer_grad_gpu_kernel.cc index 0a81fa9c6a..820cf05fa3 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_quant_perlayer_grad_gpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_quant_perlayer_grad_gpu_kernel.cc @@ -38,6 +38,7 @@ const std::vector<size_t> &FakeQuantPerLayerGradGpuKernel::GetOutputSizeList() c const std::vector<size_t> &FakeQuantPerLayerGradGpuKernel::GetWorkspaceSizeList() const { return workspace_size_list_; } bool FakeQuantPerLayerGradGpuKernel::Init(const CNodePtr &kernel_node) { + kernel_node_ = kernel_node; size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 4) { MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but FakeQuantGrad GpuKernel OP needs 4 output."; @@ -120,7 +121,8 @@ bool FakeQuantPerLayerGradGpuKernel::Launch(const std::vector<AddressPtr> &input CalFakeQuantPerLayerGrad(input, gradient, output, quant_num_, nudge_min, nudge_max, reinterpret_cast<cudaStream_t>(stream_ptr)); } else { - CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(output, gradient, input_size_, cudaMemcpyDeviceToDevice, + CHECK_CUDA_RET_WITH_ERROR(kernel_node_, + cudaMemcpyAsync(output, gradient, input_size_, cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)), "Copy gpu memory failed"); } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/random/random_categorical_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/random/random_categorical_gpu_kernel.h index f4b5164b09..0bfe0dd682 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/random/random_categorical_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/random/random_categorical_gpu_kernel.h @@ -47,7 +47,8 @@ class RandomCategoricalGpuKernel : public GpuKernel { host_cdf[i] = GetDeviceAddress<double>(workspaces, i); } double **dev_cdf = GetDeviceAddress<double *>(workspaces, batch_size_); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(dev_cdf, // NOLINT + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(dev_cdf, // NOLINT host_cdf.get(), sizeof(double *) * batch_size_, cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)), "Random_categorical cudaMemcpyAsync dev_cdf failed"); @@ -68,12 +69,14 @@ for (int j = 0; j < num_samples_; j++) { host_1d_rand[j] = dist(rng); } - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(host_rand[i], // NOLINT + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(host_rand[i], // NOLINT host_1d_rand.get(), sizeof(double) * num_samples_, cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)), "Random_categorical cudaMemcpyAsync host_1d_rand failed"); } - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(dev_rand, // NOLINT + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(dev_rand, // NOLINT host_rand.get(), sizeof(double *) * batch_size_, cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)), "Random_categorical cudaMemcpyAsync dev_rand failed"); @@ -86,6 +89,7 @@ } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 3) { MS_LOG(ERROR) << "Input number is " << input_num
<< ", but RandomCategorical needs 3 inputs."; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/random/uniform_candidate_sampler_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/random/uniform_candidate_sampler_gpu_kernel.h index 429029e257..30aa5958c8 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/random/uniform_candidate_sampler_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/random/uniform_candidate_sampler_gpu_kernel.h @@ -47,7 +47,8 @@ class UniformCandidateSamplerGpuKernel : public GpuKernel { if (remove_accidental_hits_) { T *input = GetDeviceAddress(inputs, 0); array_input_ = std::vector(input_size_, 0); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(&array_input_[0], input, input_size_ * sizeof(T), + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(&array_input_[0], input, input_size_ * sizeof(T), cudaMemcpyDeviceToHost, reinterpret_cast(stream_ptr)), "cudaMemcpyAsync sampled_candidates failed"); for (const auto item : array_input_) { @@ -58,7 +59,8 @@ class UniformCandidateSamplerGpuKernel : public GpuKernel { float prob = Probability(); size_t sampled_candidates_size = num_sampled_ * sizeof(T); S value = ApproximateExpectedCount(prob, num_sampled_, counter); - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(sampled_candidates, &sampled_candidates_[0], sampled_candidates_size, + CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, + cudaMemcpyAsync(sampled_candidates, &sampled_candidates_[0], sampled_candidates_size, cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), "cudaMemcpyAsync sampled_candidates failed"); CalUniformCandidateSampler(static_cast(input_size_), num_sampled_, value, true_expected_count, @@ -67,6 +69,7 @@ class UniformCandidateSamplerGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { + kernel_node_ = kernel_node; size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 1) { MS_LOG(ERROR) << "Input number is " << input_num << ", but UniformCandidateSampler needs 1 input."; diff --git a/mindspore/ccsrc/runtime/device/gpu/blocking_queue.cc b/mindspore/ccsrc/runtime/device/gpu/blocking_queue.cc index 228bfe7b34..6cd18d223f 100644 --- a/mindspore/ccsrc/runtime/device/gpu/blocking_queue.cc +++ b/mindspore/ccsrc/runtime/device/gpu/blocking_queue.cc @@ -16,7 +16,7 @@ #include "runtime/device/gpu/blocking_queue.h" #include -#include "runtime/device/gpu/gpu_common.h" +#include "runtime/device/gpu/queue_common.h" #include "utils/ms_utils.h" namespace mindspore { diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_common.h b/mindspore/ccsrc/runtime/device/gpu/gpu_common.h index d1505ea55e..9ae4db2e78 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_common.h +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_common.h @@ -22,6 +22,7 @@ #include #include #include "utils/log_adapter.h" +#include "utils/trace_base.h" #include "include/curand.h" namespace mindspore { @@ -43,25 +44,34 @@ namespace gpu { } \ } -#define CHECK_CUDA_RET_WITH_ERROR(expression, message) \ - { \ - cudaError_t status = (expression); \ - if (status != cudaSuccess) { \ - MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " " \ - << cudaGetErrorString(status); \ - } \ +#define CHECK_CUDA_RET_WITH_ERROR(node, expression, message) \ + { \ + cudaError_t status = (expression); \ + if (status != cudaSuccess) { \ + MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " " << cudaGetErrorString(status) \ + << trace::DumpSourceLines(node); \ + } \ } -#define 
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_common.h b/mindspore/ccsrc/runtime/device/gpu/gpu_common.h
index d1505ea55e..9ae4db2e78 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_common.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_common.h
@@ -22,6 +22,7 @@
 #include
 #include
 #include "utils/log_adapter.h"
+#include "utils/trace_base.h"
 #include "include/curand.h"
 
 namespace mindspore {
@@ -43,25 +44,34 @@ namespace gpu {
     }                                                                                             \
   }
 
-#define CHECK_CUDA_RET_WITH_ERROR(expression, message)                                            \
-  {                                                                                               \
-    cudaError_t status = (expression);                                                            \
-    if (status != cudaSuccess) {                                                                  \
-      MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " "         \
-                    << cudaGetErrorString(status);                                                \
-    }                                                                                             \
-  }
+#define CHECK_CUDA_RET_WITH_ERROR(node, expression, message)                                                          \
+  {                                                                                                                   \
+    cudaError_t status = (expression);                                                                                \
+    if (status != cudaSuccess) {                                                                                      \
+      MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " " << cudaGetErrorString(status) \
+                    << trace::DumpSourceLines(node);                                                                  \
+    }                                                                                                                 \
+  }
 
-#define CHECK_CUDA_RET_WITH_EXCEPT(expression, message)                                           \
+#define CHECK_CUDA_RET_WITH_EXCEPT(node, expression, message)                                     \
   {                                                                                               \
     cudaError_t status = (expression);                                                            \
    if (status != cudaSuccess) {                                                                   \
      MS_LOG(EXCEPTION) << "CUDA Error: " << message << " | Error Number: " << status << " "       \
-                        << cudaGetErrorString(status);                                            \
+                        << cudaGetErrorString(status) << trace::DumpSourceLines(node);            \
     }                                                                                             \
   }
 
-#define CHECK_CUDNN_RET_WITH_EXCEPT(expression, message)                                          \
+#define CHECK_CUDNN_RET_WITH_EXCEPT(node, expression, message)                                    \
+  {                                                                                               \
+    cudnnStatus_t status = (expression);                                                          \
+    if (status != CUDNN_STATUS_SUCCESS) {                                                         \
+      MS_LOG(EXCEPTION) << "cuDNN Error: " << message << " | Error Number: " << status << " "     \
+                        << cudnnGetErrorString(status) << trace::DumpSourceLines(node);           \
+    }                                                                                             \
+  }
+
+#define CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(expression, message)                                  \
   {                                                                                               \
     cudnnStatus_t status = (expression);                                                          \
     if (status != CUDNN_STATUS_SUCCESS) {                                                         \
@@ -70,7 +80,7 @@ namespace gpu {
     }                                                                                             \
   }
 
-#define CHECK_CUDNN_RET_WITH_ERROR(expression, message)                                           \
+#define CHECK_CUDNN_RET_WITH_ERROR_NOTRACE(expression, message)                                   \
   {                                                                                               \
     cudnnStatus_t status = (expression);                                                          \
     if (status != CUDNN_STATUS_SUCCESS) {                                                         \
@@ -79,7 +89,16 @@ namespace gpu {
     }                                                                                             \
   }
 
-#define CHECK_CUBLAS_RET_WITH_EXCEPT(expression, message)                                         \
+#define CHECK_CUDNN_RET_WITH_ERROR(node, expression, message)                                     \
+  {                                                                                               \
+    cudnnStatus_t status = (expression);                                                          \
+    if (status != CUDNN_STATUS_SUCCESS) {                                                         \
+      MS_LOG(ERROR) << "cuDNN Error: " << message << " | Error Number: " << status << " "         \
+                    << cudnnGetErrorString(status) << trace::DumpSourceLines(node);               \
+    }                                                                                             \
+  }
+
+#define CHECK_CUBLAS_RET_WITH_EXCEPT_NOTRACE(expression, message)                                 \
   {                                                                                               \
     cublasStatus_t status = (expression);                                                         \
     if (status != CUBLAS_STATUS_SUCCESS) {                                                        \
@@ -87,6 +106,15 @@ namespace gpu {
     }                                                                                             \
   }
 
+#define CHECK_CUBLAS_RET_WITH_EXCEPT(node, expression, message)                                   \
+  {                                                                                               \
+    cublasStatus_t status = (expression);                                                         \
+    if (status != CUBLAS_STATUS_SUCCESS) {                                                        \
+      MS_LOG(EXCEPTION) << "cuBLAS Error: " << message << " | Error Number: " << status           \
+                        << trace::DumpSourceLines(node);                                          \
+    }                                                                                             \
+  }
+
 #define CHECK_CUBLAS_RET_WITH_ERROR(expression, message)                                          \
   {                                                                                               \
     cublasStatus_t status = (expression);                                                         \
@@ -95,7 +123,7 @@ namespace gpu {
     }                                                                                             \
   }
 
-#define CHECK_CUSOLVER_RET_WITH_EXCEPT(expression, message)                                       \
+#define CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(expression, message)                               \
   {                                                                                               \
     cusolverStatus_t status = (expression);                                                       \
     if (status != CUSOLVER_STATUS_SUCCESS) {                                                      \
@@ -103,6 +131,16 @@ namespace gpu {
     }                                                                                             \
   }
 
+#define CHECK_CUSOLVER_RET_WITH_EXCEPT(node, expression, message)                                 \
+  {                                                                                               \
+    cusolverStatus_t status = (expression);                                                       \
+    if (status != CUSOLVER_STATUS_SUCCESS) {                                                      \
+      MS_LOG(EXCEPTION) << "cusolver Error: " << message << " | Error Number: " << status         \
+                        << trace::DumpSourceLines(node);                                          \
+    }                                                                                             \
+  }
+
 #define CHECK_CUSOLVER_RET_WITH_ERROR(expression, message)                                        \
   {                                                                                               \
     cusolverStatus_t status = (expression);                                                       \
@@ -111,12 +149,12 @@ namespace gpu {
     }                                                                                             \
   }
 
-#define CHECK_NCCL_RET_WITH_EXCEPT(expression, message)                                           \
-  {                                                                                               \
-    int result = (expression);                                                                    \
-    if (result != ncclSuccess) {                                                                  \
-      MS_LOG(EXCEPTION) << "NCCL Error: " << message << " | Error Number: " << result;            \
-    }                                                                                             \
+#define CHECK_NCCL_RET_WITH_EXCEPT(node, expression, message)                                                         \
+  {                                                                                                                   \
+    int result = (expression);                                                                                        \
+    if (result != ncclSuccess) {                                                                                      \
+      MS_LOG(EXCEPTION) << "NCCL Error: " << message << " | Error Number: " << result << trace::DumpSourceLines(node); \
+    }                                                                                                                 \
   }
 
 #define VARIABLE_NOT_USED(var)                                                                    \
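Two conventions run through the macro family above: the _WITH_ERROR forms log and fall through, while the _WITH_EXCEPT forms raise via MS_LOG(EXCEPTION); and the new _NOTRACE suffix marks the variants that keep the old no-node signature for call sites without a graph node. The error/exception split matters because DestroyResource() is declared noexcept and must never throw during teardown. A reduced, self-contained sketch of that split (simplified macros, not the MindSpore ones):

#include <iostream>
#include <stdexcept>

// _WITH_ERROR style: log and continue (safe inside noexcept destructors).
#define CHECK_WITH_ERROR(ok, message) \
  { if (!(ok)) std::cerr << "Error: " << (message) << std::endl; }

// _WITH_EXCEPT style: throw, aborting the current launch.
#define CHECK_WITH_EXCEPT(ok, message) \
  { if (!(ok)) throw std::runtime_error(message); }

struct Resource {
  ~Resource() noexcept { CHECK_WITH_ERROR(false, "destroy failed"); }  // must not throw
};

int main() {
  try {
    Resource r;
    CHECK_WITH_EXCEPT(false, "launch failed");  // throws; ~Resource() still only logs
  } catch (const std::exception &e) {
    std::cerr << "Exception: " << e.what() << std::endl;
  }
  return 0;
}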
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_device_manager.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_device_manager.cc
index 5207bdf1b6..c10ee6fef9 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_device_manager.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_device_manager.cc
@@ -26,14 +26,16 @@ namespace gpu {
 void GPUDeviceManager::InitDevice() {
   CHECK_OP_RET_WITH_EXCEPT(CudaDriver::set_current_device(SizeToInt(cur_dev_id_)), "Failed to set current device id");
   CHECK_OP_RET_WITH_EXCEPT(CreateStream(&default_stream_), "Failed to create CUDA stream.");
-  CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreate(&cudnn_handle_), "Failed to create cuDNN handle");
-  CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetStream(cudnn_handle_, reinterpret_cast<cudaStream_t>(default_stream())),
-                              "Failed to set stream for cuDNN handle.");
-  CHECK_CUBLAS_RET_WITH_EXCEPT(cublasCreate(&cublas_handle_), "Failed to create cuBLAS handle.");
-  CHECK_CUBLAS_RET_WITH_EXCEPT(cublasSetStream(cublas_handle_, reinterpret_cast<cudaStream_t>(default_stream())),
-                               "Failed to set stream for cuBLAS handle.");
-  CHECK_CUSOLVER_RET_WITH_EXCEPT(cusolverDnCreate(&cusolver_dn_handle_), "Failed to create cusolver dn handle.");
-  CHECK_CUSOLVER_RET_WITH_EXCEPT(
+  CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnCreate(&cudnn_handle_), "Failed to create cuDNN handle");
+  CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnSetStream(cudnn_handle_, reinterpret_cast<cudaStream_t>(default_stream())),
+                                      "Failed to set stream for cuDNN handle.");
+  CHECK_CUBLAS_RET_WITH_EXCEPT_NOTRACE(cublasCreate(&cublas_handle_), "Failed to create cuBLAS handle.");
+  CHECK_CUBLAS_RET_WITH_EXCEPT_NOTRACE(
+    cublasSetStream(cublas_handle_, reinterpret_cast<cudaStream_t>(default_stream())),
+    "Failed to set stream for cuBLAS handle.");
+  CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(cusolverDnCreate(&cusolver_dn_handle_),
+                                         "Failed to create cusolver dn handle.");
+  CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(
     cusolverDnSetStream(cusolver_dn_handle_, reinterpret_cast<cudaStream_t>(default_stream())),
     "Failed to set stream for cusolver dn handle");
   CHECK_OP_RET_WITH_EXCEPT(GPUMemoryAllocator::GetInstance().Init(), "Failed to Init gpu memory allocator")
@@ -46,7 +48,7 @@ void GPUDeviceManager::ReleaseDevice() {
     }
   }
   if (cudnn_handle_ != nullptr) {
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroy(cudnn_handle_), "Failed to destroy cuDNN handle");
+    CHECK_CUDNN_RET_WITH_ERROR_NOTRACE(cudnnDestroy(cudnn_handle_), "Failed to destroy cuDNN handle");
   }
   if (cublas_handle_ != nullptr) {
     CHECK_CUBLAS_RET_WITH_ERROR(cublasDestroy(cublas_handle_), "Failed to destroy cuBLAS handle.");
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_stream_assign.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_stream_assign.cc
index 4df7c91e3b..de69c73d52 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_stream_assign.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_stream_assign.cc
@@ -174,7 +174,8 @@ bool GenSendRecvCNodesForAllReduce(const std::shared_ptr &
   MS_EXCEPTION_IF_NULL(*recv_node);
 
   cudaEvent_t event = nullptr;
-  CHECK_CUDA_RET_WITH_EXCEPT(cudaEventCreate(&event, cudaEventDisableTiming), "Creating cuda event failed.");
+  CHECK_CUDA_RET_WITH_EXCEPT(*send_node, cudaEventCreate(&event, cudaEventDisableTiming),
+                             "Creating cuda event failed.");
   AnfAlgo::SetNodeAttr(kAttrRecordEvent, MakeValue(reinterpret_cast(event)), *send_node);
   AnfAlgo::SetNodeAttr(kAttrWaitEvent, MakeValue(reinterpret_cast(event)), *recv_node);
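GPUDeviceManager creates the cuDNN/cuBLAS/cusolver handles during device initialization, before any kernel node exists, so there is nothing for trace::DumpSourceLines to report; the *_NOTRACE variants exist precisely for these call sites. A minimal sketch of the same handle lifecycle (requires cuDNN; the check macro is a simplified stand-in):

#include <cudnn.h>
#include <cstdio>

#define CHECK_CUDNN_NOTRACE(expression, message)                                     \
  {                                                                                  \
    cudnnStatus_t status = (expression);                                             \
    if (status != CUDNN_STATUS_SUCCESS) {                                            \
      std::printf("cuDNN Error: %s | %s\n", (message), cudnnGetErrorString(status)); \
    }                                                                                \
  }

int main() {
  cudnnHandle_t handle = nullptr;
  CHECK_CUDNN_NOTRACE(cudnnCreate(&handle), "Failed to create cuDNN handle");
  // ... kernels would bind streams and run here, using the traced macros instead ...
  if (handle != nullptr) {
    CHECK_CUDNN_NOTRACE(cudnnDestroy(handle), "Failed to destroy cuDNN handle");
  }
  return 0;
}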
diff --git a/mindspore/ccsrc/runtime/device/gpu/queue_common.h b/mindspore/ccsrc/runtime/device/gpu/queue_common.h
new file mode 100644
index 0000000000..b711190ba3
--- /dev/null
+++ b/mindspore/ccsrc/runtime/device/gpu/queue_common.h
@@ -0,0 +1,42 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_QUEUE_COMMON_H_
+#define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_QUEUE_COMMON_H_
+
+#include
+#include
+#include
+#include
+#include "utils/log_adapter.h"
+#include "include/curand.h"
+
+namespace mindspore {
+namespace device {
+namespace gpu {
+#define CHECK_CUDA_RET_WITH_ERROR(expression, message)                                    \
+  {                                                                                       \
+    cudaError_t status = (expression);                                                    \
+    if (status != cudaSuccess) {                                                          \
+      MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " "  \
+                    << cudaGetErrorString(status);                                        \
+    }                                                                                     \
+  }
+}  // namespace gpu
+}  // namespace device
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_QUEUE_COMMON_H_
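One consequence visible from the two headers themselves: gpu_common.h and queue_common.h now both define CHECK_CUDA_RET_WITH_ERROR, with different parameter lists, so a translation unit should include exactly one of them; redefining a function-like macro with a different replacement list is ill-formed, and compilers typically diagnose it and let the later definition win. That is why blocking_queue.cc swaps its include rather than adding a second one. A hypothetical translation unit illustrating the conflict:

#include "runtime/device/gpu/gpu_common.h"    // defines CHECK_CUDA_RET_WITH_ERROR(node, expression, message)
#include "runtime/device/gpu/queue_common.h"  // macro redefinition: CHECK_CUDA_RET_WITH_ERROR(expression, message)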