diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/array_reduce_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/array_reduce_gpu_kernel.h
index e615b2b049..09df5021b5 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/array_reduce_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/array_reduce_gpu_kernel.h
@@ -35,21 +35,7 @@ const std::map<std::string, cudnnReduceTensorOp_t> kReduceTypeMap = {
 template <typename T>
 class ArrayReduceGpuKernel : public GpuKernel {
  public:
-  ArrayReduceGpuKernel()
-      : cudnn_handle_(nullptr),
-        reduce_tensor_op_(CUDNN_REDUCE_TENSOR_ADD),
-        data_type_(CUDNN_DATA_FLOAT),
-        nan_prop_(CUDNN_NOT_PROPAGATE_NAN),
-        reduce_indices_(CUDNN_REDUCE_TENSOR_NO_INDICES),
-        reduce_tensor_descriptor_(nullptr),
-        inputA_descriptor_(nullptr),
-        outputC_descriptor_(nullptr),
-        keep_dims_(false),
-        all_match_(false),
-        is_null_input_(false),
-        input_size_(0),
-        output_size_(0),
-        workspace_size_(0) {}
+  ArrayReduceGpuKernel() { ResetResource(); }
   ~ArrayReduceGpuKernel() override { DestroyResource(); }

   const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -94,7 +80,7 @@ class ArrayReduceGpuKernel : public GpuKernel {
       MS_LOG(ERROR) << "Output number is " << output_num << ", but reduce op needs 1 output.";
       return false;
     }
-    int input_dim_length = SizeToInt(AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0).size());
+    int input_dim_length = SizeToInt(AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0).size());

     if (AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("axis")->isa<ValueTuple>() ||
         AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("axis")->isa<ValueList>()) {
@@ -117,8 +103,8 @@ class ArrayReduceGpuKernel : public GpuKernel {
     }
     keep_dims_ = GetAttr<bool>(kernel_node, "keep_dims");

-    auto inputA_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
-    auto outputC_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
+    auto inputA_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
+    auto outputC_shape = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
     is_null_input_ = CHECK_NULL_INPUT(inputA_shape);
     if (is_null_input_) {
       MS_LOG(WARNING) << "ArrayReduceGpuKernel input is null";
@@ -132,6 +118,35 @@ class ArrayReduceGpuKernel : public GpuKernel {
     return true;
   }

+  void ResetResource() noexcept override {
+    cudnn_handle_ = nullptr;
+    reduce_tensor_op_ = CUDNN_REDUCE_TENSOR_ADD;
+    data_type_ = CUDNN_DATA_FLOAT;
+    nan_prop_ = CUDNN_NOT_PROPAGATE_NAN;
+    reduce_indices_ = CUDNN_REDUCE_TENSOR_NO_INDICES;
+    reduce_tensor_descriptor_ = nullptr;
+    inputA_descriptor_ = nullptr;
+    outputC_descriptor_ = nullptr;
+    keep_dims_ = false;
+    all_match_ = false;
+    is_null_input_ = false;
+    input_size_ = 0;
+    output_size_ = 0;
+    workspace_size_ = 0;
+    input_size_list_.clear();
+    output_size_list_.clear();
+    workspace_size_list_.clear();
+  }
+
+  void DestroyResource() noexcept override {
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_),
+                               "cudnnDestroyReduceTensorDescriptor failed.");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(inputA_descriptor_),
+                               "cudnnDestroyTensorDescriptor failed.");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(outputC_descriptor_),
+                               "cudnnDestroyTensorDescriptor failed.");
+  }
+
 protected:
   void InitResource() override {
     cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -160,14 +175,6 @@ class ArrayReduceGpuKernel : public GpuKernel {
   }

 private:
-  void DestroyResource() noexcept {
-
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_), - "cudnnDestroyReduceTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(inputA_descriptor_), - "cudnnDestroyTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(outputC_descriptor_), - "cudnnDestroyTensorDescriptor failed."); - } void InferArrayReduceType(const CNodePtr &kernel_node) { std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); auto iter = kReduceTypeMap.find(kernel_name); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.cc index 7a13d05d38..a54627b41f 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.cc @@ -26,5 +26,14 @@ MS_REG_GPU_KERNEL_TWO( GatherV2, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16), GatherV2GpuFwdKernel, half, int) + +MS_REG_GPU_KERNEL_TWO( + SparseGatherV2, + KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32), + GatherV2GpuFwdKernel, float, int) +MS_REG_GPU_KERNEL_TWO( + SparseGatherV2, + KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16), + GatherV2GpuFwdKernel, half, int) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.h index d5649bb0d4..81136ff4fe 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.h @@ -27,7 +27,7 @@ namespace kernel { template class GatherV2GpuFwdKernel : public GpuKernel { public: - GatherV2GpuFwdKernel() : axis_(0), handle_(nullptr) {} + GatherV2GpuFwdKernel() { ResetResource(); } ~GatherV2GpuFwdKernel() = default; const std::vector &GetInputSizeList() const override { return input_size_list_; } @@ -52,9 +52,9 @@ class GatherV2GpuFwdKernel : public GpuKernel { if (input_num != 2) { MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but GatherGpuV2FwdKernel needs 2."; } - input_shapes_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); - indices_shapes_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); - output_shapes_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); + input_shapes_ = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); + indices_shapes_ = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1); + output_shapes_ = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0); axis_ = static_cast(GetAttr(kernel_node, "axis")); if (axis_ < 0) { @@ -65,9 +65,18 @@ class GatherV2GpuFwdKernel : public GpuKernel { InitSizeLists(); return true; } + void ResetResource() noexcept override { + input_shapes_.clear(); + indices_shapes_.clear(); + output_shapes_.clear(); + std::fill(dims_, dims_ + 3, 0); + axis_ = 0; + input_size_list_.clear(); + output_size_list_.clear(); + workspace_size_list_.clear(); + } protected: - void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); } void InitSizeLists() override { size_t size = GetSize(input_shapes_); input_size_list_.push_back(size); @@ -118,7 +127,6 @@ class GatherV2GpuFwdKernel : public 
GpuKernel { size_t dims_[3] = {}; int axis_; - cudnnHandle_t handle_; std::vector input_size_list_; std::vector output_size_list_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h index 124a7734a2..a487e5febc 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h @@ -28,14 +28,7 @@ namespace kernel { template class SplitGpuFwdKernel : public GpuKernel { public: - SplitGpuFwdKernel() - : axis_(0), - output_num_(1), - input_size_(1), - axis_step_(1), - all_size_before_axis_(1), - all_size_axis_(1), - outputs_host_(nullptr) {} + SplitGpuFwdKernel() { ResetResource(); } ~SplitGpuFwdKernel() override = default; const std::vector &GetInputSizeList() const override { return input_size_list_; } const std::vector &GetOutputSizeList() const override { return output_size_list_; } @@ -59,7 +52,7 @@ class SplitGpuFwdKernel : public GpuKernel { bool Init(const CNodePtr &kernel_node) override { axis_ = static_cast(GetAttr(kernel_node, "axis")); if (axis_ < 0) { - auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); axis_ += SizeToInt(input_shape.size()); } output_num_ = static_cast(GetAttr(kernel_node, "output_num")); @@ -68,7 +61,7 @@ class SplitGpuFwdKernel : public GpuKernel { return false; } - auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); input_size_ = 1; all_size_before_axis_ = 1; all_size_axis_ = 1; @@ -88,7 +81,7 @@ class SplitGpuFwdKernel : public GpuKernel { for (int i = 0; i < output_num_; i++) { size_t output_size = 1; - auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, i); + auto output_shape = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, i); for (size_t j = 0; j < output_shape.size(); j++) { output_size *= output_shape[j]; } @@ -100,6 +93,19 @@ class SplitGpuFwdKernel : public GpuKernel { return true; } + void ResetResource() noexcept override { + axis_ = 0; + output_num_ = 1; + input_size_ = 1; + axis_step_ = 1; + all_size_before_axis_ = 1; + all_size_axis_ = 1; + outputs_host_ = nullptr; + input_size_list_.clear(); + output_size_list_.clear(); + workspace_size_list_.clear(); + } + protected: void InitSizeLists() override {} diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/transpose_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/transpose_gpu_kernel.h index 7bb0a6b77a..4a5fa2d0ae 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/transpose_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/transpose_gpu_kernel.h @@ -62,7 +62,7 @@ class TransposeGpuFwdKernel : public GpuKernel { MS_LOG(ERROR) << "Output number is " << output_num << ", but transpose needs 1 output."; return false; } - auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); + auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); shape_size_ = input_shape.size(); if (shape_size_ > TRANSPOSE_MAX_DIMENSION) { MS_LOG(EXCEPTION) << "Input is " << shape_size_ << "-D, but transpose supports max " << TRANSPOSE_MAX_DIMENSION diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_sum_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_sum_gpu_kernel.h index 
32527af5c6..8db9db50b8 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_sum_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_sum_gpu_kernel.h
@@ -27,8 +27,7 @@ namespace kernel {
 template <typename T, typename S>
 class UnsortedSegmentSumGpuKernel : public GpuKernel {
  public:
-  UnsortedSegmentSumGpuKernel()
-      : input_dim0_(1), input_dim1_(1), output_dim0_(1), output_dim1_(1), is_null_input_(false) {}
+  UnsortedSegmentSumGpuKernel() { ResetResource(); }
   ~UnsortedSegmentSumGpuKernel() override = default;

   const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -53,15 +52,15 @@ class UnsortedSegmentSumGpuKernel : public GpuKernel {
   }

   bool Init(const CNodePtr &kernel_node) override {
-    auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+    auto input_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
     is_null_input_ = CHECK_NULL_INPUT(input_shapes);
     if (is_null_input_) {
       MS_LOG(WARNING) << "UnsortedSegmentSum input is null";
       InitSizeLists();
       return true;
     }
-    auto ids_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
-    auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0);
+    auto ids_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
+    auto output_shapes = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);

     auto axis = ids_shapes.size();
     for (size_t i = 0; i < input_shapes.size(); i++) {
@@ -81,6 +80,17 @@ class UnsortedSegmentSumGpuKernel : public GpuKernel {
     return true;
   }

+  void ResetResource() noexcept override {
+    input_dim0_ = 1;
+    input_dim1_ = 1;
+    output_dim0_ = 1;
+    output_dim1_ = 1;
+    is_null_input_ = false;
+    input_size_list_.clear();
+    output_size_list_.clear();
+    workspace_size_list_.clear();
+  }
+
 protected:
   void InitSizeLists() override {
     input_size_list_.push_back(input_dim0_ * input_dim1_ * sizeof(T));
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.cc
new file mode 100644
index 0000000000..d5a93373f0
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.cc
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/gpu/gpu_kernel.h"
+
+namespace mindspore {
+namespace kernel {
+void GpuDynamicKernel::UpdateArgs() {
+  if (!is_input_dynamic_shape_ && is_output_dynamic_shape_ && !have_depends()) {
+    return;
+  }
+
+  MS_LOG(INFO) << "Update Args: " << cnode_ptr_->fullname_with_scope();
+  auto kernel_mod = AnfAlgo::GetKernelMod(cnode_ptr_);
+  MS_EXCEPTION_IF_NULL(kernel_mod);
+  auto gpu_kernel_mod = dynamic_cast<GpuKernel *>(kernel_mod);
+  MS_EXCEPTION_IF_NULL(gpu_kernel_mod);
+  gpu_kernel_mod->DestroyResource();
+  gpu_kernel_mod->ResetResource();
+  gpu_kernel_mod->Init(cnode_ptr_);
+}
+}  // namespace kernel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h
index 81d86e22a6..54ad8b99ea 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h
@@ -23,11 +23,13 @@
 #include
 #include
 #include
+#include
 #include "backend/kernel_compiler/kernel.h"
 #include "backend/kernel_compiler/gpu/kernel_constants.h"
 #include "runtime/device/gpu/gpu_device_manager.h"
 #include "runtime/device/gpu/gpu_common.h"
 #include "backend/session/anf_runtime_algorithm.h"
+#include "runtime/device/executor/dynamic_kernel.h"
 using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;

 namespace mindspore {
@@ -45,10 +47,28 @@ static std::map<size_t, size_t> kNHWCToNCHWAxisMap = {
   {3, 1},
 };

+class GpuDynamicKernel : public device::DynamicKernel {
+ public:
+  explicit GpuDynamicKernel(const CNodePtr &cnode_ptr) : DynamicKernel(nullptr, cnode_ptr) {}
+  ~GpuDynamicKernel() = default;
+
+  void UpdateArgs() override;
+  void PostExecute() final { MS_LOG(EXCEPTION) << "`PostExecute()` should not be invoked with gpu backend"; }
+  void Execute() final { MS_LOG(EXCEPTION) << "`Execute()` should not be invoked with gpu backend"; }
+};
+
 class GpuKernel : public KernelMod {
  public:
   virtual ~GpuKernel() = default;
   virtual bool Init(const CNodePtr &kernel_node) = 0;
+  virtual void ResetResource() noexcept {
+    MS_LOG(EXCEPTION) << "kernel must override the `ResetResource()` method when it supports dynamic shape";
+  }
+  virtual void DestroyResource() noexcept {}
+  virtual void PostExecute() {}
+
+  void InitDynamicKernel(const CNodePtr &cnode_ptr) { dynamic_kernel_ = std::make_shared<GpuDynamicKernel>(cnode_ptr); }
+  device::DynamicKernelPtr DynamicKernel() const { return dynamic_kernel_; }

 protected:
   virtual void InitResource() {}
@@ -228,7 +248,10 @@ class GpuKernel : public KernelMod {
     }
     return type->second;
   }
+
+  device::DynamicKernelPtr dynamic_kernel_;
 };
+

 }  // namespace kernel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h
index 5379ed9534..c1a878c1df 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h
@@ -123,6 +123,10 @@ class AddNGpuFwdKernel : public GpuKernel {
     return true;
   }

+  void DestroyResource() noexcept override {
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "cudnnDestroyTensorDescriptor failed");
+  }
+
 protected:
   void InitResource() override {
     cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -141,9 +145,6 @@ class AddNGpuFwdKernel : public GpuKernel {
   }

 private:
-  void DestroyResource() noexcept {
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_),
"cudnnDestroyTensorDescriptor failed"); - } cudnnHandle_t cudnn_handle_; cudnnTensorDescriptor_t input_descriptor_; cudnnDataType_t cudnn_data_type_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/bias_add_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/bias_add_gpu_kernel.h index 6f0529ab72..d43d07970d 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/bias_add_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/bias_add_gpu_kernel.h @@ -112,6 +112,12 @@ class BiasAddGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyOpTensorDescriptor(op_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(b_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyOpTensorDescriptor failed"); + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -129,12 +135,6 @@ class BiasAddGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyOpTensorDescriptor(op_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(b_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyOpTensorDescriptor failed"); - } - cudnnHandle_t cudnn_handle_; cudnnDataType_t cudnn_data_type_; cudnnTensorDescriptor_t x_desc_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h index 18851e9622..6ee45e7624 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h @@ -31,13 +31,7 @@ constexpr int MAX_DIMS = 7; template class BroadcastOpGpuKernel : public GpuKernel { public: - BroadcastOpGpuKernel() - : op_type_(BROADCAST_TYPE_INVALID), - need_broadcast_(false), - is_comp_op_(false), - input1_num_(1), - input2_num_(1), - output_num_(1) {} + BroadcastOpGpuKernel() { ResetResource(); } ~BroadcastOpGpuKernel() override = default; const std::vector &GetInputSizeList() const override { return input_size_list_; } @@ -71,9 +65,9 @@ class BroadcastOpGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { GetOpType(kernel_node); - auto shape1 = AnfAlgo::GetInputDeviceShape(kernel_node, 0); - auto shape2 = AnfAlgo::GetInputDeviceShape(kernel_node, 1); - auto shape3 = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); + auto shape1 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); + auto shape2 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1); + auto shape3 = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0); need_broadcast_ = IsBroadcast(shape1, shape2); if (need_broadcast_ && shape1.size() > 7) { MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than 7"; @@ -106,6 +100,20 @@ class BroadcastOpGpuKernel : public GpuKernel { InitSizeLists(); return true; } + void ResetResource() noexcept override { + op_type_ = BROADCAST_TYPE_INVALID; + need_broadcast_ = false; + is_comp_op_ = false; + input1_num_ = 1; + input2_num_ = 1; + output_num_ = 1; + lhs_shape_.clear(); + rhs_shape_.clear(); + output_shape_.clear(); + input_size_list_.clear(); + output_size_list_.clear(); + 
workspace_size_list_.clear(); + } protected: void InitResource() override { return; } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_grad_gpu_kernel.h index c6e9c90373..0b1d262a84 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_grad_gpu_kernel.h @@ -30,14 +30,7 @@ namespace kernel { template class BroadcastOpGradGpuKernel : public GpuKernel { public: - BroadcastOpGradGpuKernel() - : op_type_(BROADCAST_GRAD_TYPE_INVALID), - need_broadcast_(false), - input1_num_(1), - input2_num_(1), - output_num_(1), - grad_x_(false), - grad_y_(false) {} + BroadcastOpGradGpuKernel() { ResetResource(); } ~BroadcastOpGradGpuKernel() override = default; const std::vector &GetInputSizeList() const override { return input_size_list_; } @@ -105,6 +98,22 @@ class BroadcastOpGradGpuKernel : public GpuKernel { return true; } + void ResetResource() noexcept override { + op_type_ = BROADCAST_GRAD_TYPE_INVALID; + need_broadcast_ = false; + input1_num_ = 1; + input2_num_ = 1; + output_num_ = 1; + std::fill(x1_shape_, x1_shape_ + 4, 1); + std::fill(x2_shape_, x2_shape_ + 4, 1); + std::fill(dy_shape_, dy_shape_ + 4, 1); + grad_x_ = false; + grad_y_ = false; + input_size_list_.clear(); + output_size_list_.clear(); + workspace_size_list_.clear(); + } + protected: void InitResource() override { return; } void InitSizeLists() override { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_gpu_kernel.h index 591768a1b9..bf94c0bfcd 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_gpu_kernel.h @@ -69,21 +69,15 @@ static const std::map kUnaryOpTypeMap = {{"Exp", UNARY template class UnaryOpGpuKernel : public GpuKernel { public: - UnaryOpGpuKernel() - : unary_op_type_(UNARY_OP_INVALID_TYPE), - input_size_(sizeof(T)), - output_size_(sizeof(T)), - workspace_size_(0), - is_null_input_(false) {} + UnaryOpGpuKernel() { ResetResource(); } ~UnaryOpGpuKernel() override = default; const std::vector &GetInputSizeList() const override { return input_size_list_; } const std::vector &GetOutputSizeList() const override { return output_size_list_; } const std::vector &GetWorkspaceSizeList() const override { return workspace_size_list_; } - bool Launch(const std::vector &inputs, const std::vector &workspace, + bool Launch(const std::vector &inputs, const std::vector &, const std::vector &outputs, void *stream_ptr) override { - VARIABLE_NOT_USED(workspace); T *input_addr = GetDeviceAddress(inputs, 0); T *output_addr = GetDeviceAddress(outputs, 0); @@ -184,7 +178,7 @@ class UnaryOpGpuKernel : public GpuKernel { MS_LOG(ERROR) << "Output number is " << output_num << ", but unary op needs 1 output."; return false; } - auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); is_null_input_ = CHECK_NULL_INPUT(input_shape); if (is_null_input_) { MS_LOG(WARNING) << "UnaryOpGpuKernel input is null"; @@ -198,6 +192,16 @@ class UnaryOpGpuKernel : public GpuKernel { InitSizeLists(); return true; } + void ResetResource() noexcept override { + unary_op_type_ = UNARY_OP_INVALID_TYPE; + input_size_ = sizeof(T); + output_size_ = sizeof(T); + workspace_size_ = 0; + is_null_input_ 
= false; + input_size_list_.clear(); + output_size_list_.clear(); + workspace_size_list_.clear(); + } protected: void InitSizeLists() override { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_gpu_kernel.h index 2529cfc8ac..f6934d0416 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_gpu_kernel.h @@ -29,16 +29,7 @@ namespace kernel { template class ActivationGpuFwdKernel : public GpuKernel { public: - ActivationGpuFwdKernel() - : cudnn_handle_(nullptr), - activation_desc_(nullptr), - mode_(CUDNN_ACTIVATION_RELU), - data_descriptor_(nullptr), - is_null_input_(false), - cudnn_data_type_(CUDNN_DATA_FLOAT), - input_size_(0), - output_size_(0), - workspace_size_(0) {} + ActivationGpuFwdKernel() { ResetResource(); } ~ActivationGpuFwdKernel() override { DestroyResource(); } const std::vector &GetInputSizeList() const override { return input_size_list_; } const std::vector &GetOutputSizeList() const override { return output_size_list_; } @@ -75,7 +66,7 @@ class ActivationGpuFwdKernel : public GpuKernel { MS_LOG(ERROR) << "Argument number is " << input_num << ", but ActivationGpuFwdKernel needs 1."; return false; } - auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); + auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); is_null_input_ = CHECK_NULL_INPUT(input_shape); if (is_null_input_) { MS_LOG(WARNING) << "ActivationGpuFwdKernel input is null."; @@ -113,6 +104,27 @@ class ActivationGpuFwdKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), + "cudnnDestroyActivationDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed"); + } + + void ResetResource() noexcept override { + cudnn_handle_ = nullptr; + activation_desc_ = nullptr; + mode_ = CUDNN_ACTIVATION_RELU; + data_descriptor_ = nullptr; + is_null_input_ = false; + input_size_list_.clear(); + output_size_list_.clear(); + workspace_size_list_.clear(); + cudnn_data_type_ = CUDNN_DATA_FLOAT; + input_size_ = 0; + output_size_ = 0; + workspace_size_ = 0; + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -132,12 +144,6 @@ class ActivationGpuFwdKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), - "cudnnDestroyActivationDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed"); - } - std::map kernel_map = {{"ReLU", CUDNN_ACTIVATION_RELU}, {"ReLU6", CUDNN_ACTIVATION_CLIPPED_RELU}, {"Tanh", CUDNN_ACTIVATION_TANH}, diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h index e6ee2d56a8..b3d0c55af5 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h @@ -29,14 +29,7 @@ namespace kernel { template class ActivationGradGpuKernel : public GpuKernel { public: - ActivationGradGpuKernel() - : cudnn_handle_(nullptr), - activation_desc_(nullptr), - mode_(CUDNN_ACTIVATION_RELU), - 
data_descriptor_(nullptr), - is_null_input_(false), - cudnn_data_type_(CUDNN_DATA_FLOAT), - input_size_(0) {} + ActivationGradGpuKernel() { ResetResource(); } ~ActivationGradGpuKernel() override { DestroyResource(); } const std::vector &GetInputSizeList() const override { return input_size_list_; } const std::vector &GetOutputSizeList() const override { return output_size_list_; } @@ -117,6 +110,25 @@ class ActivationGradGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), + "cudnnDestroyActivationDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed"); + } + + void ResetResource() noexcept override { + cudnn_handle_ = nullptr; + activation_desc_ = nullptr; + mode_ = CUDNN_ACTIVATION_RELU; + data_descriptor_ = nullptr; + is_null_input_ = false; + input_size_list_.clear(); + output_size_list_.clear(); + workspace_size_list_.clear(); + cudnn_data_type_ = CUDNN_DATA_FLOAT; + input_size_ = 0; + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -135,12 +147,6 @@ class ActivationGradGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), - "cudnnDestroyActivationDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed"); - } - std::map kernel_map = {{"ReluGrad", CUDNN_ACTIVATION_RELU}, {"ReLU6Grad", CUDNN_ACTIVATION_CLIPPED_RELU}, {"TanhGrad", CUDNN_ACTIVATION_TANH}, diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/batchnorm_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/batchnorm_grad_gpu_kernel.h index c548e4bdac..5185109a90 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/batchnorm_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/batchnorm_grad_gpu_kernel.h @@ -121,6 +121,13 @@ class BatchNormGradGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_desc_), "Destroy para desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); + } + protected: void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -152,13 +159,6 @@ class BatchNormGradGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_desc_), "Destroy para desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); - } - int batch_; int channel_; int height_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/bias_add_grad_gpu_kenel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/bias_add_grad_gpu_kenel.h index 7787c4c037..aa1be7b7b1 100644 --- 
a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/bias_add_grad_gpu_kenel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/bias_add_grad_gpu_kenel.h @@ -111,6 +111,13 @@ class BiasAddGradGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_EXCEPT(cudnnDestroyReduceTensorDescriptor(op_desc_), + "cudnnDestroyReduceTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(db_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyOpTensorDescriptor failed"); + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -137,13 +144,6 @@ class BiasAddGradGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnDestroyReduceTensorDescriptor(op_desc_), - "cudnnDestroyReduceTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(db_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyOpTensorDescriptor failed"); - } - bool same_dims_; cudnnHandle_t cudnn_handle_; cudnnDataType_t cudnn_data_type_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_gpu_kernel.h index 0afdf7c844..c5b88da02d 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_gpu_kernel.h @@ -198,6 +198,15 @@ class Conv2dGpuFwdKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), + "cudnnDestroyConvolutionDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(filter_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed"); + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -243,14 +252,6 @@ class Conv2dGpuFwdKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), - "cudnnDestroyConvolutionDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(filter_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed"); - } bool CheckParam(const CNodePtr &kernel_node) { size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 2) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_filter_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_filter_gpu_kernel.h index 22fb7ced2c..49f1124411 100644 --- 
a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_filter_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_filter_gpu_kernel.h @@ -199,6 +199,15 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), + "cudnnDestroyConvolutionDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(dw_desc_), "cudnnDestroyFilterDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyTensorDescriptor failed"); + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -243,14 +252,6 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), - "cudnnDestroyConvolutionDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(dw_desc_), "cudnnDestroyFilterDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyTensorDescriptor failed"); - } bool CheckParam(const CNodePtr &kernel_node) { size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 2) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_input_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_input_gpu_kernel.h index 965d67b846..9608204c4e 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_input_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_input_gpu_kernel.h @@ -203,6 +203,15 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), + "cudnnDestroyConvolutionDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "cudnnDestroyFilterDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "cudnnDestroyTensorDescriptor failed"); + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -244,14 +253,6 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), - "cudnnDestroyConvolutionDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "cudnnDestroyFilterDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed"); - 
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "cudnnDestroyTensorDescriptor failed"); - } bool CheckParam(const CNodePtr &kernel_node) { size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 2) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/flatten_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/flatten_gpu_kernel.h index baf6e35f2e..3e199cec18 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/flatten_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/flatten_gpu_kernel.h @@ -27,7 +27,7 @@ namespace kernel { template class FlattenGpuFwdKernel : public GpuKernel { public: - FlattenGpuFwdKernel() : input_size_(0), output_size_(0), workspace_size_(0) {} + FlattenGpuFwdKernel() : input_size_(0) {} ~FlattenGpuFwdKernel() override = default; const std::vector &GetInputSizeList() const override { return input_size_list_; } @@ -47,7 +47,7 @@ class FlattenGpuFwdKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { - auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + auto shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); input_size_ = sizeof(T); for (size_t i = 0; i < shape.size(); ++i) { input_size_ *= shape[i]; @@ -55,12 +55,17 @@ class FlattenGpuFwdKernel : public GpuKernel { InitSizeLists(); return true; } + void ResetResource() noexcept override { + input_size_ = 0; + input_size_list_.clear(); + output_size_list_.clear(); + workspace_size_list_.clear(); + } protected: void InitSizeLists() override { input_size_list_.push_back(input_size_); - output_size_ = input_size_; - output_size_list_.push_back(output_size_); + output_size_list_.push_back(input_size_); } private: @@ -69,8 +74,6 @@ class FlattenGpuFwdKernel : public GpuKernel { std::vector workspace_size_list_; size_t input_size_; - size_t output_size_; - size_t workspace_size_; }; } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/flatten_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/flatten_grad_gpu_kernel.h index 700fe5884a..92ed5e07e0 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/flatten_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/flatten_grad_gpu_kernel.h @@ -27,7 +27,7 @@ namespace kernel { template class FlattenGardGpuBkwKernel : public GpuKernel { public: - FlattenGardGpuBkwKernel() : input_size_(0), output_size_(0), workspace_size_(0) {} + FlattenGardGpuBkwKernel() { ResetResource(); } ~FlattenGardGpuBkwKernel() override = default; const std::vector &GetInputSizeList() const override { return input_size_list_; } @@ -54,7 +54,7 @@ class FlattenGardGpuBkwKernel : public GpuKernel { return false; } - auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + auto shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); for (size_t i = 0; i < shape.size(); ++i) { if (input_size_ == 0) { input_size_ = 1; @@ -67,11 +67,17 @@ class FlattenGardGpuBkwKernel : public GpuKernel { return true; } + void ResetResource() noexcept override { + input_size_ = 0; + input_size_list_.clear(); + output_size_list_.clear(); + workspace_size_list_.clear(); + } + protected: void InitSizeLists() override { input_size_list_.push_back(input_size_); - output_size_ = input_size_; - output_size_list_.push_back(output_size_); + 
output_size_list_.push_back(input_size_); } private: @@ -80,8 +86,6 @@ class FlattenGardGpuBkwKernel : public GpuKernel { std::vector workspace_size_list_; size_t input_size_; - size_t output_size_; - size_t workspace_size_; }; } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_ex_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_ex_gpu_kernel.h index 8262e3423b..e8564d354c 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_ex_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_ex_gpu_kernel.h @@ -140,6 +140,20 @@ class FusedBatchNormExGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed"); + if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(z_desc_), "Destroy z desc failed"); + } + + if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), + "Destroy activation descriptor failed"); + } + } + protected: void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -238,20 +252,6 @@ class FusedBatchNormExGpuKernel : public GpuKernel { } } - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed"); - if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(z_desc_), "Destroy z desc failed"); - } - - if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), - "Destroy activation descriptor failed"); - } - } - size_t input_x_size_; size_t input_z_size_; size_t para_size_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.h index 2d107136f7..695c453e64 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.h @@ -133,6 +133,12 @@ class FusedBatchNormGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed"); + } + protected: void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -165,12 +171,6 @@ class FusedBatchNormGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); - 
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed"); - } - int batch_; int channel_; int height_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_grad_ex_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_grad_ex_gpu_kernel.h index 49d05b8729..7704dccec0 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_grad_ex_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_grad_ex_gpu_kernel.h @@ -201,6 +201,21 @@ class FusedBatchNormGradExGpuKernel : public GpuKernel { workspace_size_list_.push_back(workspace_size_); } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); + if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), + "Destroy activation descriptor failed"); + } + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed"); + + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); + if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dz_desc_), "Destroy z desc failed"); + } + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_diff_desc_), "Destroy para desc failed"); + } private: void SetTensorDescriptor(const std::string &format, const std::vector &shape) { @@ -255,22 +270,6 @@ class FusedBatchNormGradExGpuKernel : public GpuKernel { } } - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); - if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), - "Destroy activation descriptor failed"); - } - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed"); - - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); - if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dz_desc_), "Destroy z desc failed"); - } - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_diff_desc_), "Destroy para desc failed"); - } - size_t x_size_; size_t para_size_; size_t workspace_size_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.h index b22cc2f03f..c459df03cf 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.h @@ -117,6 +117,13 @@ class FusedBatchNormGradGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_desc_), "Destroy para desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy 
desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); + } + protected: void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -146,13 +153,6 @@ class FusedBatchNormGradGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_desc_), "Destroy para desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); - } - int batch_; int channel_; int height_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/im2col_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/im2col_gpu_kernel.h index 4fd4c90154..fa27e77d8e 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/im2col_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/im2col_gpu_kernel.h @@ -123,6 +123,15 @@ class Im2ColGpuFwdKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), + "cudnnDestroyConvolutionDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(filter_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed"); + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -152,14 +161,6 @@ class Im2ColGpuFwdKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), - "cudnnDestroyConvolutionDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(filter_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed"); - } bool CheckParam(const CNodePtr &kernel_node) { cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_gpu_kernel.h index 315748b7d9..60cb4e43dd 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_gpu_kernel.h @@ -157,6 +157,21 @@ class LstmGpuKernel : public GpuKernel { } } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cy_desc_), 
"destroy cy_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hy_desc_), "destroy hy_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc failed"); + + for (size_t i = 0; i < IntToSize(seq_len_); ++i) { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_[i]), "destroy y_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed"); + } + } + protected: void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -195,21 +210,6 @@ class LstmGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cy_desc_), "destroy cy_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hy_desc_), "destroy hy_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc failed"); - - for (size_t i = 0; i < IntToSize(seq_len_); ++i) { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_[i]), "destroy y_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed"); - } - } - int batch_size_; int seq_len_; int input_size_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_data_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_data_gpu_kernel.h index 10813422bd..dee8d56cd7 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_data_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_data_gpu_kernel.h @@ -150,6 +150,18 @@ class LstmGradDataGpuKernel : public GpuKernel { InitSizeLists(); return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dcx_desc_), "destroy dcx_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dhx_desc_), "destroy dhx_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dcy_desc_), "destroy dcy_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dhy_desc_), "destroy dhy_desc_ failed"); + DestroyTensorDescGrp(); + } protected: void InitResource() override { @@ -195,18 +207,6 @@ class LstmGradDataGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc 
failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dcx_desc_), "destroy dcx_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dhx_desc_), "destroy dhx_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dcy_desc_), "destroy dcy_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dhy_desc_), "destroy dhy_desc_ failed"); - DestroyTensorDescGrp(); - } void CreateTensorDescGrp() { int x_dims[3]{batch_size_, input_size_, 1}; int y_dims[3]{batch_size_, hidden_size_ * (bidirectional_ ? 2 : 1), 1}; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_weight_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_weight_gpu_kernel.h index 66d00f2cf1..81f620c1ad 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_weight_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_weight_gpu_kernel.h @@ -162,6 +162,13 @@ class LstmGradWeightGpuKernel : public GpuKernel { "get workspace size failed"); workspace_size_list_.push_back(workspace_size); } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(dw_desc_), "destroy dw_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed"); + DestroyTensorDescGrp(); + } private: void CreateTensorDescGrp() { @@ -187,13 +194,6 @@ class LstmGradWeightGpuKernel : public GpuKernel { CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed"); } } - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(dw_desc_), "destroy dw_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed"); - DestroyTensorDescGrp(); - } int batch_size_; int seq_len_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_gpu_kernel.h index d2fd897a19..6b3791a5ba 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_gpu_kernel.h @@ -113,6 +113,13 @@ class PoolingGpuFwdKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_), + "cudnnDestroyPoolingDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_descriptor_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "cudnnDestroyTensorDescriptor failed"); + } + protected: void InitResource() { cudnn_handle_ = 
device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -196,12 +203,6 @@ class PoolingGpuFwdKernel : public GpuKernel { 2, windowDimA, paddingA, strideA), "cudnnSetPoolingNdDescriptor failed"); } - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_), - "cudnnDestroyPoolingDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_descriptor_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "cudnnDestroyTensorDescriptor failed"); - } cudnnHandle_t cudnn_handle_; cudnnTensorDescriptor_t input_descriptor_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_grad_gpu_kernel.h index 7772292ff6..a3a8456800 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_grad_gpu_kernel.h @@ -129,6 +129,14 @@ class PoolingGradGpuKernel : public GpuKernel { InitSizeLists(); return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_), + "cudnnDestroyPoolingDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_descriptor_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_descriptor_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_descriptor_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_descriptor_), "cudnnDestroyTensorDescriptor failed"); + } protected: void InitResource() override { @@ -230,14 +238,6 @@ class PoolingGradGpuKernel : public GpuKernel { pad_value_ = kSignedMinFloat; } } - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_), - "cudnnDestroyPoolingDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_descriptor_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_descriptor_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_descriptor_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_descriptor_), "cudnnDestroyTensorDescriptor failed"); - } cudnnHandle_t cudnn_handle_; cudnnPoolingDescriptor_t pooling_descriptor_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_cross_entropy_with_logits_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_cross_entropy_with_logits_gpu_kernel.h index 8a93c4c455..f9246022f4 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_cross_entropy_with_logits_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_cross_entropy_with_logits_gpu_kernel.h @@ -101,6 +101,13 @@ class SoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(softmax_output_descriptor_), + "cudnnDestroyTensorDescriptor failed."); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(logits_descriptor_), + "cudnnDestroyTensorDescriptor failed."); + } + protected: void InitResource() override { cudnn_handle_ = 
device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -118,12 +125,6 @@ class SoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(softmax_output_descriptor_), - "cudnnDestroyTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(logits_descriptor_), - "cudnnDestroyTensorDescriptor failed."); - } void InferInputOutputSize(const CNodePtr &kernel_node) { auto logits_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); is_null_input_ = CHECK_NULL_INPUT(logits_shape); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_gpu_kernel.h index 5ee0769304..597cc49352 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_gpu_kernel.h @@ -140,6 +140,11 @@ class SoftmaxGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_descriptor_), "destroy output_descriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "destroy input_descriptor failed"); + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -159,11 +164,6 @@ class SoftmaxGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_descriptor_), "destroy output_descriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "destroy input_descriptor failed"); - } - void InitSizeByAxis(const std::vector &input_shape, const int &axis) { if (input_shape.size() == 2) { InitSizeByAxis2D(input_shape, axis); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_grad_gpu_kernel.h index c9637c213c..1ea5f5fa79 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_grad_gpu_kernel.h @@ -142,6 +142,10 @@ class SoftmaxGradGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "destroy output_descriptor failed"); + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -161,10 +165,6 @@ class SoftmaxGradGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "destroy output_descriptor failed"); - } - void InitSizeByAxis(const std::vector input_shape, const int axis) { axis_ = axis; if (axis_ < 0) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h index 88ec631017..74b3cbe24b 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h @@ -103,6 +103,13 @@ class SparseSoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { return true; } 
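
The hunks above and below all apply the same refactor: each cuDNN-backed kernel's DestroyResource() (and, where present, ResetResource()) moves from a private helper to a public noexcept override, so code outside the kernel can release descriptors and re-run Init() on the same object when input shapes change; the next hunk continues the pattern for SparseSoftmaxCrossEntropyWithLogits. The sketch below is only a minimal, self-contained illustration of that lifecycle; the class and member names are invented for the example and are not the MindSpore GpuKernel interface.

    #include <iostream>

    // Schematic base class: the real interface is assumed to expose these
    // hooks as virtual methods so a caller can drive resource teardown/reset.
    class KernelBase {
     public:
      virtual ~KernelBase() = default;
      virtual void ResetResource() noexcept = 0;    // back to freshly-constructed state
      virtual void DestroyResource() noexcept = 0;  // release device-side handles
    };

    class DemoKernel : public KernelBase {
     public:
      DemoKernel() { ResetResource(); }             // constructor delegates to ResetResource()
      ~DemoKernel() override { DestroyResource(); }
      void ResetResource() noexcept override { descriptor_ = nullptr; size_ = 0; }
      void DestroyResource() noexcept override {
        // placeholder for cudnnDestroy*Descriptor(...) calls
        descriptor_ = nullptr;
      }
      void Init(int size) { size_ = size; /* create descriptors, size buffers */ }

     private:
      void *descriptor_ = nullptr;
      int size_ = 0;
    };

    int main() {
      DemoKernel k;
      k.Init(32);
      // On a shape change the runtime can re-initialize the same object:
      k.DestroyResource();
      k.ResetResource();
      k.Init(64);
      std::cout << "re-initialized\n";
    }
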
+ void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(softmax_output_descriptor_), + "cudnnDestroyTensorDescriptor failed."); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(logits_descriptor_), + "cudnnDestroyTensorDescriptor failed."); + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -120,12 +127,6 @@ class SparseSoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(softmax_output_descriptor_), - "cudnnDestroyTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(logits_descriptor_), - "cudnnDestroyTensorDescriptor failed."); - } void InferInputOutputSize(const CNodePtr &kernel_node) { auto logits_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); is_null_input_ = CHECK_NULL_INPUT(logits_shape); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold2_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold2_gpu_kernel.h index 2899572c41..f103db4c37 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold2_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold2_gpu_kernel.h @@ -113,8 +113,6 @@ class BatchNormFold2GpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept {} - cudnnHandle_t cudnn_handle_; bool is_null_input_; size_t batch_size_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_gpu_kernel.h index d4a1221b39..ab79aebd02 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_gpu_kernel.h @@ -152,6 +152,11 @@ class BatchNormFoldGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed"); + } + protected: void InitSizeLists() override { // x, mean, variance, current_step @@ -177,11 +182,6 @@ class BatchNormFoldGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed"); - } - size_t input_size_; size_t output_size_; std::vector input_size_list_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_gpu_kernel.h index 20b413da71..85dca5855f 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_gpu_kernel.h @@ -81,8 +81,6 @@ class CorrectionMulGpuKernel : public GpuKernel { void InitResource() override {} private: - void DestroyResource() noexcept {} - size_t batch_size_; size_t channel_; size_t height_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_grad_gpu_kernel.h index 
533266e185..fd1dd95e1e 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_grad_gpu_kernel.h @@ -89,8 +89,6 @@ class CorrectionMulGradGpuKernel : public GpuKernel { void InitResource() override {} private: - void DestroyResource() noexcept {} - size_t batch_size_; size_t channel_; size_t height_; diff --git a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc index 70bb78ef73..1edc9acf0d 100644 --- a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc +++ b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc @@ -237,6 +237,10 @@ void MemSwapManager::SaveUserKernelTopoOrder() { continue; } + if (opt::IsNopNode(user_kernel)) { + continue; + } + size_t user_kernel_topo_sort = SearchKernelExecutionInfo(user_kernel).topo_order_; auto kernel_with_index = AnfAlgo::GetPrevNodeOutput(user_kernel, node_pair.second - 1); auto &output_idx = kernel_with_index.second; diff --git a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc index 027e088200..f4f19b6ff3 100644 --- a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc +++ b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc @@ -50,6 +50,10 @@ bool IsShapeDynamic(const abstract::ShapePtr &shape) { return std::any_of(shape->shape().begin(), shape->shape().end(), [](int s) { return s < 0; }); } +bool IsShapeDynamic(const std::vector &shape) { + return std::any_of(shape.begin(), shape.end(), [](int s) { return s < 0; }); +} + std::vector TransShapeToSizet(const abstract::ShapePtr &shape) { MS_EXCEPTION_IF_NULL(shape); std::vector shape_size_t; @@ -1389,5 +1393,29 @@ bool AnfRuntimeAlgorithm::IsNodeDynamicShape(const AnfNodePtr &node) { } return false; } + +std::vector AnfRuntimeAlgorithm::GetInputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index) { + auto device_shape = GetInputDeviceShape(anf_node, index); + // Initialize GPUKernel with max shape to fit 'InitDynamicOutputKernelRef()' for memory reuse. + if (IsShapeDynamic(device_shape)) { + auto max_shape = GetInputMaxShape(anf_node, index); + std::transform(max_shape.begin(), max_shape.end(), device_shape.begin(), IntToSize); + auto format = GetInputFormat(anf_node, index); + trans::TransShapeToDevice(device_shape, format); + } + return device_shape; +} + +std::vector AnfRuntimeAlgorithm::GetOutputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index) { + auto device_shape = GetOutputDeviceShape(anf_node, index); + // Initialize GPUKernel with max shape to fit 'InitDynamicOutputKernelRef()' for memory reuse. 
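
GetInputRealDeviceShapeIfExist and GetOutputRealDeviceShapeIfExist (the body of the latter continues directly below) fall back to the node's registered max shape whenever the device shape still contains dynamic dimensions, so GPU kernels and their memory refs are sized for the worst case up front. A compilable sketch of that fallback, using stand-in names rather than the AnfRuntimeAlgorithm API, looks like this:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Stand-in for the "real device shape if exist" idea: if any dimension is
    // still unknown (encoded as a negative value), substitute the per-dimension
    // maximum so buffers can be allocated once and reused across shape changes.
    std::vector<int64_t> RealShapeIfExist(const std::vector<int64_t> &inferred,
                                          const std::vector<int64_t> &max_shape) {
      bool is_dynamic = std::any_of(inferred.begin(), inferred.end(),
                                    [](int64_t d) { return d < 0; });
      return is_dynamic ? max_shape : inferred;
    }

    int main() {
      std::vector<int64_t> inferred{-1, 128};   // batch dimension unknown at compile time
      std::vector<int64_t> max_shape{64, 128};  // upper bound provided by the frontend
      for (int64_t d : RealShapeIfExist(inferred, max_shape)) std::cout << d << ' ';
      std::cout << '\n';  // prints: 64 128
    }
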
+ if (IsShapeDynamic(device_shape)) { + auto max_shape = GetOutputMaxShape(anf_node, index); + std::transform(max_shape.begin(), max_shape.end(), device_shape.begin(), IntToSize); + auto format = GetOutputFormat(anf_node, index); + trans::TransShapeToDevice(device_shape, format); + } + return device_shape; +} } // namespace session } // namespace mindspore diff --git a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h index 81d906f1d1..3f9ef917e3 100644 --- a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h +++ b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h @@ -230,6 +230,8 @@ class AnfRuntimeAlgorithm { static std::vector GetOutputMaxShape(const AnfNodePtr &anf_node, size_t index); static std::vector GetOutputMinShape(const AnfNodePtr &anf_node, size_t index); static bool IsNodeDynamicShape(const AnfNodePtr &node); + static std::vector GetInputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index); + static std::vector GetOutputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index); }; } // namespace session using AnfAlgo = session::AnfRuntimeAlgorithm; diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc index 295707a2b3..c5267a8d1d 100644 --- a/mindspore/ccsrc/backend/session/gpu_session.cc +++ b/mindspore/ccsrc/backend/session/gpu_session.cc @@ -306,7 +306,9 @@ GraphId GPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr if (save_graphs) { DumpIRProto(graph, "before_removeNop_" + std::to_string(graph_id)); } - + // Update Graph Dynamic Shape Attr. + UpdateGraphDynamicShapeAttr(NOT_NULL(graph)); + graph->UpdateGraphDynamicAttr(); // Hide NopOp from execution graph opt::HideNopNode(graph.get()); // Build kernel if node is cnode @@ -317,13 +319,10 @@ GraphId GPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr graph->set_execution_order(execution_order); // Get summary nodes. SetSummaryNodes(graph.get()); - // Remove NopOp from execution graph - opt::RemoveNopNode(graph.get()); // Dump .pb graph after graph optimization if (save_graphs) { DumpIRProto(graph, "after_opt_" + std::to_string(graph_id)); } - // Set graph manager. 
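
With the session changes just above, the GPU compile path now tags the graph for dynamic shape before kernel build and keeps NopNodes in the execution graph (they are hidden rather than removed, which is why MemSwapManager earlier skips them when walking user kernels). The MindSpore helpers themselves are not reproduced here; the sketch below only illustrates the kind of marking pass this implies, with made-up node and graph types.

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Hypothetical node/graph types, used only to illustrate a "mark dynamic" pass.
    struct Node {
      std::vector<int64_t> shape;
      bool is_dynamic = false;
    };
    struct Graph {
      std::vector<Node> nodes;
      bool has_dynamic_node = false;
    };

    // Assumed rule: a node is dynamic if any dimension is unknown (< 0),
    // and the graph is dynamic if any of its nodes is.
    void UpdateDynamicAttr(Graph *g) {
      for (auto &n : g->nodes) {
        for (int64_t d : n.shape) {
          if (d < 0) { n.is_dynamic = true; break; }
        }
        g->has_dynamic_node |= n.is_dynamic;
      }
    }

    int main() {
      Graph g;
      g.nodes.push_back({{2, -1, 8}});
      g.nodes.push_back({{4, 4}});
      UpdateDynamicAttr(&g);
      std::cout << std::boolalpha << g.has_dynamic_node << '\n';  // true
    }
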
MS_EXCEPTION_IF_NULL(context_); FuncGraphManagerPtr manager = MakeManager({graph}); diff --git a/mindspore/ccsrc/runtime/device/CMakeLists.txt b/mindspore/ccsrc/runtime/device/CMakeLists.txt index b74dce0523..37ce7a93c3 100644 --- a/mindspore/ccsrc/runtime/device/CMakeLists.txt +++ b/mindspore/ccsrc/runtime/device/CMakeLists.txt @@ -1,5 +1,5 @@ file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*.cc" - "kernel_info.cc" "executor/dynamic_kernel.cc" "kernel_runtime.cc" "memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc" + "kernel_info.cc" "executor/dynamic_kernel.cc" "executor/executor_callback.cc" "kernel_runtime.cc" "memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc" ) if (ENABLE_GPU) diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc index dc3b5adaf4..32cfcc8469 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc @@ -48,7 +48,7 @@ #include "backend/optimizer/mem_reuse/mem_reuse_checker.h" #endif #include "runtime/device/ascend/executor/tiling/op_tiling_calculater.h" -#include "runtime/device/ascend/executor/executor_callback.h" +#include "runtime/device/executor/executor_callback.h" #include "runtime/device/ascend/executor/hccl_dynamic_kernel.h" #include "profiler/device/ascend/ascend_profiling.h" #include "profiler/device/ascend/profiling_context.h" diff --git a/mindspore/ccsrc/runtime/device/ascend/executor/ai_cpu_dynamic_kernel.cc b/mindspore/ccsrc/runtime/device/ascend/executor/ai_cpu_dynamic_kernel.cc index be249bda7c..2f58516ec7 100644 --- a/mindspore/ccsrc/runtime/device/ascend/executor/ai_cpu_dynamic_kernel.cc +++ b/mindspore/ccsrc/runtime/device/ascend/executor/ai_cpu_dynamic_kernel.cc @@ -22,7 +22,7 @@ #include "runtime/kernel.h" #include "backend/session/anf_runtime_algorithm.h" #include "backend/kernel_compiler/aicpu/aicpu_util.h" -#include "runtime/device/ascend/executor/executor_callback.h" +#include "runtime/device/executor/executor_callback.h" namespace mindspore { namespace device { diff --git a/mindspore/ccsrc/runtime/device/ascend/executor/executor_callback.cc b/mindspore/ccsrc/runtime/device/executor/executor_callback.cc similarity index 92% rename from mindspore/ccsrc/runtime/device/ascend/executor/executor_callback.cc rename to mindspore/ccsrc/runtime/device/executor/executor_callback.cc index 33d4bb08e0..fb7dbc5168 100644 --- a/mindspore/ccsrc/runtime/device/ascend/executor/executor_callback.cc +++ b/mindspore/ccsrc/runtime/device/executor/executor_callback.cc @@ -14,12 +14,11 @@ * limitations under the License. 
*/ -#include "runtime/device/ascend/executor/executor_callback.h" +#include "runtime/device/executor/executor_callback.h" #include "utils/log_adapter.h" namespace mindspore { namespace device { -namespace ascend { void ExecutorCallback::RegistCallback(const std::function &callback) { std::lock_guard guard(lock_); callback_queue_.push(callback); @@ -36,6 +35,5 @@ void ExecutorCallback::Consume() { callback_func(); } } -} // namespace ascend } // namespace device } // namespace mindspore diff --git a/mindspore/ccsrc/runtime/device/ascend/executor/executor_callback.h b/mindspore/ccsrc/runtime/device/executor/executor_callback.h similarity index 79% rename from mindspore/ccsrc/runtime/device/ascend/executor/executor_callback.h rename to mindspore/ccsrc/runtime/device/executor/executor_callback.h index 2994f9b70e..01c6793b47 100644 --- a/mindspore/ccsrc/runtime/device/ascend/executor/executor_callback.h +++ b/mindspore/ccsrc/runtime/device/executor/executor_callback.h @@ -14,8 +14,8 @@ * limitations under the License. */ -#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_EXECUTOR_EXECUTOR_CALLBACK_H_ -#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_EXECUTOR_EXECUTOR_CALLBACK_H_ +#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_EXECUTOR_EXECUTOR_CALLBACK_H_ +#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_EXECUTOR_EXECUTOR_CALLBACK_H_ #include #include @@ -24,7 +24,6 @@ namespace mindspore { namespace device { -namespace ascend { class ExecutorCallback { public: static ExecutorCallback &GetInstance() { @@ -43,7 +42,6 @@ class ExecutorCallback { std::queue> callback_queue_; std::mutex lock_; }; -} // namespace ascend } // namespace device } // namespace mindspore -#endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_EXECUTOR_EXECUTOR_CALLBACK_H_ +#endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_EXECUTOR_EXECUTOR_CALLBACK_H_ diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_build.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_build.cc index 00c56b5279..3c6f69bab8 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_build.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_build.cc @@ -67,6 +67,8 @@ void GpuBuild(const KernelGraphPtr &kernel_graph) { if (!gpu_kernel_ptr->Init(kernel)) { MS_LOG(EXCEPTION) << "Initialize gpu kernel op[" << kernel->fullname_with_scope() << "] failed."; } + gpu_kernel_ptr->InitDynamicKernel(kernel); + gpu_kernel_ptr->DynamicKernel()->Initialize(); session::AnfRuntimeAlgorithm::SetKernelMod((kernel::KernelModPtr)gpu_kernel_ptr, kernel.get()); } } diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc index fb7fd4f602..bebe38c2ac 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc @@ -36,6 +36,8 @@ #include "profiler/device/gpu/gpu_profiling.h" #include "utils/shape_utils.h" #include "debug/data_dump/dump_json_parser.h" +#include "backend/kernel_compiler/gpu/gpu_kernel.h" +#include "runtime/device/executor/executor_callback.h" #ifdef ENABLE_DEBUGGER #include "debug/debug_services.h" #endif @@ -588,6 +590,29 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo MS_LOG(INFO) << "[inplace optimizer] skip node: " << kernel->DebugString(); continue; } + + // akg kernel do not support dynamic shape by now. 
+ device::DynamicKernelPtr dynamic_kernel = nullptr; + kernel::GpuKernel *gpu_kernel = nullptr; + if (session::AnfRuntimeAlgorithm::GetKernelType(kernel) != KernelType::AKG_KERNEL) { + gpu_kernel = dynamic_cast(kernel_mod); + dynamic_kernel = gpu_kernel->DynamicKernel(); + } + + if (dynamic_kernel && dynamic_kernel->have_depends()) { + MS_LOG(INFO) << "Match Dynamic Kernel, Start SyncStream"; + if (!SyncStream()) { + MS_LOG(ERROR) << "SyncStream failed"; + return false; + } + } + + if (dynamic_kernel && dynamic_kernel->is_dynamic_shape()) { + ExecutorCallback::GetInstance().Consume(); + dynamic_kernel->InferShape(); + dynamic_kernel->UpdateArgs(); + } + AddressPtrList kernel_inputs; AddressPtrList kernel_workspaces; AddressPtrList kernel_outputs; @@ -615,6 +640,10 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo } else { LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs); } + + ExecutorCallback::GetInstance().RegistCallback([&gpu_kernel] { + if (gpu_kernel) gpu_kernel->PostExecute(); + }); // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost) LoadKernelData(debugger_.get(), kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_, dump_enabled); @@ -633,6 +662,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo // collect weights and bias for dump mode debugger_->LoadParametersAndConst(); CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed."); + ExecutorCallback::GetInstance().Consume(); } ClearSwapInfo(mock); return true; diff --git a/tests/ut/cpp/stub/dynamic_shape/dynamic_shape_stub.cc b/tests/ut/cpp/stub/dynamic_shape/dynamic_shape_stub.cc index 4c54ef16d4..5560abd0a6 100644 --- a/tests/ut/cpp/stub/dynamic_shape/dynamic_shape_stub.cc +++ b/tests/ut/cpp/stub/dynamic_shape/dynamic_shape_stub.cc @@ -19,12 +19,19 @@ #include "runtime/device/ascend/executor/rts/profiling_rts_dynamic_kernel.h" #include "runtime/device/ascend/executor/ai_core_dynamic_kernel.h" #include "profiler/device/ascend/rt_callback_manager.h" -#include "runtime/device/ascend/executor/executor_callback.h" +#include "runtime/device/executor/executor_callback.h" #include "profiler/device/ascend/ascend_profiling.h" #include "runtime/device/ascend/executor/tiling/op_tiling_calculater.h" #include "backend/kernel_compiler/host/host_kernel_metadata.h" #include "backend/kernel_compiler/host/host_kernel_build.h" +namespace mindspore { +namespace device { +void ExecutorCallback::RegistCallback(const std::function &callback) {} +void ExecutorCallback::Consume() {} +} // namespace device +} // namespace mindspore + namespace mindspore { namespace device { namespace ascend { @@ -45,13 +52,11 @@ void AiCoreDynamicKernel::PostExecute() {} bool HcclExecutorManager::Initialize() { return true; } bool HcclExecutorManager::Finalize() { return true; } -void ExecutorCallback::RegistCallback(const std::function &callback) {} -void ExecutorCallback::Consume() {} - void OpTilingCalculater::Init() {} -void OpTilingCalculater::CalculateTiling(const NotNull &cnode, const NotNull> &compile_info_json, - const std::map &depend_tensor_map, - NotNull op_run_info) {} +void OpTilingCalculater::CalculateTiling(const NotNull &cnode, + const NotNull> &compile_info_json, + const std::map &depend_tensor_map, + NotNull op_run_info) {} } // namespace ascend } // namespace device } // namespace mindspore
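
Taken together, the GPU runtime hunk above makes LaunchKernelDynamic interleave deferred shape work with kernel launches: pending PostExecute callbacks are consumed before a dynamic kernel infers its shape and updates its arguments, the kernel is launched, and its own PostExecute is queued to run after the stream sync. The sketch below mirrors only that ordering with stub types; it is not the runtime code and the names are illustrative.

    #include <functional>
    #include <iostream>
    #include <queue>
    #include <vector>

    // Stand-ins for the pieces involved; names are invented, not MindSpore APIs.
    struct DynKernel {
      bool dynamic = true;
      void InferShape() { std::cout << "infer shape\n"; }
      void UpdateArgs() { std::cout << "update args\n"; }
      void Launch() { std::cout << "launch\n"; }
      void PostExecute() { std::cout << "post execute (e.g. read back output shape)\n"; }
    };

    int main() {
      std::queue<std::function<void()>> deferred;  // plays the role of ExecutorCallback
      std::vector<DynKernel> kernels(2);

      for (auto &k : kernels) {
        if (k.dynamic) {
          // Earlier PostExecute hooks must run first: a prior kernel's real
          // output shape may feed this kernel's shape inference.
          while (!deferred.empty()) { deferred.front()(); deferred.pop(); }
          k.InferShape();
          k.UpdateArgs();
        }
        k.Launch();
        deferred.push([&k] { k.PostExecute(); });  // deferred until after stream sync
      }
      while (!deferred.empty()) { deferred.front()(); deferred.pop(); }  // final drain
    }
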