diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/array_reduce_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/array_reduce_gpu_kernel.h
index e615b2b049..09df5021b5 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/array_reduce_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/array_reduce_gpu_kernel.h
@@ -35,21 +35,7 @@ const std::map<std::string, cudnnReduceTensorOp_t> kReduceTypeMap = {
 template <typename T>
 class ArrayReduceGpuKernel : public GpuKernel {
  public:
-  ArrayReduceGpuKernel()
-      : cudnn_handle_(nullptr),
-        reduce_tensor_op_(CUDNN_REDUCE_TENSOR_ADD),
-        data_type_(CUDNN_DATA_FLOAT),
-        nan_prop_(CUDNN_NOT_PROPAGATE_NAN),
-        reduce_indices_(CUDNN_REDUCE_TENSOR_NO_INDICES),
-        reduce_tensor_descriptor_(nullptr),
-        inputA_descriptor_(nullptr),
-        outputC_descriptor_(nullptr),
-        keep_dims_(false),
-        all_match_(false),
-        is_null_input_(false),
-        input_size_(0),
-        output_size_(0),
-        workspace_size_(0) {}
+  ArrayReduceGpuKernel() { ResetResource(); }
   ~ArrayReduceGpuKernel() override { DestroyResource(); }

   const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -94,7 +80,7 @@ class ArrayReduceGpuKernel : public GpuKernel {
       MS_LOG(ERROR) << "Output number is " << output_num << ", but reduce op needs 1 output.";
       return false;
     }
-    int input_dim_length = SizeToInt(AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0).size());
+    int input_dim_length = SizeToInt(AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0).size());

     if (AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("axis")->isa<ValueTuple>() ||
         AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("axis")->isa<ValueList>()) {
@@ -117,8 +103,8 @@ class ArrayReduceGpuKernel : public GpuKernel {
     }
     keep_dims_ = GetAttr<bool>(kernel_node, "keep_dims");

-    auto inputA_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
-    auto outputC_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
+    auto inputA_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
+    auto outputC_shape = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
     is_null_input_ = CHECK_NULL_INPUT(inputA_shape);
     if (is_null_input_) {
       MS_LOG(WARNING) << "ArrayReduceGpuKernel input is null";
@@ -132,6 +118,35 @@ class ArrayReduceGpuKernel : public GpuKernel {
     return true;
   }

+  void ResetResource() noexcept override {
+    cudnn_handle_ = nullptr;
+    reduce_tensor_op_ = CUDNN_REDUCE_TENSOR_ADD;
+    data_type_ = CUDNN_DATA_FLOAT;
+    nan_prop_ = CUDNN_NOT_PROPAGATE_NAN;
+    reduce_indices_ = CUDNN_REDUCE_TENSOR_NO_INDICES;
+    reduce_tensor_descriptor_ = nullptr;
+    inputA_descriptor_ = nullptr;
+    outputC_descriptor_ = nullptr;
+    keep_dims_ = false;
+    all_match_ = false;
+    is_null_input_ = false;
+    input_size_ = 0;
+    output_size_ = 0;
+    workspace_size_ = 0;
+    input_size_list_.clear();
+    output_size_list_.clear();
+    workspace_size_list_.clear();
+  }
+
+  void DestroyResource() noexcept override {
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_),
+                               "cudnnDestroyReduceTensorDescriptor failed.");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(inputA_descriptor_),
+                               "cudnnDestroyTensorDescriptor failed.");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(outputC_descriptor_),
+                               "cudnnDestroyTensorDescriptor failed.");
+  }
+
 protected:
   void InitResource() override {
     cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -160,14 +175,6 @@ class ArrayReduceGpuKernel : public GpuKernel {
   }

 private:
-  void DestroyResource() noexcept {
-
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_), - "cudnnDestroyReduceTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(inputA_descriptor_), - "cudnnDestroyTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(outputC_descriptor_), - "cudnnDestroyTensorDescriptor failed."); - } void InferArrayReduceType(const CNodePtr &kernel_node) { std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); auto iter = kReduceTypeMap.find(kernel_name); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.cc index 7a13d05d38..a54627b41f 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.cc @@ -26,5 +26,14 @@ MS_REG_GPU_KERNEL_TWO( GatherV2, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16), GatherV2GpuFwdKernel, half, int) + +MS_REG_GPU_KERNEL_TWO( + SparseGatherV2, + KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32), + GatherV2GpuFwdKernel, float, int) +MS_REG_GPU_KERNEL_TWO( + SparseGatherV2, + KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16), + GatherV2GpuFwdKernel, half, int) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.h index d5649bb0d4..81136ff4fe 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.h @@ -27,7 +27,7 @@ namespace kernel { template class GatherV2GpuFwdKernel : public GpuKernel { public: - GatherV2GpuFwdKernel() : axis_(0), handle_(nullptr) {} + GatherV2GpuFwdKernel() { ResetResource(); } ~GatherV2GpuFwdKernel() = default; const std::vector &GetInputSizeList() const override { return input_size_list_; } @@ -52,9 +52,9 @@ class GatherV2GpuFwdKernel : public GpuKernel { if (input_num != 2) { MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but GatherGpuV2FwdKernel needs 2."; } - input_shapes_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); - indices_shapes_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); - output_shapes_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); + input_shapes_ = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); + indices_shapes_ = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1); + output_shapes_ = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0); axis_ = static_cast(GetAttr(kernel_node, "axis")); if (axis_ < 0) { @@ -65,9 +65,18 @@ class GatherV2GpuFwdKernel : public GpuKernel { InitSizeLists(); return true; } + void ResetResource() noexcept override { + input_shapes_.clear(); + indices_shapes_.clear(); + output_shapes_.clear(); + std::fill(dims_, dims_ + 3, 0); + axis_ = 0; + input_size_list_.clear(); + output_size_list_.clear(); + workspace_size_list_.clear(); + } protected: - void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); } void InitSizeLists() override { size_t size = GetSize(input_shapes_); input_size_list_.push_back(size); @@ -118,7 +127,6 @@ class GatherV2GpuFwdKernel : public 
GpuKernel { size_t dims_[3] = {}; int axis_; - cudnnHandle_t handle_; std::vector input_size_list_; std::vector output_size_list_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h index 124a7734a2..a487e5febc 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h @@ -28,14 +28,7 @@ namespace kernel { template class SplitGpuFwdKernel : public GpuKernel { public: - SplitGpuFwdKernel() - : axis_(0), - output_num_(1), - input_size_(1), - axis_step_(1), - all_size_before_axis_(1), - all_size_axis_(1), - outputs_host_(nullptr) {} + SplitGpuFwdKernel() { ResetResource(); } ~SplitGpuFwdKernel() override = default; const std::vector &GetInputSizeList() const override { return input_size_list_; } const std::vector &GetOutputSizeList() const override { return output_size_list_; } @@ -59,7 +52,7 @@ class SplitGpuFwdKernel : public GpuKernel { bool Init(const CNodePtr &kernel_node) override { axis_ = static_cast(GetAttr(kernel_node, "axis")); if (axis_ < 0) { - auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); axis_ += SizeToInt(input_shape.size()); } output_num_ = static_cast(GetAttr(kernel_node, "output_num")); @@ -68,7 +61,7 @@ class SplitGpuFwdKernel : public GpuKernel { return false; } - auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); input_size_ = 1; all_size_before_axis_ = 1; all_size_axis_ = 1; @@ -88,7 +81,7 @@ class SplitGpuFwdKernel : public GpuKernel { for (int i = 0; i < output_num_; i++) { size_t output_size = 1; - auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, i); + auto output_shape = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, i); for (size_t j = 0; j < output_shape.size(); j++) { output_size *= output_shape[j]; } @@ -100,6 +93,19 @@ class SplitGpuFwdKernel : public GpuKernel { return true; } + void ResetResource() noexcept override { + axis_ = 0; + output_num_ = 1; + input_size_ = 1; + axis_step_ = 1; + all_size_before_axis_ = 1; + all_size_axis_ = 1; + outputs_host_ = nullptr; + input_size_list_.clear(); + output_size_list_.clear(); + workspace_size_list_.clear(); + } + protected: void InitSizeLists() override {} diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/transpose_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/transpose_gpu_kernel.h index 7bb0a6b77a..4a5fa2d0ae 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/transpose_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/transpose_gpu_kernel.h @@ -62,7 +62,7 @@ class TransposeGpuFwdKernel : public GpuKernel { MS_LOG(ERROR) << "Output number is " << output_num << ", but transpose needs 1 output."; return false; } - auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); + auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); shape_size_ = input_shape.size(); if (shape_size_ > TRANSPOSE_MAX_DIMENSION) { MS_LOG(EXCEPTION) << "Input is " << shape_size_ << "-D, but transpose supports max " << TRANSPOSE_MAX_DIMENSION diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_sum_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_sum_gpu_kernel.h index 
32527af5c6..8db9db50b8 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_sum_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_sum_gpu_kernel.h
@@ -27,8 +27,7 @@ namespace kernel {
 template <typename T, typename S>
 class UnsortedSegmentSumGpuKernel : public GpuKernel {
  public:
-  UnsortedSegmentSumGpuKernel()
-      : input_dim0_(1), input_dim1_(1), output_dim0_(1), output_dim1_(1), is_null_input_(false) {}
+  UnsortedSegmentSumGpuKernel() { ResetResource(); }
   ~UnsortedSegmentSumGpuKernel() override = default;

   const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -53,15 +52,15 @@ class UnsortedSegmentSumGpuKernel : public GpuKernel {
   }

   bool Init(const CNodePtr &kernel_node) override {
-    auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+    auto input_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
     is_null_input_ = CHECK_NULL_INPUT(input_shapes);
     if (is_null_input_) {
       MS_LOG(WARNING) << "UnsortedSegmentSum input is null";
       InitSizeLists();
       return true;
     }
-    auto ids_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
-    auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0);
+    auto ids_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
+    auto output_shapes = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);

     auto axis = ids_shapes.size();
     for (size_t i = 0; i < input_shapes.size(); i++) {
@@ -81,6 +80,17 @@ class UnsortedSegmentSumGpuKernel : public GpuKernel {
     return true;
   }

+  void ResetResource() noexcept override {
+    input_dim0_ = 1;
+    input_dim1_ = 1;
+    output_dim0_ = 1;
+    output_dim1_ = 1;
+    is_null_input_ = false;
+    input_size_list_.clear();
+    output_size_list_.clear();
+    workspace_size_list_.clear();
+  }
+
 protected:
   void InitSizeLists() override {
     input_size_list_.push_back(input_dim0_ * input_dim1_ * sizeof(T));
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.cc
new file mode 100644
index 0000000000..d5a93373f0
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.cc
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/gpu/gpu_kernel.h"
+
+namespace mindspore {
+namespace kernel {
+void GpuDynamicKernel::UpdateArgs() {
+  if (!is_input_dynamic_shape_ && is_output_dynamic_shape_ && !have_depends()) {
+    return;
+  }
+
+  MS_LOG(INFO) << "Update Args: " << cnode_ptr_->fullname_with_scope();
+  auto kernel_mod = AnfAlgo::GetKernelMod(cnode_ptr_);
+  MS_EXCEPTION_IF_NULL(kernel_mod);
+  auto gpu_kernel_mod = dynamic_cast<GpuKernel *>(kernel_mod);
+  MS_EXCEPTION_IF_NULL(gpu_kernel_mod);
+  gpu_kernel_mod->DestroyResource();
+  gpu_kernel_mod->ResetResource();
+  gpu_kernel_mod->Init(cnode_ptr_);
+}
+}  // namespace kernel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h
index 81d86e22a6..54ad8b99ea 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h
@@ -23,11 +23,13 @@
 #include
 #include
 #include
+#include
 #include "backend/kernel_compiler/kernel.h"
 #include "backend/kernel_compiler/gpu/kernel_constants.h"
 #include "runtime/device/gpu/gpu_device_manager.h"
 #include "runtime/device/gpu/gpu_common.h"
 #include "backend/session/anf_runtime_algorithm.h"
+#include "runtime/device/executor/dynamic_kernel.h"
 using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;

 namespace mindspore {
@@ -45,10 +47,28 @@ static std::map<size_t, size_t> kNHWCToNCHWAxisMap = {
   {3, 1},
 };

+class GpuDynamicKernel : public device::DynamicKernel {
+ public:
+  explicit GpuDynamicKernel(const CNodePtr &cnode_ptr) : DynamicKernel(nullptr, cnode_ptr) {}
+  ~GpuDynamicKernel() = default;
+
+  void UpdateArgs() override;
+  void PostExecute() final { MS_LOG(EXCEPTION) << "`PostExecute()` should not be invoked with gpu backend"; }
+  void Execute() final { MS_LOG(EXCEPTION) << "`Execute()` should not be invoked with gpu backend"; }
+};
+
 class GpuKernel : public KernelMod {
  public:
   virtual ~GpuKernel() = default;
   virtual bool Init(const CNodePtr &kernel_node) = 0;
+  virtual void ResetResource() noexcept {
+    MS_LOG(EXCEPTION) << "kernel must override the `ResetResource()` method when it supports dynamic shape";
+  }
+  virtual void DestroyResource() noexcept {}
+  virtual void PostExecute() {}
+
+  void InitDynamicKernel(const CNodePtr &cnode_ptr) { dynamic_kernel_ = std::make_shared<GpuDynamicKernel>(cnode_ptr); }
+  device::DynamicKernelPtr DynamicKernel() const { return dynamic_kernel_; }

 protected:
   virtual void InitResource() {}
@@ -228,7 +248,10 @@ class GpuKernel : public KernelMod {
     }
     return type->second;
   }
+
+  device::DynamicKernelPtr dynamic_kernel_;
 };
+

 }  // namespace kernel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h
index 5379ed9534..c1a878c1df 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h
@@ -123,6 +123,10 @@ class AddNGpuFwdKernel : public GpuKernel {
     return true;
   }

+  void DestroyResource() noexcept override {
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "cudnnDestroyTensorDescriptor failed");
+  }
+
 protected:
   void InitResource() override {
     cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -141,9 +145,6 @@ class AddNGpuFwdKernel : public GpuKernel {
   }

 private:
-  void DestroyResource() noexcept {
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_),
"cudnnDestroyTensorDescriptor failed"); - } cudnnHandle_t cudnn_handle_; cudnnTensorDescriptor_t input_descriptor_; cudnnDataType_t cudnn_data_type_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/bias_add_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/bias_add_gpu_kernel.h index 6f0529ab72..d43d07970d 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/bias_add_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/bias_add_gpu_kernel.h @@ -112,6 +112,12 @@ class BiasAddGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyOpTensorDescriptor(op_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(b_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyOpTensorDescriptor failed"); + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -129,12 +135,6 @@ class BiasAddGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyOpTensorDescriptor(op_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(b_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyOpTensorDescriptor failed"); - } - cudnnHandle_t cudnn_handle_; cudnnDataType_t cudnn_data_type_; cudnnTensorDescriptor_t x_desc_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h index 18851e9622..6ee45e7624 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h @@ -31,13 +31,7 @@ constexpr int MAX_DIMS = 7; template class BroadcastOpGpuKernel : public GpuKernel { public: - BroadcastOpGpuKernel() - : op_type_(BROADCAST_TYPE_INVALID), - need_broadcast_(false), - is_comp_op_(false), - input1_num_(1), - input2_num_(1), - output_num_(1) {} + BroadcastOpGpuKernel() { ResetResource(); } ~BroadcastOpGpuKernel() override = default; const std::vector &GetInputSizeList() const override { return input_size_list_; } @@ -71,9 +65,9 @@ class BroadcastOpGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { GetOpType(kernel_node); - auto shape1 = AnfAlgo::GetInputDeviceShape(kernel_node, 0); - auto shape2 = AnfAlgo::GetInputDeviceShape(kernel_node, 1); - auto shape3 = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); + auto shape1 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); + auto shape2 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1); + auto shape3 = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0); need_broadcast_ = IsBroadcast(shape1, shape2); if (need_broadcast_ && shape1.size() > 7) { MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than 7"; @@ -106,6 +100,20 @@ class BroadcastOpGpuKernel : public GpuKernel { InitSizeLists(); return true; } + void ResetResource() noexcept override { + op_type_ = BROADCAST_TYPE_INVALID; + need_broadcast_ = false; + is_comp_op_ = false; + input1_num_ = 1; + input2_num_ = 1; + output_num_ = 1; + lhs_shape_.clear(); + rhs_shape_.clear(); + output_shape_.clear(); + input_size_list_.clear(); + output_size_list_.clear(); + 
workspace_size_list_.clear(); + } protected: void InitResource() override { return; } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_grad_gpu_kernel.h index c6e9c90373..0b1d262a84 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_grad_gpu_kernel.h @@ -30,14 +30,7 @@ namespace kernel { template class BroadcastOpGradGpuKernel : public GpuKernel { public: - BroadcastOpGradGpuKernel() - : op_type_(BROADCAST_GRAD_TYPE_INVALID), - need_broadcast_(false), - input1_num_(1), - input2_num_(1), - output_num_(1), - grad_x_(false), - grad_y_(false) {} + BroadcastOpGradGpuKernel() { ResetResource(); } ~BroadcastOpGradGpuKernel() override = default; const std::vector &GetInputSizeList() const override { return input_size_list_; } @@ -105,6 +98,22 @@ class BroadcastOpGradGpuKernel : public GpuKernel { return true; } + void ResetResource() noexcept override { + op_type_ = BROADCAST_GRAD_TYPE_INVALID; + need_broadcast_ = false; + input1_num_ = 1; + input2_num_ = 1; + output_num_ = 1; + std::fill(x1_shape_, x1_shape_ + 4, 1); + std::fill(x2_shape_, x2_shape_ + 4, 1); + std::fill(dy_shape_, dy_shape_ + 4, 1); + grad_x_ = false; + grad_y_ = false; + input_size_list_.clear(); + output_size_list_.clear(); + workspace_size_list_.clear(); + } + protected: void InitResource() override { return; } void InitSizeLists() override { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_gpu_kernel.h index 591768a1b9..bf94c0bfcd 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_gpu_kernel.h @@ -69,21 +69,15 @@ static const std::map kUnaryOpTypeMap = {{"Exp", UNARY template class UnaryOpGpuKernel : public GpuKernel { public: - UnaryOpGpuKernel() - : unary_op_type_(UNARY_OP_INVALID_TYPE), - input_size_(sizeof(T)), - output_size_(sizeof(T)), - workspace_size_(0), - is_null_input_(false) {} + UnaryOpGpuKernel() { ResetResource(); } ~UnaryOpGpuKernel() override = default; const std::vector &GetInputSizeList() const override { return input_size_list_; } const std::vector &GetOutputSizeList() const override { return output_size_list_; } const std::vector &GetWorkspaceSizeList() const override { return workspace_size_list_; } - bool Launch(const std::vector &inputs, const std::vector &workspace, + bool Launch(const std::vector &inputs, const std::vector &, const std::vector &outputs, void *stream_ptr) override { - VARIABLE_NOT_USED(workspace); T *input_addr = GetDeviceAddress(inputs, 0); T *output_addr = GetDeviceAddress(outputs, 0); @@ -184,7 +178,7 @@ class UnaryOpGpuKernel : public GpuKernel { MS_LOG(ERROR) << "Output number is " << output_num << ", but unary op needs 1 output."; return false; } - auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); is_null_input_ = CHECK_NULL_INPUT(input_shape); if (is_null_input_) { MS_LOG(WARNING) << "UnaryOpGpuKernel input is null"; @@ -198,6 +192,16 @@ class UnaryOpGpuKernel : public GpuKernel { InitSizeLists(); return true; } + void ResetResource() noexcept override { + unary_op_type_ = UNARY_OP_INVALID_TYPE; + input_size_ = sizeof(T); + output_size_ = sizeof(T); + workspace_size_ = 0; + is_null_input_ 
= false; + input_size_list_.clear(); + output_size_list_.clear(); + workspace_size_list_.clear(); + } protected: void InitSizeLists() override { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_gpu_kernel.h index 2529cfc8ac..f6934d0416 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_gpu_kernel.h @@ -29,16 +29,7 @@ namespace kernel { template class ActivationGpuFwdKernel : public GpuKernel { public: - ActivationGpuFwdKernel() - : cudnn_handle_(nullptr), - activation_desc_(nullptr), - mode_(CUDNN_ACTIVATION_RELU), - data_descriptor_(nullptr), - is_null_input_(false), - cudnn_data_type_(CUDNN_DATA_FLOAT), - input_size_(0), - output_size_(0), - workspace_size_(0) {} + ActivationGpuFwdKernel() { ResetResource(); } ~ActivationGpuFwdKernel() override { DestroyResource(); } const std::vector &GetInputSizeList() const override { return input_size_list_; } const std::vector &GetOutputSizeList() const override { return output_size_list_; } @@ -75,7 +66,7 @@ class ActivationGpuFwdKernel : public GpuKernel { MS_LOG(ERROR) << "Argument number is " << input_num << ", but ActivationGpuFwdKernel needs 1."; return false; } - auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); + auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); is_null_input_ = CHECK_NULL_INPUT(input_shape); if (is_null_input_) { MS_LOG(WARNING) << "ActivationGpuFwdKernel input is null."; @@ -113,6 +104,27 @@ class ActivationGpuFwdKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), + "cudnnDestroyActivationDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed"); + } + + void ResetResource() noexcept override { + cudnn_handle_ = nullptr; + activation_desc_ = nullptr; + mode_ = CUDNN_ACTIVATION_RELU; + data_descriptor_ = nullptr; + is_null_input_ = false; + input_size_list_.clear(); + output_size_list_.clear(); + workspace_size_list_.clear(); + cudnn_data_type_ = CUDNN_DATA_FLOAT; + input_size_ = 0; + output_size_ = 0; + workspace_size_ = 0; + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -132,12 +144,6 @@ class ActivationGpuFwdKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), - "cudnnDestroyActivationDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed"); - } - std::map kernel_map = {{"ReLU", CUDNN_ACTIVATION_RELU}, {"ReLU6", CUDNN_ACTIVATION_CLIPPED_RELU}, {"Tanh", CUDNN_ACTIVATION_TANH}, diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h index e6ee2d56a8..b3d0c55af5 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h @@ -29,14 +29,7 @@ namespace kernel { template class ActivationGradGpuKernel : public GpuKernel { public: - ActivationGradGpuKernel() - : cudnn_handle_(nullptr), - activation_desc_(nullptr), - mode_(CUDNN_ACTIVATION_RELU), - 
data_descriptor_(nullptr), - is_null_input_(false), - cudnn_data_type_(CUDNN_DATA_FLOAT), - input_size_(0) {} + ActivationGradGpuKernel() { ResetResource(); } ~ActivationGradGpuKernel() override { DestroyResource(); } const std::vector &GetInputSizeList() const override { return input_size_list_; } const std::vector &GetOutputSizeList() const override { return output_size_list_; } @@ -117,6 +110,25 @@ class ActivationGradGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), + "cudnnDestroyActivationDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed"); + } + + void ResetResource() noexcept override { + cudnn_handle_ = nullptr; + activation_desc_ = nullptr; + mode_ = CUDNN_ACTIVATION_RELU; + data_descriptor_ = nullptr; + is_null_input_ = false; + input_size_list_.clear(); + output_size_list_.clear(); + workspace_size_list_.clear(); + cudnn_data_type_ = CUDNN_DATA_FLOAT; + input_size_ = 0; + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -135,12 +147,6 @@ class ActivationGradGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), - "cudnnDestroyActivationDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed"); - } - std::map kernel_map = {{"ReluGrad", CUDNN_ACTIVATION_RELU}, {"ReLU6Grad", CUDNN_ACTIVATION_CLIPPED_RELU}, {"TanhGrad", CUDNN_ACTIVATION_TANH}, diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/batchnorm_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/batchnorm_grad_gpu_kernel.h index c548e4bdac..5185109a90 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/batchnorm_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/batchnorm_grad_gpu_kernel.h @@ -121,6 +121,13 @@ class BatchNormGradGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_desc_), "Destroy para desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); + } + protected: void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -152,13 +159,6 @@ class BatchNormGradGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_desc_), "Destroy para desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); - } - int batch_; int channel_; int height_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/bias_add_grad_gpu_kenel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/bias_add_grad_gpu_kenel.h index 7787c4c037..aa1be7b7b1 100644 --- 
a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/bias_add_grad_gpu_kenel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/bias_add_grad_gpu_kenel.h @@ -111,6 +111,13 @@ class BiasAddGradGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_EXCEPT(cudnnDestroyReduceTensorDescriptor(op_desc_), + "cudnnDestroyReduceTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(db_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyOpTensorDescriptor failed"); + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -137,13 +144,6 @@ class BiasAddGradGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnDestroyReduceTensorDescriptor(op_desc_), - "cudnnDestroyReduceTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(db_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyOpTensorDescriptor failed"); - } - bool same_dims_; cudnnHandle_t cudnn_handle_; cudnnDataType_t cudnn_data_type_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_gpu_kernel.h index 0afdf7c844..c5b88da02d 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_gpu_kernel.h @@ -198,6 +198,15 @@ class Conv2dGpuFwdKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), + "cudnnDestroyConvolutionDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(filter_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed"); + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -243,14 +252,6 @@ class Conv2dGpuFwdKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), - "cudnnDestroyConvolutionDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(filter_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed"); - } bool CheckParam(const CNodePtr &kernel_node) { size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 2) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_filter_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_filter_gpu_kernel.h index 22fb7ced2c..49f1124411 100644 --- 
a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_filter_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_filter_gpu_kernel.h @@ -199,6 +199,15 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), + "cudnnDestroyConvolutionDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(dw_desc_), "cudnnDestroyFilterDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyTensorDescriptor failed"); + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -243,14 +252,6 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), - "cudnnDestroyConvolutionDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(dw_desc_), "cudnnDestroyFilterDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyTensorDescriptor failed"); - } bool CheckParam(const CNodePtr &kernel_node) { size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 2) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_input_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_input_gpu_kernel.h index 965d67b846..9608204c4e 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_input_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_input_gpu_kernel.h @@ -203,6 +203,15 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), + "cudnnDestroyConvolutionDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "cudnnDestroyFilterDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "cudnnDestroyTensorDescriptor failed"); + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -244,14 +253,6 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), - "cudnnDestroyConvolutionDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "cudnnDestroyFilterDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed"); - 
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "cudnnDestroyTensorDescriptor failed"); - } bool CheckParam(const CNodePtr &kernel_node) { size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 2) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/flatten_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/flatten_gpu_kernel.h index baf6e35f2e..3e199cec18 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/flatten_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/flatten_gpu_kernel.h @@ -27,7 +27,7 @@ namespace kernel { template class FlattenGpuFwdKernel : public GpuKernel { public: - FlattenGpuFwdKernel() : input_size_(0), output_size_(0), workspace_size_(0) {} + FlattenGpuFwdKernel() : input_size_(0) {} ~FlattenGpuFwdKernel() override = default; const std::vector &GetInputSizeList() const override { return input_size_list_; } @@ -47,7 +47,7 @@ class FlattenGpuFwdKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { - auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + auto shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); input_size_ = sizeof(T); for (size_t i = 0; i < shape.size(); ++i) { input_size_ *= shape[i]; @@ -55,12 +55,17 @@ class FlattenGpuFwdKernel : public GpuKernel { InitSizeLists(); return true; } + void ResetResource() noexcept override { + input_size_ = 0; + input_size_list_.clear(); + output_size_list_.clear(); + workspace_size_list_.clear(); + } protected: void InitSizeLists() override { input_size_list_.push_back(input_size_); - output_size_ = input_size_; - output_size_list_.push_back(output_size_); + output_size_list_.push_back(input_size_); } private: @@ -69,8 +74,6 @@ class FlattenGpuFwdKernel : public GpuKernel { std::vector workspace_size_list_; size_t input_size_; - size_t output_size_; - size_t workspace_size_; }; } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/flatten_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/flatten_grad_gpu_kernel.h index 700fe5884a..92ed5e07e0 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/flatten_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/flatten_grad_gpu_kernel.h @@ -27,7 +27,7 @@ namespace kernel { template class FlattenGardGpuBkwKernel : public GpuKernel { public: - FlattenGardGpuBkwKernel() : input_size_(0), output_size_(0), workspace_size_(0) {} + FlattenGardGpuBkwKernel() { ResetResource(); } ~FlattenGardGpuBkwKernel() override = default; const std::vector &GetInputSizeList() const override { return input_size_list_; } @@ -54,7 +54,7 @@ class FlattenGardGpuBkwKernel : public GpuKernel { return false; } - auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + auto shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); for (size_t i = 0; i < shape.size(); ++i) { if (input_size_ == 0) { input_size_ = 1; @@ -67,11 +67,17 @@ class FlattenGardGpuBkwKernel : public GpuKernel { return true; } + void ResetResource() noexcept override { + input_size_ = 0; + input_size_list_.clear(); + output_size_list_.clear(); + workspace_size_list_.clear(); + } + protected: void InitSizeLists() override { input_size_list_.push_back(input_size_); - output_size_ = input_size_; - output_size_list_.push_back(output_size_); + 
output_size_list_.push_back(input_size_); } private: @@ -80,8 +86,6 @@ class FlattenGardGpuBkwKernel : public GpuKernel { std::vector workspace_size_list_; size_t input_size_; - size_t output_size_; - size_t workspace_size_; }; } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_ex_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_ex_gpu_kernel.h index 8262e3423b..e8564d354c 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_ex_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_ex_gpu_kernel.h @@ -140,6 +140,20 @@ class FusedBatchNormExGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed"); + if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(z_desc_), "Destroy z desc failed"); + } + + if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), + "Destroy activation descriptor failed"); + } + } + protected: void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -238,20 +252,6 @@ class FusedBatchNormExGpuKernel : public GpuKernel { } } - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed"); - if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(z_desc_), "Destroy z desc failed"); - } - - if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), - "Destroy activation descriptor failed"); - } - } - size_t input_x_size_; size_t input_z_size_; size_t para_size_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.h index 2d107136f7..695c453e64 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.h @@ -133,6 +133,12 @@ class FusedBatchNormGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed"); + } + protected: void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -165,12 +171,6 @@ class FusedBatchNormGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); - 
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed"); - } - int batch_; int channel_; int height_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_grad_ex_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_grad_ex_gpu_kernel.h index 49d05b8729..7704dccec0 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_grad_ex_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_grad_ex_gpu_kernel.h @@ -201,6 +201,21 @@ class FusedBatchNormGradExGpuKernel : public GpuKernel { workspace_size_list_.push_back(workspace_size_); } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); + if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), + "Destroy activation descriptor failed"); + } + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed"); + + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); + if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dz_desc_), "Destroy z desc failed"); + } + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_diff_desc_), "Destroy para desc failed"); + } private: void SetTensorDescriptor(const std::string &format, const std::vector &shape) { @@ -255,22 +270,6 @@ class FusedBatchNormGradExGpuKernel : public GpuKernel { } } - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); - if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), - "Destroy activation descriptor failed"); - } - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed"); - - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); - if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dz_desc_), "Destroy z desc failed"); - } - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_diff_desc_), "Destroy para desc failed"); - } - size_t x_size_; size_t para_size_; size_t workspace_size_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.h index b22cc2f03f..c459df03cf 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.h @@ -117,6 +117,13 @@ class FusedBatchNormGradGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_desc_), "Destroy para desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy 
desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); + } + protected: void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -146,13 +153,6 @@ class FusedBatchNormGradGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_desc_), "Destroy para desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); - } - int batch_; int channel_; int height_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/im2col_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/im2col_gpu_kernel.h index 4fd4c90154..fa27e77d8e 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/im2col_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/im2col_gpu_kernel.h @@ -123,6 +123,15 @@ class Im2ColGpuFwdKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), + "cudnnDestroyConvolutionDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(filter_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_desc_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed"); + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -152,14 +161,6 @@ class Im2ColGpuFwdKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), - "cudnnDestroyConvolutionDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(filter_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_desc_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed"); - } bool CheckParam(const CNodePtr &kernel_node) { cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_gpu_kernel.h index 315748b7d9..60cb4e43dd 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_gpu_kernel.h @@ -157,6 +157,21 @@ class LstmGpuKernel : public GpuKernel { } } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cy_desc_), 
"destroy cy_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hy_desc_), "destroy hy_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc failed"); + + for (size_t i = 0; i < IntToSize(seq_len_); ++i) { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_[i]), "destroy y_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed"); + } + } + protected: void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -195,21 +210,6 @@ class LstmGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cy_desc_), "destroy cy_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hy_desc_), "destroy hy_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc failed"); - - for (size_t i = 0; i < IntToSize(seq_len_); ++i) { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_[i]), "destroy y_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed"); - } - } - int batch_size_; int seq_len_; int input_size_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_data_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_data_gpu_kernel.h index 10813422bd..dee8d56cd7 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_data_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_data_gpu_kernel.h @@ -150,6 +150,18 @@ class LstmGradDataGpuKernel : public GpuKernel { InitSizeLists(); return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dcx_desc_), "destroy dcx_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dhx_desc_), "destroy dhx_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dcy_desc_), "destroy dcy_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dhy_desc_), "destroy dhy_desc_ failed"); + DestroyTensorDescGrp(); + } protected: void InitResource() override { @@ -195,18 +207,6 @@ class LstmGradDataGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc 
failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dcx_desc_), "destroy dcx_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dhx_desc_), "destroy dhx_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dcy_desc_), "destroy dcy_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dhy_desc_), "destroy dhy_desc_ failed"); - DestroyTensorDescGrp(); - } void CreateTensorDescGrp() { int x_dims[3]{batch_size_, input_size_, 1}; int y_dims[3]{batch_size_, hidden_size_ * (bidirectional_ ? 2 : 1), 1}; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_weight_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_weight_gpu_kernel.h index 66d00f2cf1..81f620c1ad 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_weight_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_weight_gpu_kernel.h @@ -162,6 +162,13 @@ class LstmGradWeightGpuKernel : public GpuKernel { "get workspace size failed"); workspace_size_list_.push_back(workspace_size); } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(dw_desc_), "destroy dw_desc_ failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed"); + DestroyTensorDescGrp(); + } private: void CreateTensorDescGrp() { @@ -187,13 +194,6 @@ class LstmGradWeightGpuKernel : public GpuKernel { CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed"); } } - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(dw_desc_), "destroy dw_desc_ failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed"); - DestroyTensorDescGrp(); - } int batch_size_; int seq_len_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_gpu_kernel.h index d2fd897a19..6b3791a5ba 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_gpu_kernel.h @@ -113,6 +113,13 @@ class PoolingGpuFwdKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_), + "cudnnDestroyPoolingDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_descriptor_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "cudnnDestroyTensorDescriptor failed"); + } + protected: void InitResource() { cudnn_handle_ = 
device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -196,12 +203,6 @@ class PoolingGpuFwdKernel : public GpuKernel { 2, windowDimA, paddingA, strideA), "cudnnSetPoolingNdDescriptor failed"); } - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_), - "cudnnDestroyPoolingDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_descriptor_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "cudnnDestroyTensorDescriptor failed"); - } cudnnHandle_t cudnn_handle_; cudnnTensorDescriptor_t input_descriptor_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_grad_gpu_kernel.h index 7772292ff6..a3a8456800 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_grad_gpu_kernel.h @@ -129,6 +129,14 @@ class PoolingGradGpuKernel : public GpuKernel { InitSizeLists(); return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_), + "cudnnDestroyPoolingDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_descriptor_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_descriptor_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_descriptor_), "cudnnDestroyTensorDescriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_descriptor_), "cudnnDestroyTensorDescriptor failed"); + } protected: void InitResource() override { @@ -230,14 +238,6 @@ class PoolingGradGpuKernel : public GpuKernel { pad_value_ = kSignedMinFloat; } } - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_), - "cudnnDestroyPoolingDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_descriptor_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_descriptor_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_descriptor_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_descriptor_), "cudnnDestroyTensorDescriptor failed"); - } cudnnHandle_t cudnn_handle_; cudnnPoolingDescriptor_t pooling_descriptor_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_cross_entropy_with_logits_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_cross_entropy_with_logits_gpu_kernel.h index 8a93c4c455..f9246022f4 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_cross_entropy_with_logits_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_cross_entropy_with_logits_gpu_kernel.h @@ -101,6 +101,13 @@ class SoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(softmax_output_descriptor_), + "cudnnDestroyTensorDescriptor failed."); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(logits_descriptor_), + "cudnnDestroyTensorDescriptor failed."); + } + protected: void InitResource() override { cudnn_handle_ = 
device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -118,12 +125,6 @@ class SoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(softmax_output_descriptor_), - "cudnnDestroyTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(logits_descriptor_), - "cudnnDestroyTensorDescriptor failed."); - } void InferInputOutputSize(const CNodePtr &kernel_node) { auto logits_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); is_null_input_ = CHECK_NULL_INPUT(logits_shape); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_gpu_kernel.h index 5ee0769304..597cc49352 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_gpu_kernel.h @@ -140,6 +140,11 @@ class SoftmaxGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_descriptor_), "destroy output_descriptor failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "destroy input_descriptor failed"); + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -159,11 +164,6 @@ class SoftmaxGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_descriptor_), "destroy output_descriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "destroy input_descriptor failed"); - } - void InitSizeByAxis(const std::vector &input_shape, const int &axis) { if (input_shape.size() == 2) { InitSizeByAxis2D(input_shape, axis); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_grad_gpu_kernel.h index c9637c213c..1ea5f5fa79 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_grad_gpu_kernel.h @@ -142,6 +142,10 @@ class SoftmaxGradGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "destroy output_descriptor failed"); + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -161,10 +165,6 @@ class SoftmaxGradGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "destroy output_descriptor failed"); - } - void InitSizeByAxis(const std::vector input_shape, const int axis) { axis_ = axis; if (axis_ < 0) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h index 88ec631017..74b3cbe24b 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h @@ -103,6 +103,13 @@ class SparseSoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { return true; } 
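
The hunks above and below all apply the same refactor: each cuDNN-backed kernel's DestroyResource() (and, where present, ResetResource()) moves from a private helper to a public noexcept override, so code outside the kernel can release descriptors and re-run Init() on the same object when input shapes change; the next hunk continues the pattern for SparseSoftmaxCrossEntropyWithLogits. The sketch below is only a minimal, self-contained illustration of that lifecycle; the class and member names are invented for the example and are not the MindSpore GpuKernel interface.

    #include <iostream>

    // Schematic base class: the real interface is assumed to expose these
    // hooks as virtual methods so a caller can drive resource teardown/reset.
    class KernelBase {
     public:
      virtual ~KernelBase() = default;
      virtual void ResetResource() noexcept = 0;    // back to freshly-constructed state
      virtual void DestroyResource() noexcept = 0;  // release device-side handles
    };

    class DemoKernel : public KernelBase {
     public:
      DemoKernel() { ResetResource(); }             // constructor delegates to ResetResource()
      ~DemoKernel() override { DestroyResource(); }
      void ResetResource() noexcept override { descriptor_ = nullptr; size_ = 0; }
      void DestroyResource() noexcept override {
        // placeholder for cudnnDestroy*Descriptor(...) calls
        descriptor_ = nullptr;
      }
      void Init(int size) { size_ = size; /* create descriptors, size buffers */ }

     private:
      void *descriptor_ = nullptr;
      int size_ = 0;
    };

    int main() {
      DemoKernel k;
      k.Init(32);
      // On a shape change the runtime can re-initialize the same object:
      k.DestroyResource();
      k.ResetResource();
      k.Init(64);
      std::cout << "re-initialized\n";
    }
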
+ void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(softmax_output_descriptor_), + "cudnnDestroyTensorDescriptor failed."); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(logits_descriptor_), + "cudnnDestroyTensorDescriptor failed."); + } + protected: void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); @@ -120,12 +127,6 @@ class SparseSoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(softmax_output_descriptor_), - "cudnnDestroyTensorDescriptor failed."); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(logits_descriptor_), - "cudnnDestroyTensorDescriptor failed."); - } void InferInputOutputSize(const CNodePtr &kernel_node) { auto logits_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); is_null_input_ = CHECK_NULL_INPUT(logits_shape); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold2_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold2_gpu_kernel.h index 2899572c41..f103db4c37 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold2_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold2_gpu_kernel.h @@ -113,8 +113,6 @@ class BatchNormFold2GpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept {} - cudnnHandle_t cudnn_handle_; bool is_null_input_; size_t batch_size_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_gpu_kernel.h index d4a1221b39..ab79aebd02 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_gpu_kernel.h @@ -152,6 +152,11 @@ class BatchNormFoldGpuKernel : public GpuKernel { return true; } + void DestroyResource() noexcept override { + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); + CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed"); + } + protected: void InitSizeLists() override { // x, mean, variance, current_step @@ -177,11 +182,6 @@ class BatchNormFoldGpuKernel : public GpuKernel { } private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed"); - } - size_t input_size_; size_t output_size_; std::vector input_size_list_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_gpu_kernel.h index 20b413da71..85dca5855f 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_gpu_kernel.h @@ -81,8 +81,6 @@ class CorrectionMulGpuKernel : public GpuKernel { void InitResource() override {} private: - void DestroyResource() noexcept {} - size_t batch_size_; size_t channel_; size_t height_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_grad_gpu_kernel.h index 
533266e185..fd1dd95e1e 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_grad_gpu_kernel.h @@ -89,8 +89,6 @@ class CorrectionMulGradGpuKernel : public GpuKernel { void InitResource() override {} private: - void DestroyResource() noexcept {} - size_t batch_size_; size_t channel_; size_t height_; diff --git a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc index 70bb78ef73..1edc9acf0d 100644 --- a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc +++ b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc @@ -237,6 +237,10 @@ void MemSwapManager::SaveUserKernelTopoOrder() { continue; } + if (opt::IsNopNode(user_kernel)) { + continue; + } + size_t user_kernel_topo_sort = SearchKernelExecutionInfo(user_kernel).topo_order_; auto kernel_with_index = AnfAlgo::GetPrevNodeOutput(user_kernel, node_pair.second - 1); auto &output_idx = kernel_with_index.second; diff --git a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc index 027e088200..f4f19b6ff3 100644 --- a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc +++ b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc @@ -50,6 +50,10 @@ bool IsShapeDynamic(const abstract::ShapePtr &shape) { return std::any_of(shape->shape().begin(), shape->shape().end(), [](int s) { return s < 0; }); } +bool IsShapeDynamic(const std::vector &shape) { + return std::any_of(shape.begin(), shape.end(), [](int s) { return s < 0; }); +} + std::vector TransShapeToSizet(const abstract::ShapePtr &shape) { MS_EXCEPTION_IF_NULL(shape); std::vector shape_size_t; @@ -1389,5 +1393,29 @@ bool AnfRuntimeAlgorithm::IsNodeDynamicShape(const AnfNodePtr &node) { } return false; } + +std::vector AnfRuntimeAlgorithm::GetInputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index) { + auto device_shape = GetInputDeviceShape(anf_node, index); + // Initialize GPUKernel with max shape to fit 'InitDynamicOutputKernelRef()' for memory reuse. + if (IsShapeDynamic(device_shape)) { + auto max_shape = GetInputMaxShape(anf_node, index); + std::transform(max_shape.begin(), max_shape.end(), device_shape.begin(), IntToSize); + auto format = GetInputFormat(anf_node, index); + trans::TransShapeToDevice(device_shape, format); + } + return device_shape; +} + +std::vector AnfRuntimeAlgorithm::GetOutputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index) { + auto device_shape = GetOutputDeviceShape(anf_node, index); + // Initialize GPUKernel with max shape to fit 'InitDynamicOutputKernelRef()' for memory reuse. 
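
GetInputRealDeviceShapeIfExist and GetOutputRealDeviceShapeIfExist (the body of the latter continues directly below) fall back to the node's registered max shape whenever the device shape still contains dynamic dimensions, so GPU kernels and their memory refs are sized for the worst case up front. A compilable sketch of that fallback, using stand-in names rather than the AnfRuntimeAlgorithm API, looks like this:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Stand-in for the "real device shape if exist" idea: if any dimension is
    // still unknown (encoded as a negative value), substitute the per-dimension
    // maximum so buffers can be allocated once and reused across shape changes.
    std::vector<int64_t> RealShapeIfExist(const std::vector<int64_t> &inferred,
                                          const std::vector<int64_t> &max_shape) {
      bool is_dynamic = std::any_of(inferred.begin(), inferred.end(),
                                    [](int64_t d) { return d < 0; });
      return is_dynamic ? max_shape : inferred;
    }

    int main() {
      std::vector<int64_t> inferred{-1, 128};   // batch dimension unknown at compile time
      std::vector<int64_t> max_shape{64, 128};  // upper bound provided by the frontend
      for (int64_t d : RealShapeIfExist(inferred, max_shape)) std::cout << d << ' ';
      std::cout << '\n';  // prints: 64 128
    }
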
+ if (IsShapeDynamic(device_shape)) { + auto max_shape = GetOutputMaxShape(anf_node, index); + std::transform(max_shape.begin(), max_shape.end(), device_shape.begin(), IntToSize); + auto format = GetOutputFormat(anf_node, index); + trans::TransShapeToDevice(device_shape, format); + } + return device_shape; +} } // namespace session } // namespace mindspore diff --git a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h index 81d906f1d1..3f9ef917e3 100644 --- a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h +++ b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h @@ -230,6 +230,8 @@ class AnfRuntimeAlgorithm { static std::vector GetOutputMaxShape(const AnfNodePtr &anf_node, size_t index); static std::vector GetOutputMinShape(const AnfNodePtr &anf_node, size_t index); static bool IsNodeDynamicShape(const AnfNodePtr &node); + static std::vector GetInputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index); + static std::vector GetOutputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index); }; } // namespace session using AnfAlgo = session::AnfRuntimeAlgorithm; diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc index 295707a2b3..c5267a8d1d 100644 --- a/mindspore/ccsrc/backend/session/gpu_session.cc +++ b/mindspore/ccsrc/backend/session/gpu_session.cc @@ -306,7 +306,9 @@ GraphId GPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr if (save_graphs) { DumpIRProto(graph, "before_removeNop_" + std::to_string(graph_id)); } - + // Update Graph Dynamic Shape Attr. + UpdateGraphDynamicShapeAttr(NOT_NULL(graph)); + graph->UpdateGraphDynamicAttr(); // Hide NopOp from execution graph opt::HideNopNode(graph.get()); // Build kernel if node is cnode @@ -317,13 +319,10 @@ GraphId GPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr graph->set_execution_order(execution_order); // Get summary nodes. SetSummaryNodes(graph.get()); - // Remove NopOp from execution graph - opt::RemoveNopNode(graph.get()); // Dump .pb graph after graph optimization if (save_graphs) { DumpIRProto(graph, "after_opt_" + std::to_string(graph_id)); } - // Set graph manager. 
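
With the session changes just above, the GPU compile path now tags the graph for dynamic shape before kernel build and keeps NopNodes in the execution graph (they are hidden rather than removed, which is why MemSwapManager earlier skips them when walking user kernels). The MindSpore helpers themselves are not reproduced here; the sketch below only illustrates the kind of marking pass this implies, with made-up node and graph types.

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Hypothetical node/graph types, used only to illustrate a "mark dynamic" pass.
    struct Node {
      std::vector<int64_t> shape;
      bool is_dynamic = false;
    };
    struct Graph {
      std::vector<Node> nodes;
      bool has_dynamic_node = false;
    };

    // Assumed rule: a node is dynamic if any dimension is unknown (< 0),
    // and the graph is dynamic if any of its nodes is.
    void UpdateDynamicAttr(Graph *g) {
      for (auto &n : g->nodes) {
        for (int64_t d : n.shape) {
          if (d < 0) { n.is_dynamic = true; break; }
        }
        g->has_dynamic_node |= n.is_dynamic;
      }
    }

    int main() {
      Graph g;
      g.nodes.push_back({{2, -1, 8}});
      g.nodes.push_back({{4, 4}});
      UpdateDynamicAttr(&g);
      std::cout << std::boolalpha << g.has_dynamic_node << '\n';  // true
    }
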
MS_EXCEPTION_IF_NULL(context_); FuncGraphManagerPtr manager = MakeManager({graph}); diff --git a/mindspore/ccsrc/runtime/device/CMakeLists.txt b/mindspore/ccsrc/runtime/device/CMakeLists.txt index b74dce0523..37ce7a93c3 100644 --- a/mindspore/ccsrc/runtime/device/CMakeLists.txt +++ b/mindspore/ccsrc/runtime/device/CMakeLists.txt @@ -1,5 +1,5 @@ file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*.cc" - "kernel_info.cc" "executor/dynamic_kernel.cc" "kernel_runtime.cc" "memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc" + "kernel_info.cc" "executor/dynamic_kernel.cc" "executor/executor_callback.cc" "kernel_runtime.cc" "memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc" ) if (ENABLE_GPU) diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc index dc3b5adaf4..32cfcc8469 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc @@ -48,7 +48,7 @@ #include "backend/optimizer/mem_reuse/mem_reuse_checker.h" #endif #include "runtime/device/ascend/executor/tiling/op_tiling_calculater.h" -#include "runtime/device/ascend/executor/executor_callback.h" +#include "runtime/device/executor/executor_callback.h" #include "runtime/device/ascend/executor/hccl_dynamic_kernel.h" #include "profiler/device/ascend/ascend_profiling.h" #include "profiler/device/ascend/profiling_context.h" diff --git a/mindspore/ccsrc/runtime/device/ascend/executor/ai_cpu_dynamic_kernel.cc b/mindspore/ccsrc/runtime/device/ascend/executor/ai_cpu_dynamic_kernel.cc index be249bda7c..2f58516ec7 100644 --- a/mindspore/ccsrc/runtime/device/ascend/executor/ai_cpu_dynamic_kernel.cc +++ b/mindspore/ccsrc/runtime/device/ascend/executor/ai_cpu_dynamic_kernel.cc @@ -22,7 +22,7 @@ #include "runtime/kernel.h" #include "backend/session/anf_runtime_algorithm.h" #include "backend/kernel_compiler/aicpu/aicpu_util.h" -#include "runtime/device/ascend/executor/executor_callback.h" +#include "runtime/device/executor/executor_callback.h" namespace mindspore { namespace device { diff --git a/mindspore/ccsrc/runtime/device/ascend/executor/executor_callback.cc b/mindspore/ccsrc/runtime/device/executor/executor_callback.cc similarity index 92% rename from mindspore/ccsrc/runtime/device/ascend/executor/executor_callback.cc rename to mindspore/ccsrc/runtime/device/executor/executor_callback.cc index 33d4bb08e0..fb7dbc5168 100644 --- a/mindspore/ccsrc/runtime/device/ascend/executor/executor_callback.cc +++ b/mindspore/ccsrc/runtime/device/executor/executor_callback.cc @@ -14,12 +14,11 @@ * limitations under the License. 
*/ -#include "runtime/device/ascend/executor/executor_callback.h" +#include "runtime/device/executor/executor_callback.h" #include "utils/log_adapter.h" namespace mindspore { namespace device { -namespace ascend { void ExecutorCallback::RegistCallback(const std::function &callback) { std::lock_guard guard(lock_); callback_queue_.push(callback); @@ -36,6 +35,5 @@ void ExecutorCallback::Consume() { callback_func(); } } -} // namespace ascend } // namespace device } // namespace mindspore diff --git a/mindspore/ccsrc/runtime/device/ascend/executor/executor_callback.h b/mindspore/ccsrc/runtime/device/executor/executor_callback.h similarity index 79% rename from mindspore/ccsrc/runtime/device/ascend/executor/executor_callback.h rename to mindspore/ccsrc/runtime/device/executor/executor_callback.h index 2994f9b70e..01c6793b47 100644 --- a/mindspore/ccsrc/runtime/device/ascend/executor/executor_callback.h +++ b/mindspore/ccsrc/runtime/device/executor/executor_callback.h @@ -14,8 +14,8 @@ * limitations under the License. */ -#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_EXECUTOR_EXECUTOR_CALLBACK_H_ -#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_EXECUTOR_EXECUTOR_CALLBACK_H_ +#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_EXECUTOR_EXECUTOR_CALLBACK_H_ +#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_EXECUTOR_EXECUTOR_CALLBACK_H_ #include #include @@ -24,7 +24,6 @@ namespace mindspore { namespace device { -namespace ascend { class ExecutorCallback { public: static ExecutorCallback &GetInstance() { @@ -43,7 +42,6 @@ class ExecutorCallback { std::queue> callback_queue_; std::mutex lock_; }; -} // namespace ascend } // namespace device } // namespace mindspore -#endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_EXECUTOR_EXECUTOR_CALLBACK_H_ +#endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_EXECUTOR_EXECUTOR_CALLBACK_H_ diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_build.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_build.cc index 00c56b5279..3c6f69bab8 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_build.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_build.cc @@ -67,6 +67,8 @@ void GpuBuild(const KernelGraphPtr &kernel_graph) { if (!gpu_kernel_ptr->Init(kernel)) { MS_LOG(EXCEPTION) << "Initialize gpu kernel op[" << kernel->fullname_with_scope() << "] failed."; } + gpu_kernel_ptr->InitDynamicKernel(kernel); + gpu_kernel_ptr->DynamicKernel()->Initialize(); session::AnfRuntimeAlgorithm::SetKernelMod((kernel::KernelModPtr)gpu_kernel_ptr, kernel.get()); } } diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc index fb7fd4f602..bebe38c2ac 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc @@ -36,6 +36,8 @@ #include "profiler/device/gpu/gpu_profiling.h" #include "utils/shape_utils.h" #include "debug/data_dump/dump_json_parser.h" +#include "backend/kernel_compiler/gpu/gpu_kernel.h" +#include "runtime/device/executor/executor_callback.h" #ifdef ENABLE_DEBUGGER #include "debug/debug_services.h" #endif @@ -588,6 +590,29 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo MS_LOG(INFO) << "[inplace optimizer] skip node: " << kernel->DebugString(); continue; } + + // akg kernel do not support dynamic shape by now. 
+ device::DynamicKernelPtr dynamic_kernel = nullptr; + kernel::GpuKernel *gpu_kernel = nullptr; + if (session::AnfRuntimeAlgorithm::GetKernelType(kernel) != KernelType::AKG_KERNEL) { + gpu_kernel = dynamic_cast(kernel_mod); + dynamic_kernel = gpu_kernel->DynamicKernel(); + } + + if (dynamic_kernel && dynamic_kernel->have_depends()) { + MS_LOG(INFO) << "Match Dynamic Kernel, Start SyncStream"; + if (!SyncStream()) { + MS_LOG(ERROR) << "SyncStream failed"; + return false; + } + } + + if (dynamic_kernel && dynamic_kernel->is_dynamic_shape()) { + ExecutorCallback::GetInstance().Consume(); + dynamic_kernel->InferShape(); + dynamic_kernel->UpdateArgs(); + } + AddressPtrList kernel_inputs; AddressPtrList kernel_workspaces; AddressPtrList kernel_outputs; @@ -615,6 +640,10 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo } else { LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs); } + + ExecutorCallback::GetInstance().RegistCallback([&gpu_kernel] { + if (gpu_kernel) gpu_kernel->PostExecute(); + }); // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost) LoadKernelData(debugger_.get(), kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_, dump_enabled); @@ -633,6 +662,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo // collect weights and bias for dump mode debugger_->LoadParametersAndConst(); CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed."); + ExecutorCallback::GetInstance().Consume(); } ClearSwapInfo(mock); return true; diff --git a/tests/ut/cpp/stub/dynamic_shape/dynamic_shape_stub.cc b/tests/ut/cpp/stub/dynamic_shape/dynamic_shape_stub.cc index 4c54ef16d4..5560abd0a6 100644 --- a/tests/ut/cpp/stub/dynamic_shape/dynamic_shape_stub.cc +++ b/tests/ut/cpp/stub/dynamic_shape/dynamic_shape_stub.cc @@ -19,12 +19,19 @@ #include "runtime/device/ascend/executor/rts/profiling_rts_dynamic_kernel.h" #include "runtime/device/ascend/executor/ai_core_dynamic_kernel.h" #include "profiler/device/ascend/rt_callback_manager.h" -#include "runtime/device/ascend/executor/executor_callback.h" +#include "runtime/device/executor/executor_callback.h" #include "profiler/device/ascend/ascend_profiling.h" #include "runtime/device/ascend/executor/tiling/op_tiling_calculater.h" #include "backend/kernel_compiler/host/host_kernel_metadata.h" #include "backend/kernel_compiler/host/host_kernel_build.h" +namespace mindspore { +namespace device { +void ExecutorCallback::RegistCallback(const std::function &callback) {} +void ExecutorCallback::Consume() {} +} // namespace device +} // namespace mindspore + namespace mindspore { namespace device { namespace ascend { @@ -45,13 +52,11 @@ void AiCoreDynamicKernel::PostExecute() {} bool HcclExecutorManager::Initialize() { return true; } bool HcclExecutorManager::Finalize() { return true; } -void ExecutorCallback::RegistCallback(const std::function &callback) {} -void ExecutorCallback::Consume() {} - void OpTilingCalculater::Init() {} -void OpTilingCalculater::CalculateTiling(const NotNull &cnode, const NotNull> &compile_info_json, - const std::map &depend_tensor_map, - NotNull op_run_info) {} +void OpTilingCalculater::CalculateTiling(const NotNull &cnode, + const NotNull> &compile_info_json, + const std::map &depend_tensor_map, + NotNull op_run_info) {} } // namespace ascend } // namespace device } // namespace mindspore
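
Taken together, the GPU runtime hunk above makes LaunchKernelDynamic interleave deferred shape work with kernel launches: pending PostExecute callbacks are consumed before a dynamic kernel infers its shape and updates its arguments, the kernel is launched, and its own PostExecute is queued to run after the stream sync. The sketch below mirrors only that ordering with stub types; it is not the runtime code and the names are illustrative.

    #include <functional>
    #include <iostream>
    #include <queue>
    #include <vector>

    // Stand-ins for the pieces involved; names are invented, not MindSpore APIs.
    struct DynKernel {
      bool dynamic = true;
      void InferShape() { std::cout << "infer shape\n"; }
      void UpdateArgs() { std::cout << "update args\n"; }
      void Launch() { std::cout << "launch\n"; }
      void PostExecute() { std::cout << "post execute (e.g. read back output shape)\n"; }
    };

    int main() {
      std::queue<std::function<void()>> deferred;  // plays the role of ExecutorCallback
      std::vector<DynKernel> kernels(2);

      for (auto &k : kernels) {
        if (k.dynamic) {
          // Earlier PostExecute hooks must run first: a prior kernel's real
          // output shape may feed this kernel's shape inference.
          while (!deferred.empty()) { deferred.front()(); deferred.pop(); }
          k.InferShape();
          k.UpdateArgs();
        }
        k.Launch();
        deferred.push([&k] { k.PostExecute(); });  // deferred until after stream sync
      }
      while (!deferred.empty()) { deferred.front()(); deferred.pop(); }  // final drain
    }
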