diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc
index 9658c3af48..9e30b4bb02 100644
--- a/mindspore/ccsrc/backend/session/ascend_session.cc
+++ b/mindspore/ccsrc/backend/session/ascend_session.cc
@@ -1018,7 +1018,6 @@ void AscendSession::AdjustKernel(const std::shared_ptr<KernelGraph> &kernel_grap
 
 void AscendSession::RunOpAdjustKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
   MS_LOG(INFO) << "Start!";
-  opt::HideNopNode(kernel_graph.get());
   // Insert CLearZero op
   // prepare for next step from json get atomic info
   BuildKernel(kernel_graph);
@@ -1079,7 +1078,6 @@ void AscendSession::RunOpMemoryAlloc(const std::vector<tensor::TensorPtr> &input
                                      KernelGraph *kernel_graph) const {
   MS_LOG(INFO) << "Start memory alloc!";
   MS_EXCEPTION_IF_NULL(kernel_graph);
-  opt::RemoveNopNode(kernel_graph);
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
   runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph);
diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc
index 930433bc9f..bafd9fe266 100644
--- a/mindspore/ccsrc/backend/session/gpu_session.cc
+++ b/mindspore/ccsrc/backend/session/gpu_session.cc
@@ -418,8 +418,6 @@ void GPUSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &grap
   SelectKernel(kernel_graph);
   RunOpHardwareOptimize(kernel_graph);
   StartKernelRT();
-  // Hide NopOp from execution graph
-  opt::HideNopNode(kernel_graph.get());
   BuildKernel(kernel_graph);
   run_op_graphs_[graph_info] = kernel_graph;
 }
@@ -434,8 +432,6 @@ void GPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
   // run op
   auto kernel_graph = run_op_graphs_[graph_info];
   MS_EXCEPTION_IF_NULL(kernel_graph);
-  // Remove NopOp from execution graph
-  opt::RemoveNopNode(kernel_graph.get());
   RunOpAllocateMemory(*input_tensors, kernel_graph.get());
   // Execute the computation
   LoadInputData(kernel_graph, *input_tensors);
diff --git a/mindspore/ccsrc/backend/session/session_basic.cc b/mindspore/ccsrc/backend/session/session_basic.cc
index 33fe155184..18df95c427 100644
--- a/mindspore/ccsrc/backend/session/session_basic.cc
+++ b/mindspore/ccsrc/backend/session/session_basic.cc
@@ -1173,7 +1173,12 @@ void SessionBasic::UpdateOutputs(const std::shared_ptr<KernelGraph> &kernel_grap
     auto &tensor = item.first;
     auto &node = item.second.first;
     auto &output_index = item.second.second;
-    auto address = AnfAlgo::GetMutableOutputAddr(node, output_index);
+    DeviceAddressPtr address = nullptr;
+    if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {
+      address = AnfAlgo::GetMutableOutputAddr(node, output_index, false);
+    } else {
+      address = AnfAlgo::GetMutableOutputAddr(node, output_index);
+    }
     MS_EXCEPTION_IF_NULL(tensor);
     tensor->set_device_address(address);
     tensor->SetNeedWait(false);
diff --git a/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc b/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc
index 302072cfed..39f4c54c90 100644
--- a/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc
+++ b/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc
@@ -988,7 +988,7 @@ AnfNodePtr PynativeExecutor::GetInput(const py::object &obj, bool op_mask) {
     // out = op(cell1(x, y))
     // out = op(cell1(x, y)[0])
     node = GetObjNode(obj, obj_id);
-  } else if (py::isinstance<py::tuple>(obj)) {
+  } else if (py::isinstance<py::tuple>(obj) || py::isinstance<py::list>(obj)) {
     // out = op((x, y))
     // out = cell((x, y))
     auto tuple = obj.cast<py::tuple>();
@@ -1100,6 +1100,23 @@ void PynativeExecutor::CleanPreMemoryInValueNode(const std::string &cell_id) {
     top_cell_id_ = cell_id;
     return;
   }
+  if (dynamic_cell_) {
+    std::set<std::string> forward_op_tensor_id;
+    for (const auto &elem : cell_op_index_with_tensor_id_[top_cell_id_]) {
+      const auto &tensor_id_list = elem.second;
+      for (const auto &tensor_id : tensor_id_list) {
+        forward_op_tensor_id.emplace(tensor_id);
+      }
+    }
+    for (auto &tensor : all_value_node_tensors_) {
+      if (tensor->device_address() != nullptr &&
+          forward_op_tensor_id.find(tensor->id()) != forward_op_tensor_id.end()) {
+        tensor->device_address()->ClearDeviceMemory();
+        tensor->set_device_address(nullptr);
+      }
+    }
+    all_value_node_tensors_.clear();
+  }
   const auto &tensor_id_with_tensor = cell_tensor_id_with_tensor_[top_cell_id_];
   for (const auto &elem : tensor_id_with_tensor) {
     const auto &tensors_in_value_node = elem.second;
@@ -2111,6 +2128,37 @@ std::string PynativeExecutor::GetGradCellId(bool has_sens, const py::object &cel
   return cell_id;
 }
 
+void PynativeExecutor::SaveAllValueNodeTensors(const FuncGraphPtr &graph) {
+  std::unordered_set<tensor::TensorPtr> all_value_node_tensors;
+  auto trace_function = [&all_value_node_tensors](const AnfNodePtr &anf_node) {
+    auto value = GetValueNode(anf_node);
+    if (value) {
+      if (value->isa<tensor::Tensor>()) {
+        auto tensor = value->cast<tensor::TensorPtr>();
+        MS_EXCEPTION_IF_NULL(tensor);
+        if (tensor->device_address()) {
+          all_value_node_tensors.emplace(tensor);
+        }
+      } else if (value->isa<ValueTuple>()) {
+        auto tuple = value->cast<ValueTuplePtr>();
+        MS_EXCEPTION_IF_NULL(tuple);
+        for (size_t i = 0; i < tuple->size(); i++) {
+          if ((*tuple)[i]->isa<tensor::Tensor>()) {
+            auto tensor = (*tuple)[i]->cast<tensor::TensorPtr>();
+            MS_EXCEPTION_IF_NULL(tensor);
+            if (tensor->device_address()) {
+              all_value_node_tensors.emplace(tensor);
+            }
+          }
+        }
+      }
+    }
+    return FOLLOW;
+  };
+  (void)TopoSort(graph->get_return(), SuccDeeperSimple, trace_function);
+  all_value_node_tensors_ = all_value_node_tensors;
+}
+
 void PynativeExecutor::GradNetInner(const GradOperationPtr &grad, const py::object &cell, const py::object &weights,
                                     const py::args &args) {
   auto size = args.size();
@@ -2152,6 +2200,9 @@ void PynativeExecutor::GradNetInner(const GradOperationPtr &grad, const py::obje
   resource->results()[pipeline::kBackend] = compile::CreateBackend();
 
   MS_LOG(INFO) << "Start opt";
+  if (dynamic_cell_) {
+    SaveAllValueNodeTensors(resource->func_graph());
+  }
   PynativeOptimizeAction(resource);
   SaveTensorsInValueNode(resource);
   TaskEmitAction(resource);
diff --git a/mindspore/ccsrc/pipeline/pynative/pynative_execute.h b/mindspore/ccsrc/pipeline/pynative/pynative_execute.h
index 55bcfb7836..eb330fabd2 100644
--- a/mindspore/ccsrc/pipeline/pynative/pynative_execute.h
+++ b/mindspore/ccsrc/pipeline/pynative/pynative_execute.h
@@ -200,6 +200,7 @@ class PynativeExecutor : public std::enable_shared_from_this<PynativeExecutor> {
   // Update the abstract and device address info of value node and tensors in bprop graph
   void UpdateAbstractAndDeviceAddress(const OpExecInfoPtr &op_exec_info, const py::object &out_real);
   void SaveTensorsInValueNode(const ResourcePtr &resource);
+  void SaveAllValueNodeTensors(const FuncGraphPtr &graph);
   void CleanPreMemoryInValueNode(const std::string &cell_id);
 
   // Construct grad graph
@@ -306,6 +307,7 @@ class PynativeExecutor : public std::enable_shared_from_this<PynativeExecutor> {
   std::unordered_map<std::string, TensorIdWithTensor> cell_tensor_id_with_tensor_;
   std::unordered_map<std::string, abstract::AbstractBasePtr> node_abs_map_;
   std::unordered_map<std::string, AbstractListMap> prim_abs_list_;
+  std::unordered_set<tensor::TensorPtr> all_value_node_tensors_;
 };
 
 using PynativeExecutorPtr = std::shared_ptr<PynativeExecutor>;
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
index 1639051cf1..c6147e28b0 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
@@ -612,7 +612,7 @@ bool AscendDeviceAddress::ConvertFormatAndSyncHostToDevice(const ShapeVector &sh
   return sync_ok;
 }
 
-AscendDeviceAddress::~AscendDeviceAddress() {
+void AscendDeviceAddress::ClearDeviceMemory() {
   if (ptr_ == nullptr) {
     return;
   }
@@ -627,6 +627,8 @@ AscendDeviceAddress::~AscendDeviceAddress() {
   }
 }
 
+AscendDeviceAddress::~AscendDeviceAddress() { ClearDeviceMemory(); }
+
 bool AscendDeviceAddress::DumpMemToFile(bool trans_flag, const std::string &filepath, const std::string &host_fmt,
                                         const ShapeVector &host_shape, TypeId host_type) const {
   bool ret = false;
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.h b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.h
index 393525c2e3..819e1a325a 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.h
@@ -41,6 +41,7 @@ class AscendDeviceAddress : public DeviceAddress {
   ~AscendDeviceAddress() override;
   bool SyncDeviceToHost(const ShapeVector &shape, size_t size, TypeId type, void *host_ptr) const override;
   bool SyncHostToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *host_ptr) const override;
+  void ClearDeviceMemory() override;
   DeviceAddressType DeviceType() const override { return DeviceAddressType::kAscend; }
   bool DumpMemToFile(bool dump_mode, const std::string &filepath, const std::string &host_fmt,
                      const ShapeVector &host_shape, TypeId host_type) const override;
diff --git a/mindspore/ccsrc/runtime/device/cpu/cpu_device_address.h b/mindspore/ccsrc/runtime/device/cpu/cpu_device_address.h
index d73804c324..bf8230ec35 100644
--- a/mindspore/ccsrc/runtime/device/cpu/cpu_device_address.h
+++ b/mindspore/ccsrc/runtime/device/cpu/cpu_device_address.h
@@ -35,6 +35,7 @@ class CPUDeviceAddress : public DeviceAddress {
 
   bool SyncDeviceToHost(const ShapeVector &shape, size_t size, TypeId type, void *host_ptr) const override;
   bool SyncHostToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *host_ptr) const override;
+  void ClearDeviceMemory() override {}
   DeviceAddressType DeviceType() const override { return DeviceAddressType::kCPU; }
 };
 }  // namespace cpu
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
index af2570598b..4893ebdc38 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
@@ -69,7 +69,7 @@ bool GPUDeviceAddress::SyncHostToDevice(const ShapeVector &, size_t size, TypeId
   return GPUDeviceManager::GetInstance().SyncStream(stream);
 }
 
-GPUDeviceAddress::~GPUDeviceAddress() {
+void GPUDeviceAddress::ClearDeviceMemory() {
   if (ptr_ == nullptr) {
     return;
   }
@@ -78,6 +78,8 @@ GPUDeviceAddress::~GPUDeviceAddress() {
     ptr_ = nullptr;
   }
 }
+
+GPUDeviceAddress::~GPUDeviceAddress() { ClearDeviceMemory(); }
 #ifdef ENABLE_DEBUGGER
 bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
                                      const ShapeVector &host_shape, TypeId host_type, size_t slot,
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h
index a98f67786b..943ca8e596 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h
@@ -38,6 +38,7 @@ class GPUDeviceAddress : public DeviceAddress {
 
   bool SyncDeviceToHost(const ShapeVector &shape, size_t size, TypeId type, void *host_ptr) const override;
   bool SyncHostToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *host_ptr) const override;
+  void ClearDeviceMemory() override;
   void set_status(DeviceAddressStatus status) { status_ = status; }
   DeviceAddressStatus status() const { return status_; }
   DeviceAddressType DeviceType() const override { return DeviceAddressType::kGPU; }
diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.cc b/mindspore/ccsrc/runtime/device/kernel_runtime.cc
index fe9dc119aa..c32c0b5ae1 100644
--- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc
@@ -819,6 +819,9 @@ void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod
   if (AnfAlgo::GetCNodeName(cnode) == kAtomicAddrCleanOpName) {
     return GenAddrCleanLaunchArgs(cnode, kernel_inputs);
   }
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  auto visit_nop_node = (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode);
   for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
     auto op_name = AnfAlgo::GetCNodeName(cnode);
     constexpr auto none_placeholder_index = 3;
@@ -833,7 +836,7 @@ void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod
       }
     }
     auto real_input = AnfAlgo::GetRealInputIndex(kernel, i);
-    auto device_address = AnfAlgo::GetPrevNodeOutputAddr(kernel, real_input);
+    auto device_address = AnfAlgo::GetPrevNodeOutputAddr(kernel, real_input, visit_nop_node);
     MS_EXCEPTION_IF_NULL(device_address);
     kernel::AddressPtr input = std::make_shared<kernel::Address>();
     MS_EXCEPTION_IF_NULL(input);
@@ -844,7 +847,7 @@ void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod
   }
 
   for (size_t i = 0; i < kernel_mod.GetOutputSizeList().size(); ++i) {
-    auto device_address = AnfAlgo::GetOutputAddr(kernel, i);
+    auto device_address = AnfAlgo::GetOutputAddr(kernel, i, visit_nop_node);
     kernel::AddressPtr output = std::make_shared<kernel::Address>();
     MS_EXCEPTION_IF_NULL(output);
     output->addr = device_address->ptr_;
diff --git a/mindspore/core/ir/device_sync.h b/mindspore/core/ir/device_sync.h
index 2cf7ecd38e..766049370e 100644
--- a/mindspore/core/ir/device_sync.h
+++ b/mindspore/core/ir/device_sync.h
@@ -33,6 +33,7 @@ class DeviceSync {
   virtual bool SyncDeviceToHost(const ShapeVector &shape, size_t size, TypeId type, void *host_ptr) const = 0;
   virtual bool SyncHostToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *host_ptr) const = 0;
   virtual void *GetMutablePtr() const = 0;
+  virtual void ClearDeviceMemory() = 0;
 };
 using DeviceSyncPtr = std::shared_ptr<DeviceSync>;
 }  // namespace mindspore