diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc
index 9658c3af48..9e30b4bb02 100644
--- a/mindspore/ccsrc/backend/session/ascend_session.cc
+++ b/mindspore/ccsrc/backend/session/ascend_session.cc
@@ -1018,7 +1018,6 @@ void AscendSession::AdjustKernel(const std::shared_ptr<KernelGraph> &kernel_grap
 
 void AscendSession::RunOpAdjustKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
   MS_LOG(INFO) << "Start!";
-  opt::HideNopNode(kernel_graph.get());
   // Insert CLearZero op
   // prepare for next step from json get atomic info
   BuildKernel(kernel_graph);
@@ -1079,7 +1078,6 @@ void AscendSession::RunOpMemoryAlloc(const std::vector<tensor::TensorPtr> &input
                                      KernelGraph *kernel_graph) const {
   MS_LOG(INFO) << "Start memory alloc!";
   MS_EXCEPTION_IF_NULL(kernel_graph);
-  opt::RemoveNopNode(kernel_graph);
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
   runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph);
diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc
index 930433bc9f..bafd9fe266 100644
--- a/mindspore/ccsrc/backend/session/gpu_session.cc
+++ b/mindspore/ccsrc/backend/session/gpu_session.cc
@@ -418,8 +418,6 @@ void GPUSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &grap
   SelectKernel(kernel_graph);
   RunOpHardwareOptimize(kernel_graph);
   StartKernelRT();
-  // Hide NopOp from execution graph
-  opt::HideNopNode(kernel_graph.get());
   BuildKernel(kernel_graph);
   run_op_graphs_[graph_info] = kernel_graph;
 }
@@ -434,8 +432,6 @@ void GPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
   // run op
   auto kernel_graph = run_op_graphs_[graph_info];
   MS_EXCEPTION_IF_NULL(kernel_graph);
-  // Remove NopOp from execution graph
-  opt::RemoveNopNode(kernel_graph.get());
   RunOpAllocateMemory(*input_tensors, kernel_graph.get());
   // Execute the computation
   LoadInputData(kernel_graph, *input_tensors);
diff --git a/mindspore/ccsrc/backend/session/session_basic.cc b/mindspore/ccsrc/backend/session/session_basic.cc
index 33fe155184..18df95c427 100644
--- a/mindspore/ccsrc/backend/session/session_basic.cc
+++ b/mindspore/ccsrc/backend/session/session_basic.cc
@@ -1173,7 +1173,12 @@ void SessionBasic::UpdateOutputs(const std::shared_ptr<KernelGraph> &kernel_grap
     auto &tensor = item.first;
     auto &node = item.second.first;
     auto &output_index = item.second.second;
-    auto address = AnfAlgo::GetMutableOutputAddr(node, output_index);
+    DeviceAddressPtr address = nullptr;
+    if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {
+      address = AnfAlgo::GetMutableOutputAddr(node, output_index, false);
+    } else {
+      address = AnfAlgo::GetMutableOutputAddr(node, output_index);
+    }
     MS_EXCEPTION_IF_NULL(tensor);
     tensor->set_device_address(address);
     tensor->SetNeedWait(false);
diff --git a/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc b/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc
index 302072cfed..39f4c54c90 100644
--- a/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc
+++ b/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc
@@ -988,7 +988,7 @@ AnfNodePtr PynativeExecutor::GetInput(const py::object &obj, bool op_mask) {
     // out = op(cell1(x, y))
     // out = op(cell1(x, y)[0])
     node = GetObjNode(obj, obj_id);
-  } else if (py::isinstance<py::tuple>(obj)) {
+  } else if (py::isinstance<py::tuple>(obj) || py::isinstance<py::list>(obj)) {
     // out = op((x, y))
     // out = cell((x, y))
     auto tuple = obj.cast<py::tuple>();
@@ -1100,6 +1100,23 @@ void PynativeExecutor::CleanPreMemoryInValueNode(const std::string &cell_id) {
     top_cell_id_ = cell_id;
     return;
   }
+  if (dynamic_cell_) {
+    std::set<std::string> forward_op_tensor_id;
+    for (const auto &elem : cell_op_index_with_tensor_id_[top_cell_id_]) {
+      const auto &tensor_id_list = elem.second;
+      for (const auto &tensor_id : tensor_id_list) {
+        forward_op_tensor_id.emplace(tensor_id);
+      }
+    }
+    for (auto &tensor : all_value_node_tensors_) {
+      if (tensor->device_address() != nullptr &&
+          forward_op_tensor_id.find(tensor->id()) != forward_op_tensor_id.end()) {
+        tensor->device_address()->ClearDeviceMemory();
+        tensor->set_device_address(nullptr);
+      }
+    }
+    all_value_node_tensors_.clear();
+  }
   const auto &tensor_id_with_tensor = cell_tensor_id_with_tensor_[top_cell_id_];
   for (const auto &elem : tensor_id_with_tensor) {
     const auto &tensors_in_value_node = elem.second;
@@ -2111,6 +2128,37 @@ std::string PynativeExecutor::GetGradCellId(bool has_sens, const py::object &cel
   return cell_id;
 }
 
+void PynativeExecutor::SaveAllValueNodeTensors(const FuncGraphPtr &graph) {
+  std::unordered_set<tensor::TensorPtr> all_value_node_tensors;
+  auto trace_function = [&all_value_node_tensors](const AnfNodePtr &anf_node) {
+    auto value = GetValueNode(anf_node);
+    if (value) {
+      if (value->isa<tensor::Tensor>()) {
+        auto tensor = value->cast<tensor::TensorPtr>();
+        MS_EXCEPTION_IF_NULL(tensor);
+        if (tensor->device_address()) {
+          all_value_node_tensors.emplace(tensor);
+        }
+      } else if (value->isa<ValueTuple>()) {
+        auto tuple = value->cast<ValueTuplePtr>();
+        MS_EXCEPTION_IF_NULL(tuple);
+        for (size_t i = 0; i < tuple->size(); i++) {
+          if ((*tuple)[i]->isa<tensor::Tensor>()) {
+            auto tensor = (*tuple)[i]->cast<tensor::TensorPtr>();
+            MS_EXCEPTION_IF_NULL(tensor);
+            if (tensor->device_address()) {
+              all_value_node_tensors.emplace(tensor);
+            }
+          }
+        }
+      }
+    }
+    return FOLLOW;
+  };
+  (void)TopoSort(graph->get_return(), SuccDeeperSimple, trace_function);
+  all_value_node_tensors_ = all_value_node_tensors;
+}
+
 void PynativeExecutor::GradNetInner(const GradOperationPtr &grad, const py::object &cell, const py::object &weights,
                                     const py::args &args) {
   auto size = args.size();
@@ -2152,6 +2200,9 @@ void PynativeExecutor::GradNetInner(const GradOperationPtr &grad, const py::obje
   resource->results()[pipeline::kBackend] = compile::CreateBackend();
 
   MS_LOG(INFO) << "Start opt";
+  if (dynamic_cell_) {
+    SaveAllValueNodeTensors(resource->func_graph());
+  }
   PynativeOptimizeAction(resource);
   SaveTensorsInValueNode(resource);
   TaskEmitAction(resource);
diff --git a/mindspore/ccsrc/pipeline/pynative/pynative_execute.h b/mindspore/ccsrc/pipeline/pynative/pynative_execute.h
index 55bcfb7836..eb330fabd2 100644
--- a/mindspore/ccsrc/pipeline/pynative/pynative_execute.h
+++ b/mindspore/ccsrc/pipeline/pynative/pynative_execute.h
@@ -200,6 +200,7 @@ class PynativeExecutor : public std::enable_shared_from_this<PynativeExecutor> {
   // Update the abstract and device address info of value node and tensors in bprop graph
   void UpdateAbstractAndDeviceAddress(const OpExecInfoPtr &op_exec_info, const py::object &out_real);
   void SaveTensorsInValueNode(const ResourcePtr &resource);
+  void SaveAllValueNodeTensors(const FuncGraphPtr &graph);
   void CleanPreMemoryInValueNode(const std::string &cell_id);
 
   // Construct grad graph
@@ -306,6 +307,7 @@ class PynativeExecutor : public std::enable_shared_from_this<PynativeExecutor> {
   std::unordered_map<std::string, TensorIdWithTensor> cell_tensor_id_with_tensor_;
   std::unordered_map<std::string, abstract::AbstractBasePtr> node_abs_map_;
   std::unordered_map<std::string, AbstractListMap> prim_abs_list_;
+  std::unordered_set<tensor::TensorPtr> all_value_node_tensors_;
 };
 
 using PynativeExecutorPtr = std::shared_ptr<PynativeExecutor>;
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
index 1639051cf1..c6147e28b0 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
@@ -612,7 +612,7 @@ bool AscendDeviceAddress::ConvertFormatAndSyncHostToDevice(const ShapeVector &sh
   return sync_ok;
 }
 
-AscendDeviceAddress::~AscendDeviceAddress() {
+void AscendDeviceAddress::ClearDeviceMemory() {
   if (ptr_ == nullptr) {
     return;
   }
@@ -627,6 +627,8 @@ AscendDeviceAddress::~AscendDeviceAddress() {
   }
 }
 
+AscendDeviceAddress::~AscendDeviceAddress() { ClearDeviceMemory(); }
+
 bool AscendDeviceAddress::DumpMemToFile(bool trans_flag, const std::string &filepath, const std::string &host_fmt,
                                         const ShapeVector &host_shape, TypeId host_type) const {
   bool ret = false;
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.h b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.h
index 393525c2e3..819e1a325a 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.h
@@ -41,6 +41,7 @@ class AscendDeviceAddress : public DeviceAddress {
   ~AscendDeviceAddress() override;
   bool SyncDeviceToHost(const ShapeVector &shape, size_t size, TypeId type, void *host_ptr) const override;
   bool SyncHostToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *host_ptr) const override;
+  void ClearDeviceMemory() override;
   DeviceAddressType DeviceType() const override { return DeviceAddressType::kAscend; }
   bool DumpMemToFile(bool dump_mode, const std::string &filepath, const std::string &host_fmt,
                      const ShapeVector &host_shape, TypeId host_type) const override;
diff --git a/mindspore/ccsrc/runtime/device/cpu/cpu_device_address.h b/mindspore/ccsrc/runtime/device/cpu/cpu_device_address.h
index d73804c324..bf8230ec35 100644
--- a/mindspore/ccsrc/runtime/device/cpu/cpu_device_address.h
+++ b/mindspore/ccsrc/runtime/device/cpu/cpu_device_address.h
@@ -35,6 +35,7 @@ class CPUDeviceAddress : public DeviceAddress {
 
   bool SyncDeviceToHost(const ShapeVector &shape, size_t size, TypeId type, void *host_ptr) const override;
   bool SyncHostToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *host_ptr) const override;
+  void ClearDeviceMemory() override {}
   DeviceAddressType DeviceType() const override { return DeviceAddressType::kCPU; }
 };
 }  // namespace cpu
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
index af2570598b..4893ebdc38 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
@@ -69,7 +69,7 @@ bool GPUDeviceAddress::SyncHostToDevice(const ShapeVector &, size_t size, TypeId
   return GPUDeviceManager::GetInstance().SyncStream(stream);
 }
 
-GPUDeviceAddress::~GPUDeviceAddress() {
+void GPUDeviceAddress::ClearDeviceMemory() {
   if (ptr_ == nullptr) {
     return;
   }
@@ -78,6 +78,8 @@ GPUDeviceAddress::~GPUDeviceAddress() {
     ptr_ = nullptr;
   }
 }
+
+GPUDeviceAddress::~GPUDeviceAddress() { ClearDeviceMemory(); }
 #ifdef ENABLE_DEBUGGER
 bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
                                      const ShapeVector &host_shape, TypeId host_type, size_t slot,
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h
index a98f67786b..943ca8e596 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h
@@ -38,6 +38,7 @@ class GPUDeviceAddress : public DeviceAddress {
 
   bool SyncDeviceToHost(const ShapeVector &shape, size_t size, TypeId type, void *host_ptr) const override;
   bool SyncHostToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *host_ptr) const override;
+  void ClearDeviceMemory() override;
   void set_status(DeviceAddressStatus status) { status_ = status; }
   DeviceAddressStatus status() const { return status_; }
   DeviceAddressType DeviceType() const override { return DeviceAddressType::kGPU; }
diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.cc b/mindspore/ccsrc/runtime/device/kernel_runtime.cc
index fe9dc119aa..c32c0b5ae1 100644
--- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc
@@ -819,6 +819,9 @@ void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod
   if (AnfAlgo::GetCNodeName(cnode) == kAtomicAddrCleanOpName) {
     return GenAddrCleanLaunchArgs(cnode, kernel_inputs);
   }
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  auto visit_nop_node = (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode);
   for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
     auto op_name = AnfAlgo::GetCNodeName(cnode);
     constexpr auto none_placeholder_index = 3;
@@ -833,7 +836,7 @@ void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod
       }
     }
     auto real_input = AnfAlgo::GetRealInputIndex(kernel, i);
-    auto device_address = AnfAlgo::GetPrevNodeOutputAddr(kernel, real_input);
+    auto device_address = AnfAlgo::GetPrevNodeOutputAddr(kernel, real_input, visit_nop_node);
     MS_EXCEPTION_IF_NULL(device_address);
     kernel::AddressPtr input = std::make_shared<kernel::Address>();
     MS_EXCEPTION_IF_NULL(input);
@@ -844,7 +847,7 @@ void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod
   }
 
   for (size_t i = 0; i < kernel_mod.GetOutputSizeList().size(); ++i) {
-    auto device_address = AnfAlgo::GetOutputAddr(kernel, i);
+    auto device_address = AnfAlgo::GetOutputAddr(kernel, i, visit_nop_node);
     kernel::AddressPtr output = std::make_shared<kernel::Address>();
     MS_EXCEPTION_IF_NULL(output);
     output->addr = device_address->ptr_;
diff --git a/mindspore/core/ir/device_sync.h b/mindspore/core/ir/device_sync.h
index 2cf7ecd38e..766049370e 100644
--- a/mindspore/core/ir/device_sync.h
+++ b/mindspore/core/ir/device_sync.h
@@ -33,6 +33,7 @@ class DeviceSync {
   virtual bool SyncDeviceToHost(const ShapeVector &shape, size_t size, TypeId type, void *host_ptr) const = 0;
   virtual bool SyncHostToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *host_ptr) const = 0;
   virtual void *GetMutablePtr() const = 0;
+  virtual void ClearDeviceMemory() = 0;
 };
 using DeviceSyncPtr = std::shared_ptr<DeviceSync>;
 }  // namespace mindspore