!2183 gpu: add graph cache for PyNative mode

Merge pull request !2183 from limingqi107/gpu_pynative_optimize
pull/2183/MERGE
mindspore-ci-bot 5 years ago committed by Gitee
commit 893afd3796

@ -153,6 +153,34 @@ void KernelRuntime::RunOpAssignMemory(const std::vector<tensor::TensorPtr> &inpu
UpdateRefNodeOutputMem(graph); UpdateRefNodeOutputMem(graph);
} }
void KernelRuntime::RunOpClearMemory(session::KernelGraph *graph) {
  // Detach every device-address handle cached on the graph after a single-op
  // (PyNative) run, so the cached graph can be re-executed with fresh memory.
  // NOTE(review): SetOutputAddr(nullptr, ...) drops the node's address
  // reference; actual device memory is presumably released through the
  // address object's ownership/ref-count — confirm against DeviceAddress.
  MS_EXCEPTION_IF_NULL(graph);
  // clear input parameter memory resource
  for (const auto &input_node : graph->inputs()) {
    MS_EXCEPTION_IF_NULL(input_node);
    AnfAlgo::SetOutputAddr(nullptr, 0, input_node.get());
  }
  // clear input value node memory resource
  for (const auto &value_node : graph->graph_value_nodes()) {
    MS_EXCEPTION_IF_NULL(value_node);
    AnfAlgo::SetOutputAddr(nullptr, 0, value_node.get());
  }
  for (const auto &cnode : graph->execution_order()) {
    MS_EXCEPTION_IF_NULL(cnode);
    // clear output memory resource; hoist the loop-invariant output count
    // instead of re-querying it on every iteration
    const size_t output_num = AnfAlgo::GetOutputTensorNum(cnode);
    for (size_t index = 0; index < output_num; ++index) {
      AnfAlgo::SetOutputAddr(nullptr, index, cnode.get());
    }
    // clear workspace memory resource; bind by const reference to avoid
    // copying the workspace size list
    auto kernel_mod = AnfAlgo::GetKernelMod(cnode);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    const auto &workspace_sizes = kernel_mod->GetWorkspaceSizeList();
    for (size_t index = 0; index < workspace_sizes.size(); ++index) {
      AnfAlgo::SetWorkspaceAddr(nullptr, index, cnode.get());
    }
  }
}
void KernelRuntime::AssignStaticMemory(session::KernelGraph *graph) { void KernelRuntime::AssignStaticMemory(session::KernelGraph *graph) {
AssignStaticMemoryInput(graph); AssignStaticMemoryInput(graph);
AssignStaticMemoryValueNode(graph); AssignStaticMemoryValueNode(graph);

@ -47,6 +47,7 @@ class KernelRuntime {
virtual bool Init() = 0; virtual bool Init() = 0;
virtual void AssignMemory(session::KernelGraph *graph); virtual void AssignMemory(session::KernelGraph *graph);
void RunOpAssignMemory(const std::vector<tensor::TensorPtr> &input_tensors, session::KernelGraph *graph); void RunOpAssignMemory(const std::vector<tensor::TensorPtr> &input_tensors, session::KernelGraph *graph);
void RunOpClearMemory(session::KernelGraph *graph);
virtual bool Run(session::KernelGraph *graph); virtual bool Run(session::KernelGraph *graph);
virtual bool DumpData(session::KernelGraph *graph); virtual bool DumpData(session::KernelGraph *graph);
virtual bool RunTask(const session::KernelGraph *graph); virtual bool RunTask(const session::KernelGraph *graph);

@ -131,34 +131,6 @@ std::vector<BaseRef> GetRealArgs(const KernelGraphPtr graph, const VectorRef &ar
return real_args; return real_args;
} }
void ClearRunOpMemoryResource(const KernelGraphPtr &kernel_graph) {
  // Reset the device-address handles cached on a single-op graph so a later
  // run starts from a clean memory state.
  MS_EXCEPTION_IF_NULL(kernel_graph);
  // Reset addresses cached on graph inputs (parameters).
  for (const auto &input : kernel_graph->inputs()) {
    MS_EXCEPTION_IF_NULL(input);
    AnfAlgo::SetOutputAddr(nullptr, 0, input.get());
  }
  // Reset addresses cached on the graph's value nodes.
  for (const auto &value : kernel_graph->graph_value_nodes()) {
    MS_EXCEPTION_IF_NULL(value);
    AnfAlgo::SetOutputAddr(nullptr, 0, value.get());
  }
  // Reset per-kernel output and workspace addresses.
  for (const auto &kernel : kernel_graph->execution_order()) {
    MS_EXCEPTION_IF_NULL(kernel);
    for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(kernel); ++i) {
      AnfAlgo::SetOutputAddr(nullptr, i, kernel.get());
    }
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto workspace_sizes = kernel_mod->GetWorkspaceSizeList();
    for (size_t i = 0; i < workspace_sizes.size(); ++i) {
      AnfAlgo::SetWorkspaceAddr(nullptr, i, kernel.get());
    }
  }
}
std::vector<CNodePtr> GetCNodes(const std::vector<AnfNodePtr> &anf_nodes) { std::vector<CNodePtr> GetCNodes(const std::vector<AnfNodePtr> &anf_nodes) {
std::vector<CNodePtr> cnodes = {}; std::vector<CNodePtr> cnodes = {};
size_t i = 0; size_t i = 0;
@ -518,7 +490,7 @@ py::tuple AscendSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &gr
} }
py::object tuple_obj = utils::cast<PyObjectRef>(output_tensors).object_; py::object tuple_obj = utils::cast<PyObjectRef>(output_tensors).object_;
py::tuple tuple_tensors = py::cast<py::tuple>(tuple_obj); py::tuple tuple_tensors = py::cast<py::tuple>(tuple_obj);
ClearRunOpMemoryResource(graph); RunOpMemoryClear(graph.get());
MS_LOG(INFO) << "Run op " << op_run_info.op_name << " finish!"; MS_LOG(INFO) << "Run op " << op_run_info.op_name << " finish!";
return tuple_tensors; return tuple_tensors;
} }
@ -652,6 +624,13 @@ void AscendSession::RunOpMemoryAlloc(const std::vector<tensor::TensorPtr> &input
MS_LOG(INFO) << "Finish!"; MS_LOG(INFO) << "Finish!";
} }
void AscendSession::RunOpMemoryClear(KernelGraph *kernel_graph) const {
  // Delegate the memory clearing to the Ascend kernel runtime for this device.
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto runtime = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
  MS_EXCEPTION_IF_NULL(runtime);
  runtime->RunOpClearMemory(kernel_graph);
}
void AscendSession::GenerateTaskInfo(const std::shared_ptr<KernelGraph> &kernel_graph) const { void AscendSession::GenerateTaskInfo(const std::shared_ptr<KernelGraph> &kernel_graph) const {
MS_LOG(INFO) << "Start!"; MS_LOG(INFO) << "Start!";
(void)device::KernelAdjust::GetInstance().StepLoadCtrlInputs(kernel_graph); (void)device::KernelAdjust::GetInstance().StepLoadCtrlInputs(kernel_graph);

@ -80,6 +80,7 @@ class AscendSession : public SessionBasic {
void BuildKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const; void BuildKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const;
void MemoryAlloc(KernelGraph *kernel_graph) const; void MemoryAlloc(KernelGraph *kernel_graph) const;
void RunOpMemoryAlloc(const std::vector<tensor::TensorPtr> &input_tensors, KernelGraph *kernel_graph) const; void RunOpMemoryAlloc(const std::vector<tensor::TensorPtr> &input_tensors, KernelGraph *kernel_graph) const;
void RunOpMemoryClear(KernelGraph *kernel_graph) const;
void GenerateTaskInfo(const std::shared_ptr<KernelGraph> &kernel_graph) const; void GenerateTaskInfo(const std::shared_ptr<KernelGraph> &kernel_graph) const;
void LoadTask(const std::shared_ptr<KernelGraph> &kernel_graph) const; void LoadTask(const std::shared_ptr<KernelGraph> &kernel_graph) const;
void ExecTask(const std::shared_ptr<KernelGraph> &kernel_graph) const; void ExecTask(const std::shared_ptr<KernelGraph> &kernel_graph) const;

@ -86,6 +86,13 @@ void GPUSession::RunOpAllocateMemory(const std::vector<tensor::TensorPtr> &input
runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph); runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph);
} }
void GPUSession::RunOpClearMemory(KernelGraph *kernel_graph) const {
  // Delegate the memory clearing to the (single) GPU kernel runtime for this device.
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto runtime = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  MS_EXCEPTION_IF_NULL(runtime);
  runtime->RunOpClearMemory(kernel_graph);
}
void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph, void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs_const) const { const std::vector<tensor::TensorPtr> &inputs_const) const {
std::vector<tensor::TensorPtr> inputs(inputs_const); std::vector<tensor::TensorPtr> inputs(inputs_const);
@ -202,6 +209,10 @@ void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
void GPUSession::BuildOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info, void GPUSession::BuildOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
const std::vector<tensor::TensorPtr> &input_tensors, const std::vector<int> &tensors_mask) { const std::vector<tensor::TensorPtr> &input_tensors, const std::vector<int> &tensors_mask) {
// Check if the graph cache exists.
if (run_op_graphs_.find(graph_info) != run_op_graphs_.end()) {
return;
}
// Prepare the graph // Prepare the graph
auto kernel_graph = ConstructSingleOpGraph(op_run_info, input_tensors, tensors_mask); auto kernel_graph = ConstructSingleOpGraph(op_run_info, input_tensors, tensors_mask);
MS_EXCEPTION_IF_NULL(kernel_graph); MS_EXCEPTION_IF_NULL(kernel_graph);
@ -234,7 +245,7 @@ py::tuple GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph
} }
py::object tuple_obj = utils::cast<PyObjectRef>(output_tensors).object_; py::object tuple_obj = utils::cast<PyObjectRef>(output_tensors).object_;
py::tuple tuple_tensors = py::cast<py::tuple>(tuple_obj); py::tuple tuple_tensors = py::cast<py::tuple>(tuple_obj);
run_op_graphs_.clear(); RunOpClearMemory(kernel_graph.get());
return tuple_tensors; return tuple_tensors;
} }
} // namespace gpu } // namespace gpu

@ -59,6 +59,8 @@ class GPUSession : public SessionBasic {
void RunOpAllocateMemory(const std::vector<tensor::TensorPtr> &input_tensors, KernelGraph *kernel_graph) const; void RunOpAllocateMemory(const std::vector<tensor::TensorPtr> &input_tensors, KernelGraph *kernel_graph) const;
void RunOpClearMemory(KernelGraph *kernel_graph) const;
void LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph, void LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs_const) const override; const std::vector<tensor::TensorPtr> &inputs_const) const override;

Loading…
Cancel
Save