From 83399c1b8de5a4e16db91d79913fc550a0ea7db0 Mon Sep 17 00:00:00 2001 From: wenfangpei Date: Tue, 6 Apr 2021 11:06:50 +0800 Subject: [PATCH] adapt for layernorm C++ code --- .../graph_kernel/model/graph_split.py | 2 +- .../graph_kernel/graph_kernel_helper.cc | 14 ++++-- .../graph_kernel/graph_kernel_optimization.cc | 20 ++++---- .../graph_kernel/graph_kernel_splitter.cc | 10 +++- .../ccsrc/backend/session/ascend_session.cc | 1 + .../device/ascend/kernel_select_ascend.cc | 50 +++++++++++++++++++ .../device/ascend/kernel_select_ascend.h | 1 + 7 files changed, 82 insertions(+), 16 deletions(-) diff --git a/mindspore/_extends/graph_kernel/model/graph_split.py b/mindspore/_extends/graph_kernel/model/graph_split.py index 3051de7253..ee54838eea 100644 --- a/mindspore/_extends/graph_kernel/model/graph_split.py +++ b/mindspore/_extends/graph_kernel/model/graph_split.py @@ -472,7 +472,7 @@ class GraphSplitAscend(GraphSplitByPattern): def get_default_mode(self, op): if op.prim == "MatMul": return self.Area.MODE_COMPOSITE if op.inputs[0].dtype == "float16" else self.Area.MODE_BASIC - if op.prim in ("Tile", "BroadcastTo"): + if op.prim in ("Tile", "BroadcastTo", "ExpandDims"): return self.Area.MODE_COMPOSITE return self.Area.MODE_BASIC diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc index edbb9ddfe0..6a5839926d 100644 --- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc +++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc @@ -34,7 +34,9 @@ #include "pipeline/jit/action.h" #include "utils/context/graph_kernel_flags.h" #include "vm/segment_runner.h" -#if ENABLE_GPU +#if ENABLE_D +#include "runtime/device/ascend/kernel_select_ascend.h" +#elif ENABLE_GPU #include "runtime/device/gpu/kernel_info_setter.h" #endif @@ -620,7 +622,11 @@ bool IsBasicFuseOp(const AnfNodePtr &node) { std::vector basic_ops = GetFusibleOpList(); #if 
ENABLE_D if (!CheckProcessor(node)) { - return false; + std::vector fused_aicpu_op = {prim::kPrimExpandDims, prim::kPrimReshape}; + if (!std::any_of(fused_aicpu_op.begin(), fused_aicpu_op.end(), + [&node](const PrimitivePtr &prim) { return IsPrimitiveCNode(node, prim); })) { + return false; + } } #endif return std::any_of(basic_ops.begin(), basic_ops.end(), @@ -630,7 +636,9 @@ bool IsBasicFuseOp(const AnfNodePtr &node) { void ResetKernelInfo(const AnfNodePtr &node, KernelType kernel_type) { auto cnode = node->cast(); MS_EXCEPTION_IF_NULL(cnode); -#if ENABLE_GPU +#if ENABLE_D + device::ascend::SetKernelInfo(cnode, kernel_type); +#elif ENABLE_GPU device::gpu::SetKernelInfo(cnode, kernel_type); #endif } diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc index 1e39d4167c..9359e9d279 100644 --- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc +++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc @@ -100,17 +100,17 @@ PassManagerPtr GraphKernelOptimizer::HighLevelOpt1() { PassManagerPtr GraphKernelOptimizer::Split() { auto pm = std::make_shared("graphkernel_stage4_split"); + // Move the non-scalar tensor (in composite node) to parameter list pm->AddPass(std::make_shared()); // Make certain nodes redundant so that they are used by only one user, // which can avoid unnecessary input-output and get better performance. 
- if (is_gpu) { - // preprocess for ShapeOpsSplitter - pm->AddPass(std::make_shared()); - std::vector duplicated_ops = {prim::kPrimReshape, prim::kPrimExpandDims, prim::kPrimCast}; - pm->AddPass(std::make_shared(duplicated_ops)); - } + + // preprocess for ShapeOpsSplitter + pm->AddPass(std::make_shared()); + std::vector duplicated_ops = {prim::kPrimReshape, prim::kPrimExpandDims, prim::kPrimCast}; + pm->AddPass(std::make_shared(duplicated_ops)); // Split kernel according to costmodel pm->AddPass(std::make_shared()); @@ -120,11 +120,9 @@ PassManagerPtr GraphKernelOptimizer::Split() { pm->AddPass(std::make_shared()); // Eliminate the redundant node that is copied above but not handled by GraphKernelSplitter - if (is_gpu) { - pm->AddPass(std::make_shared()); - pm->AddPass(std::make_shared()); - pm->AddPass(std::make_shared()); - } + pm->AddPass(std::make_shared()); + pm->AddPass(std::make_shared()); + pm->AddPass(std::make_shared()); return pm; } diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_splitter.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_splitter.cc index 625d09edbc..a50aebc5f3 100644 --- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_splitter.cc +++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_splitter.cc @@ -359,12 +359,19 @@ class Splitter { Splitter(const CNodePtr &main_cnode, SplitSchemerPtr split_schemer) : main_func_graph_(main_cnode->func_graph()), old_subgraph_cnode_(main_cnode), split_schemer_(split_schemer) {} + void ResetInlinedNodesKernelInfo() { + for (const auto &node : inlined_nodes_) { + ResetKernelInfo(node); + } + } + // Maintain new subgraphs in main graph. 
void RebuildGraph(const std::vector &cnodes_group_id) { BindFuncGraph(); RecoverParameter(); ConnectToMainGraph(cnodes_group_id); UpdateSubGraphInfo(); + ResetInlinedNodesKernelInfo(); } // Rebind nodes to its new sub_func_graph @@ -420,7 +427,7 @@ class Splitter { } } if (AnfAlgo::IsRealKernel(node)) { - ResetKernelInfo(node); + inlined_nodes_.push_back(node); } } } @@ -533,6 +540,7 @@ class Splitter { FuncGraphPtr main_func_graph_; CNodePtr old_subgraph_cnode_; // The cnode that holds the original sub_func_graph std::vector new_subgraph_cnodes_; // The cnode list that hold the new sub_func_graph + std::vector inlined_nodes_; SplitSchemerPtr split_schemer_; std::unordered_map param_to_main_graph_node_map_; }; diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc index d3b9d840c3..bcb08ba618 100644 --- a/mindspore/ccsrc/backend/session/ascend_session.cc +++ b/mindspore/ccsrc/backend/session/ascend_session.cc @@ -54,6 +54,7 @@ #include "debug/data_dump/dump_json_parser.h" #include "debug/tensor_load.h" #include "debug/anf_ir_utils.h" +#include "backend/optimizer/graph_kernel/shape_ops_splitter.h" #include "backend/optimizer/graph_kernel/graph_kernel_optimization.h" #include "backend/session/ascend_auto_monad.h" #include "debug/data_dump/e2e_dump_util.h" diff --git a/mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc b/mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc index c335ec3a46..7aac83e526 100644 --- a/mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc +++ b/mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc @@ -515,6 +515,56 @@ KernelSelectStatus SelectKernelInfo(const CNodePtr &kernel_node, KernelType kern } return select_status; } + +void SetKernelInfo(const CNodePtr &kernel_node, KernelType kernel_type) { + auto kernel_info = static_cast(kernel_node->kernel_info()); + MS_EXCEPTION_IF_NULL(kernel_info); + auto kernel_build_info = 
kernel_info->select_kernel_build_info(); + MS_EXCEPTION_IF_NULL(kernel_build_info); + + if (AnfAlgo::IsGraphKernel(kernel_node)) { + return; + } + + auto builder = std::make_shared(); + builder->SetOriginDataFormat(kernel_build_info->GetOriginDataFormat()); + builder->SetInputsFormat(kernel_build_info->GetAllInputFormats()); + builder->SetInputsDeviceType(kernel_build_info->GetAllInputDeviceTypes()); + builder->SetOutputsFormat(kernel_build_info->GetAllOutputFormats()); + builder->SetOutputsDeviceType(kernel_build_info->GetAllOutputDeviceTypes()); + builder->SetOpPattern(kernel_build_info->op_pattern()); + builder->SetFusionType(kernel_build_info->fusion_type()); + + auto new_kernel_type = kernel_type; + auto new_processor = kernel_build_info->processor(); + if (kernel_type == UNKNOWN_KERNEL_TYPE) { + std::vector> kernel_info_list; + std::vector> aicpu_kernel_info_list; + kernel::KernelQuery(kernel_node, &kernel_info_list, kernel_type); + auto select_status = SetMatchedKernelInfo(kernel_node, kernel_info_list); + if (select_status != kNoMatched) { + new_kernel_type = TBE_KERNEL; + new_processor = kernel::Processor::AICORE; + MS_LOG(INFO) << kernel_node->fullname_with_scope() << " uses TBE_KERNEL"; + } else { + kernel::AICPUQuery(kernel_node, &aicpu_kernel_info_list); + select_status = SetMatchedKernelInfo(kernel_node, aicpu_kernel_info_list); + if (select_status != kNoMatched) { + new_kernel_type = AICPU_KERNEL; + new_processor = kernel::Processor::AICPU; + MS_LOG(INFO) << kernel_node->fullname_with_scope() << " uses AICPU_KERNEL"; + } + } + } + if (new_kernel_type == UNKNOWN_KERNEL_TYPE) { + new_kernel_type = AKG_KERNEL; + new_processor = kernel::Processor::AICORE; + MS_LOG(INFO) << kernel_node->fullname_with_scope() << " uses AKG_KERNEL"; + } + builder->SetKernelType(new_kernel_type); + builder->SetProcessor(new_processor); + AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), kernel_node.get()); +} } // namespace ascend } // namespace device } // namespace 
mindspore diff --git a/mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.h b/mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.h index 064d898500..b4fd09a98a 100644 --- a/mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.h +++ b/mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.h @@ -31,6 +31,7 @@ KernelSelectStatus SelectKernelInfo(const CNodePtr &kernel_node, KernelType kernel_type = KernelType::UNKNOWN_KERNEL_TYPE); void SetTensorDeviceInfo(const CNodePtr &kernel_node); void SelectGraphKernelInfo(const CNodePtr &kernel_node, const FuncGraphPtr &func_graph); +void SetKernelInfo(const CNodePtr &kernel_node, KernelType kernel_type); } // namespace ascend } // namespace device } // namespace mindspore