From 76d322464d20f7399160203319cc7795eb92cad7 Mon Sep 17 00:00:00 2001 From: looop5 Date: Sat, 27 Mar 2021 14:44:18 +0800 Subject: [PATCH] unify graph kernel pass add_atomic_clean on Ascend and GPU back-end refactor CanActivateAtomicAdd use smart pointer --- .../graph_kernel/add_atomic_clean.cc | 589 +++++++++++++++--- .../optimizer/graph_kernel/add_atomic_clean.h | 91 ++- .../graph_kernel/add_atomic_clean_gpu.cc | 514 --------------- .../graph_kernel/add_atomic_clean_gpu.h | 68 -- .../add_stitch_atomic_clean_gpu.h | 2 +- .../graph_kernel/graph_kernel_helper.cc | 2 +- .../graph_kernel/graph_kernel_optimization.cc | 5 +- tests/st/ops/graph_kernel/test_atomic_add.py | 27 + 8 files changed, 625 insertions(+), 673 deletions(-) delete mode 100644 mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean_gpu.cc delete mode 100644 mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean_gpu.h diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean.cc index bad58ffd51..92b6a4cae7 100644 --- a/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean.cc +++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2020-2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,12 +15,20 @@ */ #include "backend/optimizer/graph_kernel/add_atomic_clean.h" +#include +#include #include +#include +#include +#include +#include #include -#include #include "base/core_ops.h" +#include "ir/tensor.h" #include "utils/utils.h" #include "utils/log_adapter.h" +#include "backend/kernel_compiler/kernel.h" +#include "backend/optimizer/graph_kernel/graph_kernel_helper.h" #include "backend/session/anf_runtime_algorithm.h" #include "backend/session/kernel_graph.h" #include "debug/anf_ir_dump.h" @@ -28,102 +36,531 @@ namespace mindspore { namespace opt { namespace { -static std::vector g_output_idx; +std::set GetUniqReduceAxes(const AnfNodePtr &node) { + if (!IsPrimitiveCNode(node, prim::kPrimReduceSum)) { + MS_LOG(EXCEPTION) << "Only process for reduce sum!"; + } -bool HasAtomic(const AnfNodePtr &input) { - if (IsPrimitiveCNode(input)) { - const auto &cnode = input->cast(); - const auto &prim = GetValueNode(cnode->input(0)); - return prim->HasAttr("atomic_add"); + auto input = node->cast()->input(kFirstDataInputIndex); + auto src_shape_vec = GetShape(input); + auto axis_vec = GetReduceAxis(node); + if (axis_vec.empty()) { + for (size_t i = 0; i < src_shape_vec.size(); ++i) { + axis_vec.push_back(i); + } + } else { + std::transform(axis_vec.begin(), axis_vec.end(), axis_vec.begin(), + [&src_shape_vec](int64_t axis) -> int64_t { return axis < 0 ? 
axis + src_shape_vec.size() : axis; }); } - return false; + + std::set axis_set(axis_vec.begin(), axis_vec.end()); + return axis_set; } -std::vector CalCleanSize(const CNodePtr &pre_node) { - MS_EXCEPTION_IF_NULL(pre_node); - std::vector clean_size_list; - // clean output - for (auto &index : g_output_idx) { - TypeId output_type_id = AnfAlgo::GetOutputDeviceDataType(pre_node, index); - size_t type_size = GetTypeByte(TypeIdToType(output_type_id)); - std::vector shape = AnfAlgo::GetOutputDeviceShape(pre_node, index); - auto size = std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies()); - clean_size_list.push_back((size + kMemAlignSize + 31) / kMemAlignSize * kMemAlignSize); +bool HaveReduceInPredecessors(const AnfNodePtr &node) { + std::stack st; + st.push(node); + while (!st.empty()) { + auto n = st.top(); + st.pop(); + + if (n != node) { + if (!n->isa()) { + continue; + } + if (IsPrimitiveCNode(n, prim::kPrimReduceSum)) { + return true; + } + } + + auto n_inputs = n->cast()->inputs(); + std::for_each(n_inputs.cbegin() + 1, n_inputs.cend(), [&st](const AnfNodePtr &n) -> void { st.push(n); }); } - MS_LOG(DEBUG) << "Clear output size: " << clean_size_list.size() << ", pre_node: " << pre_node->fullname_with_scope(); - return clean_size_list; + + return false; } -CNodePtr CreateTbeAtomicCleanNode(const std::shared_ptr &kernel_graph, - const mindspore::CNodePtr &pre_node) { - MS_EXCEPTION_IF_NULL(kernel_graph); - MS_EXCEPTION_IF_NULL(pre_node); - auto clean_zero_prim = std::make_shared(kAtomicAddrCleanOpName); - auto new_value_node = NewValueNode(clean_zero_prim); - std::vector inputs = {new_value_node}; - CNodePtr clean_zero = kernel_graph->NewCNode(inputs); - AbstractBasePtr abstract = std::make_shared(); - clean_zero->set_abstract(abstract); - auto builder = std::make_shared(); - builder->SetKernelType(KernelType::TBE_KERNEL); - AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), clean_zero.get()); - auto clean_size = CalCleanSize(pre_node); - AnfAlgo::SetNodeAttr(kAttrAtomicAddMemSize, MakeValue(clean_size), clean_zero); - AnfAlgo::SetNodeAttr(kAttrAtomicOutputIndexs, MakeValue(g_output_idx), clean_zero); - AnfAlgo::SetStreamDistinctionLabel(AnfAlgo::GetStreamDistinctionLabel(pre_node.get()), clean_zero.get()); - return clean_zero; +inline int64_t CalNewIndex(int64_t old_index, int64_t reduce_index) { + return old_index - (old_index > reduce_index ? 1 : 0); } } // namespace +std::shared_ptr AtomicAddChecker::Init() { + auto processor = GetProcessorFromContext(); + if (processor == kernel::Processor::AICORE) { + return std::make_shared(); + } else if (processor == kernel::Processor::CUDA) { + return std::make_shared(); + } + return nullptr; +} + +bool AtomicAddChecker::FindCandidate(const AnfNodePtr &anf_node) { + auto node = anf_node->cast(); + MS_EXCEPTION_IF_NULL(node); + auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(node); + auto mng_sub = sub_graph->manager(); + if (mng_sub == nullptr) { + mng_sub = Manage(sub_graph, false); + sub_graph->set_manager(mng_sub); + } + + // Rule: Only one ReduceSum inside sub-graph. 
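+  // The sub-graph either returns a single node or a MakeTuple over all real outputs.
+  // Record the unique ReduceSum, its position among the real outputs
+  // (reduce_real_output_index) and the total output count; anything other than exactly
+  // one ReduceSum output disqualifies the candidate.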
+ auto real_return_node = sub_graph->get_return()->input(kFirstDataInputIndex); + if (IsPrimitiveCNode(real_return_node, prim::kPrimMakeTuple)) { + AnfNodePtrList reduce_ops; + size_t reduce_cnt = 0; + const auto &inputs = real_return_node->cast()->inputs(); + for (size_t i = 1; i < inputs.size(); ++i) { + if (IsPrimitiveCNode(inputs[i], prim::kPrimReduceSum)) { + atomic_add_info_.atomic_add_node = inputs[i]->cast(); + atomic_add_info_.reduce_real_output_index = i - 1; + reduce_cnt++; + } + } + + if (reduce_cnt != 1) { + return false; + } + atomic_add_info_.real_output_num = inputs.size() - 1; + } else if (IsPrimitiveCNode(real_return_node, prim::kPrimReduceSum)) { + atomic_add_info_.atomic_add_node = real_return_node->cast(); + atomic_add_info_.real_output_num = 1; + } else { + return false; + } + + // Rule: ReduceSum should not fuse any other ops in out direction, which means it should be in output list. + return (mng_sub->node_users()[atomic_add_info_.atomic_add_node].size() <= 1); +} + +bool AtomicAddChecker::CanActivateAtomicAdd(const AnfNodePtr &anf_node) { + // Rules to activate atomic add: + // 1. Find only one ReduceSum inside sub-graph, and it should not fuse any other ops in out direction, + // which mean it should be in output list. + // 2. The reduce axis and reduce number should meet condition: + // (GPU) all-reduce or reduce-x when fuse number is greater than or equal to 1024, or reduce-y. + // (Ascend) all-reduce or non-reduce axes with dimension 1 + // 3. No other ReduceSum as output ReduceSum's predecessors (reduce compile limitation). + + // Rule 1. + if (!FindCandidate(anf_node)) { + return false; + } + + // Rule 2. + if (!SuitableForAtomicAdd(atomic_add_info_.atomic_add_node)) { + return false; + } + + // Rule 3. + return !HaveReduceInPredecessors(atomic_add_info_.atomic_add_node); +} + +bool AtomicAddChecker::Check(const AnfNodePtr &node) { + return (AnfAlgo::IsGraphKernel(node) && CanActivateAtomicAdd(node)); +} + +bool AtomicAddCheckerGPU::SuitableForAtomicAdd(const AnfNodePtr &node) { + auto input = node->cast()->input(kFirstDataInputIndex); + auto src_shape_vec = GetShape(input); + std::set axis_set = GetUniqReduceAxes(node); + + // For reduce whose last dim is reduced (including all-reduce), + // it is suitable for atomic add only the reduce num is greater than or equal to 1024. + if (axis_set.count(src_shape_vec.size() - 1) != 0) { + size_t reduce_size = + std::accumulate(axis_set.begin(), axis_set.end(), 1, + [&src_shape_vec](size_t size, int64_t axis) { return size * src_shape_vec[axis]; }); + return reduce_size >= 1024; + } + + // For reduce whose last dim is not reduced, always true. + return true; +} + +bool AtomicAddCheckerAscend::SuitableForAtomicAdd(const AnfNodePtr &node) { + auto input = node->cast()->input(kFirstDataInputIndex); + auto src_shape_vec = GetShape(input); + std::set axis_set = GetUniqReduceAxes(node); + + // case 1: all reduce + if (src_shape_vec.size() == axis_set.size()) { + return true; + } + // case 2: non-reduce axes with dimension 1 + for (size_t i = 0; i < src_shape_vec.size(); ++i) { + if (axis_set.find(i) == axis_set.end()) { + if (src_shape_vec[i] != 1) { + return false; + } + } + } + return true; +} + +void AtomicCleanInsertter::CorrectKernelBuildInfo(const AnfNodePtr &composite_node, const AnfNodePtr &new_input) { + // Change kernel build info. 
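+  // The composite node gains one extra input (the zero-initialized buffer) and, for
+  // multi-output graphs, drops the ReduceSum output that is now written in place, so the
+  // selected kernel build info is rebuilt with the adjusted input/output formats and
+  // device types.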
+ auto kernel_info = static_cast(composite_node->kernel_info()); + MS_EXCEPTION_IF_NULL(kernel_info); + const auto &origin_kernel_build_info = kernel_info->GetMutableSelectKernelBuildInfo(); + auto origin_inputs_format = origin_kernel_build_info->GetAllInputFormats(); + auto origin_outputs_format = origin_kernel_build_info->GetAllOutputFormats(); + auto origin_inputs_type = origin_kernel_build_info->GetAllInputDeviceTypes(); + auto origin_outputs_type = origin_kernel_build_info->GetAllOutputDeviceTypes(); + auto origin_processor = origin_kernel_build_info->processor(); + + std::vector &new_inputs_format = origin_inputs_format; + std::vector &new_inputs_type = origin_inputs_type; + std::vector new_outputs_format; + std::vector new_outputs_type; + for (size_t i = 0; i < origin_outputs_format.size(); ++i) { + if (real_output_num_ > 1 && i == reduce_real_output_index_) { + continue; + } + new_outputs_format.push_back(origin_outputs_format[i]); + new_outputs_type.push_back(origin_outputs_type[i]); + } + + auto kernel_with_index = AnfAlgo::VisitKernel(new_input, 0); + new_inputs_format.push_back(AnfAlgo::GetOutputFormat(kernel_with_index.first, kernel_with_index.second)); + new_inputs_type.push_back(AnfAlgo::GetOutputDeviceDataType(kernel_with_index.first, kernel_with_index.second)); + + kernel::KernelBuildInfo::KernelBuildInfoBuilder new_info_builder; + new_info_builder.SetInputsFormat(new_inputs_format); + new_info_builder.SetInputsDeviceType(new_inputs_type); + new_info_builder.SetOutputsFormat(new_outputs_format); + new_info_builder.SetOutputsDeviceType(new_outputs_type); + new_info_builder.SetProcessor(origin_processor); + new_info_builder.SetKernelType(KernelType::AKG_KERNEL); + new_info_builder.SetFusionType(kernel::FusionType::OPAQUE); + auto new_selected_info = new_info_builder.Build(); + AnfAlgo::SetSelectKernelBuildInfo(new_selected_info, composite_node.get()); +} + +void AtomicCleanInsertter::CreateInplaceAssignNodeAndCorrectReturn(const FuncGraphPtr &sub_graph, + const AnfNodePtr &new_parameter) { + // add inplaceassign + AnfNodePtr out_node; + bool fake_out = false; + size_t replace_index = 0; + auto retrun_node = sub_graph->get_return()->input(kFirstDataInputIndex); + if (IsPrimitiveCNode(retrun_node, prim::kPrimMakeTuple)) { + const auto &outs = retrun_node->cast()->inputs(); + for (size_t i = 1; i < outs.size(); ++i) { + if (i != reduce_real_output_index_ + 1) { + out_node = outs[i]; + replace_index = i; + break; + } + } + } else { + out_node = atomic_add_node_; // Use result data itself, and set attr "fake_out" true. 
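+    // With a single output there is no other node to route through InplaceAssign, so the
+    // reduce result itself is reused as the pass-through value and the node is marked
+    // "fake_output" to signal that this output is only a placeholder.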
+ fake_out = true; + } + + auto inplace_assign_node = + CreateCNode({NewValueNode(prim::kPrimInplaceAssign), new_parameter, atomic_add_node_, out_node}, sub_graph, + {.format = GetFormat(out_node), .shape = GetShape(out_node), .type = GetType(out_node)}); + SetNodeAttrSafely("fake_output", MakeValue(fake_out), inplace_assign_node); + + CNodePtr new_out_node; + if (real_output_num_ > 2) { + std::vector output_args = {NewValueNode(prim::kPrimMakeTuple)}; + const auto &outs = retrun_node->cast()->inputs(); + for (size_t i = 1; i < outs.size(); ++i) { + if (i == reduce_real_output_index_ + 1) { + continue; + } else if (i == replace_index) { + output_args.push_back(inplace_assign_node); + } else { + output_args.push_back(outs[i]); + } + } + // Set output for AnfGraph + new_out_node = sub_graph->NewCNode(output_args); + } else { + new_out_node = inplace_assign_node; + } + sub_graph->set_output(new_out_node); +} -bool AddAtomicClean(const std::shared_ptr &kernel_graph) { +void AtomicCleanInsertter::CorrectAbstract(const AnfNodePtr &composite_node) { + // If there is only one output(ReduceSum), it should be a fake output with the same abstract with origin output. + if (real_output_num_ <= 1) { + return; + } + + // Change abstract. + auto origin_out_spec = composite_node->abstract()->cast(); + MS_EXCEPTION_IF_NULL(origin_out_spec); + const auto &origin_out_specs = origin_out_spec->elements(); + AbstractBasePtrList new_out_specs; + for (size_t i = 0; i < origin_out_specs.size(); ++i) { + if (i != reduce_real_output_index_) { + new_out_specs.push_back(origin_out_specs[i]); + } + } + composite_node->set_abstract(std::make_shared(new_out_specs)); +} + +void AtomicCleanInsertter::ProcessOriginCNode(const AnfNodePtr &composite_node, const AnfNodePtr &new_input, + const FuncGraphManagerPtr &mng) { + auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(composite_node); + auto mng_sub = sub_graph->manager(); + if (mng_sub == nullptr) { + mng_sub = Manage(sub_graph, false); + sub_graph->set_manager(mng_sub); + } + + // Add atomic attribute to reducesum node. + SetNodeAttrSafely("enable_atomic_add", MakeValue(true), atomic_add_node_); + + // add input + auto inputs = composite_node->cast()->inputs(); + inputs.push_back(new_input); + composite_node->cast()->set_inputs(inputs); + + // add parameter + auto parameter = sub_graph->add_parameter(); + parameter->set_abstract(new_input->abstract()); + parameter->set_kernel_info(new_input->kernel_info_ptr()); + + CreateInplaceAssignNodeAndCorrectReturn(sub_graph, parameter); + + CorrectAbstract(composite_node); + CorrectKernelBuildInfo(composite_node, new_input); + + auto old_graph_name = GetValue(sub_graph->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)); + auto new_graph_name = ExtractGraphKernelName(TopoSort(sub_graph->get_return()), "", "atomic_add"); + sub_graph->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, MakeValue(new_graph_name)); + MS_LOG(INFO) << "Convert " << old_graph_name << " to atomic add graph " << new_graph_name; +} + +void AtomicCleanInsertter::AddDepend(const FuncGraphPtr &main_graph, const AnfNodePtr &clean_node, + const AnfNodePtr &composite_node, const AnfNodePtr &user_node, int index) { + // Create depend node to hold new control depend node. 
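+  // Depend(clean_node, composite_node) forwards the clean node's value and records an
+  // execution dependency on the composite node; the user's input at `index` is then
+  // rerouted through this Depend node.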
+ AnfNodePtrList d_inputs = {NewValueNode(prim::kPrimDepend), clean_node, composite_node}; + auto depend_cnode = main_graph->NewCNode(d_inputs); + depend_cnode->set_abstract(clean_node->abstract()); + main_graph->AddNode(depend_cnode); + + auto user_cnode = user_node->cast(); + MS_EXCEPTION_IF_NULL(user_cnode); + user_cnode->set_input(index, depend_cnode); +} + +CNodePtr AtomicCleanInsertter::InsertUpdateState(const KernelGraphPtr &main_graph, const CNodePtr &composite_node) { + // Insert update_state_node, need mount a monad node. + auto u = NewValueNode(kUMonad); + u->set_abstract(kUMonad->ToAbstract()); + AnfNodePtrList update_state_inputs = {NewValueNode(prim::kPrimUpdateState), u, composite_node}; + auto update_state_cnode = main_graph->NewCNode(update_state_inputs); + main_graph->AddNode(update_state_cnode); + return update_state_cnode; +} + +CNodePtr AtomicCleanInsertter::CreateAtomicCleanCompositeNode(const KernelGraphPtr &main_graph, TypeId dst_type) { + std::set data_support = {kNumberTypeFloat16, kNumberTypeFloat32, kNumberTypeFloat64}; + + if (!std::any_of(data_support.cbegin(), data_support.cend(), [&dst_type](TypeId type) { return dst_type == type; })) { + MS_LOG(EXCEPTION) << "Atomic add not support data type " << dst_type; + } + + // Create zero value which will be broadcast to target shape. + auto format = GetFormat(atomic_add_node_); + auto dtype = (dst_type == kNumberTypeFloat16) ? kNumberTypeFloat32 : dst_type; + ValueNodePtr value_node; + if (dtype == kNumberTypeFloat32) { + value_node = CreateScalarTensorValueNode({.format = format, .shape = {1}, .type = TypeIdToType(dtype)}, + static_cast(0), sizeof(float)); + } else { + value_node = CreateScalarTensorValueNode({.format = format, .shape = {1}, .type = TypeIdToType(dtype)}, + static_cast(0), sizeof(double)); + } + + // Create composite op's sub-graph. + auto new_sub_graph = std::make_shared(); + + AnfNodePtr broadcast_input_node = value_node; + if (dst_type == kNumberTypeFloat16) { + AnfNodePtrList cast_inputs = {NewValueNode(prim::kPrimCast), value_node}; + auto cast_node_inner = + CreateCNode(cast_inputs, new_sub_graph, {.format = format, .shape = {1}, .type = TypeIdToType(dst_type)}); + SetNodeAttrSafely("dst_type", MakeValue("float32"), cast_node_inner); + broadcast_input_node = cast_node_inner; + } + + // Create broadcast basic op. + auto dst_shape_vec = GetShape(atomic_add_node_); + AnfNodePtrList atomic_clean_inputs = {NewValueNode(prim::kPrimBroadcastTo), broadcast_input_node}; + auto broadcast_to_node_inner = CreateCNode( + atomic_clean_inputs, new_sub_graph, {.format = format, .shape = dst_shape_vec, .type = GetType(atomic_add_node_)}); + SetNodeAttrSafely("shape", MakeValue(GetDeviceShape(atomic_add_node_)), broadcast_to_node_inner); + + // Makeup sub-graph. 
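+  // Wrap the zero broadcast into its own graph-kernel sub-graph: the main graph only sees
+  // a composite node whose single output is the zero-filled buffer that the atomic
+  // ReduceSum will accumulate into.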
+ new_sub_graph->set_output(broadcast_to_node_inner); + auto broadcast_to_composite_node = main_graph->NewCNode({NewValueNode(new_sub_graph)}); + broadcast_to_composite_node->set_abstract(broadcast_to_node_inner->abstract()); + SetNewKernelInfo(broadcast_to_composite_node, new_sub_graph, {}, {broadcast_to_node_inner}, + AnfAlgo::GetProcessor(atomic_add_node_)); + auto graph_attr = ExtractGraphKernelName(TopoSort(new_sub_graph->get_return()), "", "atomic_clean"); + new_sub_graph->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, MakeValue(graph_attr)); + new_sub_graph->set_attr("composite_type", MakeValue("atomic_clean")); + + return broadcast_to_composite_node; +} + +std::vector > AtomicCleanInsertter::FindOriginCNodeUsers(const KernelGraphPtr &main_graph, + const AnfNodePtr &composite_node, + const FuncGraphManagerPtr &mng, + bool correct_index) { + std::vector > reduce_user_nodes; + if (real_output_num_ <= 1) { + auto users = mng->node_users()[composite_node]; + std::transform(users.cbegin(), users.cend(), std::back_inserter(reduce_user_nodes), + [](const std::pair &pair) { return pair; }); + } else { + std::vector > getitem_user_nodes; + auto users = mng->node_users()[composite_node]; + for (const auto &node_index : users) { + const auto &user_node = node_index.first; + if (!IsPrimitiveCNode(user_node, prim::kPrimTupleGetItem)) { + continue; + } + auto get_item_cnode = user_node->cast(); + auto value_input = get_item_cnode->input(kInputNodeOutputIndexInTupleGetItem); + MS_EXCEPTION_IF_NULL(value_input); + auto value_node = value_input->cast(); + MS_EXCEPTION_IF_NULL(value_node); + auto item_idx = GetValue(value_node->value()); + if (item_idx == static_cast(reduce_real_output_index_)) { + getitem_user_nodes.push_back(node_index); + } else if (correct_index) { + if (real_output_num_ > 2) { + // Recorrect other getitem index. + int64_t new_item_idx = CalNewIndex(item_idx, reduce_real_output_index_); + AnfNodePtrList new_inputs = {NewValueNode(prim::kPrimTupleGetItem), composite_node, + NewValueNode(new_item_idx)}; + auto new_out = main_graph->NewCNode(new_inputs); + new_out->set_abstract(get_item_cnode->abstract()); + for (const auto &[user, index] : mng->node_users()[get_item_cnode]) { + auto user_cnode = user->cast(); + MS_EXCEPTION_IF_NULL(user_cnode); + user_cnode->set_input(index, new_out); + } + } else { + for (const auto &[user, index] : mng->node_users()[node_index.first]) { + auto user_cnode = user->cast(); + MS_EXCEPTION_IF_NULL(user_cnode); + user_cnode->set_input(index, composite_node); + } + } + } + } + for (auto &pair : getitem_user_nodes) { + // Directory to find real user. + auto real_users = mng->node_users()[pair.first]; + reduce_user_nodes.insert(reduce_user_nodes.end(), real_users.begin(), real_users.end()); + } + } + + return reduce_user_nodes; +} + +void AtomicCleanInsertter::ProcessOriginCNodeUser(const KernelGraphPtr &main_graph, const AnfNodePtr &composite_node, + const AnfNodePtr &broadcast_to_node, + const AnfNodePtr &update_state_node, const FuncGraphManagerPtr &mng) { + // 1. find users, change getitem index if needed. + std::vector > reduce_user_nodes = + FindOriginCNodeUsers(main_graph, composite_node, mng, true); + for (const auto &[user_node, index] : reduce_user_nodes) { + // 2. Make sure modified composite node running first, So firstly, create load_node, then add edge to connect + // update_state_node, broadcat_node and load_node to keep order. 
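+    // Load(broadcast_to_node, update_state_node) reads the cleaned buffer after the
+    // UpdateState that follows the composite node, so each former user of the ReduceSum
+    // output now observes the accumulated result instead of the composite's removed output.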
+ AnfNodePtrList load_inputs = {NewValueNode(prim::kPrimLoad), broadcast_to_node, update_state_node}; + auto load_node = main_graph->NewCNode(load_inputs); + main_graph->AddNode(load_node); + auto user_cnode = user_node->cast(); + MS_EXCEPTION_IF_NULL(user_cnode); + user_cnode->set_input(index, load_node); + to_process_order_.emplace_back(composite_node, user_node); + } +} + +void AtomicCleanInsertter::InsertAtomicClean(const KernelGraphPtr &main_graph, const AnfNodePtr &anf_node, + const FuncGraphManagerPtr &mng) { + auto origin_composite_node = anf_node->cast(); + MS_EXCEPTION_IF_NULL(origin_composite_node); + + // Create broadcst node. + auto out_type = GetType(atomic_add_node_)->cast(); + MS_EXCEPTION_IF_NULL(out_type); + auto broadcast_to_node = CreateAtomicCleanCompositeNode(main_graph, out_type->element()->type_id()); + + // Insert extra input(broadcast node output) to composite node, and make Reducesum inplaceassign to it. + // Note: if it's single output, this will increase total memory because of a fake out. + ProcessOriginCNode(origin_composite_node, broadcast_to_node, mng); + + // Insert update_state_node to keep execution order. + auto update_state_node = InsertUpdateState(main_graph, origin_composite_node); + + // Replace origin ReduceSum's user with atomic clean output + ProcessOriginCNodeUser(main_graph, origin_composite_node, broadcast_to_node, update_state_node, mng); + MS_LOG(INFO) << "Target node: " << origin_composite_node->fullname_with_scope() + << ", clean node: " << broadcast_to_node->fullname_with_scope(); +} + +bool AtomicCleanInsertter::IsExistStructuralObstacle(const KernelGraphPtr &main_graph, const AnfNodePtr &node, + const FuncGraphManagerPtr &mng) { + auto reduce_users = FindOriginCNodeUsers(main_graph, node, mng, false); + // If reduce user is MakeTuple and not last node, there is no cheap method to set right running order between reduce + // node and user node. If reduce is Depend or ControlDepend node, the origin node may be wrong! 
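+  // The predicate returns false for a MakeTuple/Depend/ControlDepend user that is not the
+  // graph's final output, so the whole check is true only when no such user exists;
+  // Run() skips composites for which it fails.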
+ return std::all_of(reduce_users.cbegin(), reduce_users.cend(), + [&main_graph](const std::pair &user_info) -> bool { + auto &user = user_info.first; + if ((IsPrimitiveCNode(user, prim::kPrimMakeTuple) || IsPrimitiveCNode(user, prim::kPrimDepend) || + IsPrimitiveCNode(user, prim::kPrimControlDepend)) && + !(IsPrimitiveCNode(user, prim::kPrimReturn) || user == main_graph->output())) { + return false; + } else { + return true; + } + }); +} + +bool AtomicCleanInsertter::Run(const FuncGraphPtr &func_graph) { + auto kernel_graph = std::dynamic_pointer_cast(func_graph); MS_EXCEPTION_IF_NULL(kernel_graph); auto mng = kernel_graph->manager(); if (mng == nullptr) { mng = Manage(kernel_graph, true); kernel_graph->set_manager(mng); } - auto &todos = kernel_graph->execution_order(); + bool changed = false; - for (auto iter = todos.cbegin(); iter != todos.end(); ++iter) { - auto node = *iter; - if (AnfAlgo::IsGraphKernel(node) && kernel_graph->nodes().contains(node)) { - auto fg = GetValueNode(node->input(kAnfPrimitiveIndex)); - MS_EXCEPTION_IF_NULL(fg); - auto input = fg->get_return()->input(1); - if (IsPrimitiveCNode(input, prim::kPrimMakeTuple)) { - const auto &cnode = input->cast(); - for (size_t i = 0; i < cnode->inputs().size(); ++i) { - if (HasAtomic(cnode->input(i))) { - g_output_idx.push_back(i - 1); - } - } - } else if (HasAtomic(input)) { - g_output_idx.push_back(0); - } + std::shared_ptr atomic_add_checker = AtomicAddChecker::Init(); + if (atomic_add_checker == nullptr) { + return changed; + } - if (!g_output_idx.empty()) { - auto zero_node = CreateTbeAtomicCleanNode(kernel_graph, node); - auto depend = kernel_graph->NewCNode({NewValueNode(prim::kPrimDepend), node->input(1), zero_node}); - std::vector new_input = node->inputs(); - new_input[1] = depend; - auto new_cnode = std::make_shared(new_input, kernel_graph); - // Set abstract - new_cnode->set_abstract(node->abstract()); - // Set kernel info - new_cnode->set_kernel_info(node->kernel_info_ptr()); - mng->Replace(node, new_cnode); - g_output_idx.clear(); - - changed = true; - } + auto topo_nodes = TopoSort(kernel_graph->get_return()); + for (const auto &node : topo_nodes) { + if (!atomic_add_checker->Check(node) || !IsExistStructuralObstacle(kernel_graph, node, mng)) { + continue; } + auto atomic_add_info = atomic_add_checker->GetAtomicAddInfo(); + atomic_add_node_ = atomic_add_info.atomic_add_node; + reduce_real_output_index_ = atomic_add_info.reduce_real_output_index; + real_output_num_ = atomic_add_info.real_output_num; + InsertAtomicClean(kernel_graph, node, mng); + changed = true; } - return changed; -} + if (changed) { + mng->RemoveRoots(); + mng->KeepRoots({func_graph}); + } -bool CleanAddAtomic::Run(const FuncGraphPtr &func_graph) { - return AddAtomicClean(std::dynamic_pointer_cast(func_graph)); + return changed; } } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean.h b/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean.h index 3fe5967eb3..46a64bd478 100644 --- a/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean.h +++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean.h @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2020-2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,23 +14,96 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_CLEAN_ADD_ATOMIC_H_ -#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_CLEAN_ADD_ATOMIC_H_ +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_ADD_ATOMIC_CLEAN_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_ADD_ATOMIC_CLEAN_H_ #include +#include +#include +#include +#include #include "backend/optimizer/common/optimizer.h" #include "backend/session/kernel_graph.h" namespace mindspore { namespace opt { -class CleanAddAtomic : public Pass { +struct AtomicAddInfo { + CNodePtr atomic_add_node{nullptr}; + size_t reduce_real_output_index{0}; + size_t real_output_num{0}; +}; + +class AtomicAddChecker { + public: + AtomicAddChecker() = default; + virtual ~AtomicAddChecker() = default; + static std::shared_ptr Init(); + + bool Check(const AnfNodePtr &node); + AtomicAddInfo GetAtomicAddInfo() { return atomic_add_info_; } + + protected: + virtual bool SuitableForAtomicAdd(const AnfNodePtr &node) { return false; } + virtual bool FindCandidate(const AnfNodePtr &anf_node); + virtual bool CanActivateAtomicAdd(const AnfNodePtr &anf_node); + + private: + AtomicAddInfo atomic_add_info_; +}; + +class AtomicAddCheckerGPU : public AtomicAddChecker { public: - CleanAddAtomic() : Pass("clean_add_atomic") {} - ~CleanAddAtomic() override = default; - bool Run(const FuncGraphPtr &func_graph) override; + AtomicAddCheckerGPU() = default; + ~AtomicAddCheckerGPU() = default; + + protected: + bool SuitableForAtomicAdd(const AnfNodePtr &node) override; +}; + +class AtomicAddCheckerAscend : public AtomicAddChecker { + public: + AtomicAddCheckerAscend() = default; + ~AtomicAddCheckerAscend() = default; + + protected: + bool SuitableForAtomicAdd(const AnfNodePtr &node) override; +}; + +class AtomicCleanInsertter : public Pass { + public: + explicit AtomicCleanInsertter(const std::string &name = "atomic_clean") : Pass(name) {} + ~AtomicCleanInsertter() override = default; + virtual bool Run(const FuncGraphPtr &func_graph); + + protected: + virtual void CorrectKernelBuildInfo(const AnfNodePtr &composite_node, const AnfNodePtr &new_input); + virtual void ProcessOriginCNode(const AnfNodePtr &composite_node, const AnfNodePtr &new_input, + const FuncGraphManagerPtr &mng); + void AddDepend(const FuncGraphPtr &main_graph, const AnfNodePtr &clean_node, const AnfNodePtr &composite_node, + const AnfNodePtr &user_node, int index); + void InsertAtomicClean(const KernelGraphPtr &main_graph, const AnfNodePtr &anf_node, const FuncGraphManagerPtr &mng); + CNodePtr InsertUpdateState(const KernelGraphPtr &main_graph, const CNodePtr &composite_node); + CNodePtr atomic_add_node_{nullptr}; + + private: + void CorrectAbstract(const AnfNodePtr &composite_node); + CNodePtr CreateAtomicCleanCompositeNode(const KernelGraphPtr &main_graph, TypeId dst_type); + void CreateInplaceAssignNodeAndCorrectReturn(const FuncGraphPtr &sub_graph, const AnfNodePtr &new_parameter); + void ProcessOriginCNodeUser(const KernelGraphPtr &main_graph, const AnfNodePtr &composite_node, + const AnfNodePtr &broadcast_to_node, const AnfNodePtr &update_state_node, + const FuncGraphManagerPtr &mng); + std::vector> FindOriginCNodeUsers(const KernelGraphPtr &main_graph, + const AnfNodePtr &composite_node, + const FuncGraphManagerPtr &mng, bool correct_index); + bool IsExistStructuralObstacle(const KernelGraphPtr &main_graph, const AnfNodePtr &node, + const FuncGraphManagerPtr &mng); + + size_t reduce_real_output_index_{0}; + size_t real_output_num_{0}; + std::vector> to_process_order_; }; -using 
CleanAddAtomicPtr = std::shared_ptr; +using AtomicCleanInsertterPtr = std::shared_ptr; } // namespace opt } // namespace mindspore -#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_CLEAN_ADD_ATOMIC_H_ +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_ADD_ATOMIC_CLEAN_H_ diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean_gpu.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean_gpu.cc deleted file mode 100644 index c59d48f0b0..0000000000 --- a/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean_gpu.cc +++ /dev/null @@ -1,514 +0,0 @@ -/** - * Copyright 2020-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "backend/optimizer/graph_kernel/add_atomic_clean_gpu.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "base/core_ops.h" -#include "ir/tensor.h" -#include "utils/utils.h" -#include "utils/log_adapter.h" -#include "backend/kernel_compiler/kernel.h" -#include "backend/optimizer/graph_kernel/graph_kernel_helper.h" -#include "backend/session/anf_runtime_algorithm.h" -#include "backend/session/kernel_graph.h" -#include "debug/anf_ir_dump.h" - -namespace mindspore { -namespace opt { -namespace { -bool SuitableForAtomicAdd(const AnfNodePtr &node) { - if (!IsPrimitiveCNode(node, prim::kPrimReduceSum)) { - MS_LOG(EXCEPTION) << "Only process for reduce sum!"; - } - - auto input = node->cast()->input(kFirstDataInputIndex); - auto src_shape_vec = GetShape(input); - auto axis_vec = GetReduceAxis(node); - if (axis_vec.empty()) { - for (size_t i = 0; i < src_shape_vec.size(); ++i) { - axis_vec.push_back(i); - } - } else { - std::transform(axis_vec.begin(), axis_vec.end(), axis_vec.begin(), - [&src_shape_vec](int64_t axis) -> int64_t { return axis < 0 ? axis + src_shape_vec.size() : axis; }); - } - - std::set axis_set(axis_vec.begin(), axis_vec.end()); - - // For reduce whose last dim is reduced (including all-reduce), - // it is suitable for atomic add only the reduce num is greater than or equal to 1024. - if (axis_set.count(src_shape_vec.size() - 1) != 0) { - size_t reduce_size = - std::accumulate(axis_set.begin(), axis_set.end(), 1, - [&src_shape_vec](size_t size, int64_t axis) { return size * src_shape_vec[axis]; }); - return reduce_size >= 1024; - } - - // For reduce whose last dim is not reduced, always true. - return true; -} - -bool HaveReduceInPredecessors(const AnfNodePtr &node) { - std::stack st; - st.push(node); - while (!st.empty()) { - auto n = st.top(); - st.pop(); - - if (n != node) { - if (!n->isa()) { - continue; - } - if (IsPrimitiveCNode(n, prim::kPrimReduceSum)) { - return true; - } - } - - auto n_inputs = n->cast()->inputs(); - std::for_each(n_inputs.cbegin() + 1, n_inputs.cend(), [&st](const AnfNodePtr &n) -> void { st.push(n); }); - } - - return false; -} - -inline int64_t CalNewIndex(int64_t old_index, int64_t reduce_index) { - return old_index - (old_index > reduce_index ? 
1 : 0); -} -} // namespace - -bool AtomicCleanInsertter::CanActivateAtomicAdd(const AnfNodePtr &anf_node) { - auto node = anf_node->cast(); - MS_EXCEPTION_IF_NULL(node); - auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(node); - auto mng_sub = sub_graph->manager(); - if (mng_sub == nullptr) { - mng_sub = Manage(sub_graph, false); - sub_graph->set_manager(mng_sub); - } - - // Rules to activate atomic add: - // 1. ReduceSum should not fuse any other ops in out direction, which mean it should be in output list. - // 2. only one ReduceSum in output list. - // 3. The reduce axis and reduce number should meet condition (all-reduce or reduce-x when fuse number is greater than - // or equal to 1024, or reduce-y). - // 4. No other ReduceSum as output ReduceSum's predecessors (reduce compile limitation). - - // Rule 2. - auto real_return_node = sub_graph->get_return()->input(kFirstDataInputIndex); - if (IsPrimitiveCNode(real_return_node, prim::kPrimMakeTuple)) { - AnfNodePtrList reduce_ops; - size_t reduce_cnt = 0; - const auto &inputs = real_return_node->cast()->inputs(); - for (size_t i = 1; i < inputs.size(); ++i) { - if (IsPrimitiveCNode(inputs[i], prim::kPrimReduceSum)) { - atomic_add_node_ = inputs[i]->cast(); - reduce_real_output_index_ = i - 1; - reduce_cnt++; - } - } - - if (reduce_cnt != 1) { - return false; - } - real_output_num_ = inputs.size() - 1; - } else if (IsPrimitiveCNode(real_return_node, prim::kPrimReduceSum)) { - atomic_add_node_ = real_return_node->cast(); - real_output_num_ = 1; - } else { - return false; - } - - // Rule 1. - if (mng_sub->node_users()[atomic_add_node_].size() > 1) { - return false; - } - - // Rule 3 and 4. - if (!SuitableForAtomicAdd(atomic_add_node_) || HaveReduceInPredecessors(atomic_add_node_)) { - return false; - } - - return true; -} - -void AtomicCleanInsertter::CorrectKernelBuildInfo(const AnfNodePtr &composite_node, const AnfNodePtr &new_input) { - // Change kernel build info. 
- auto kernel_info = static_cast(composite_node->kernel_info()); - MS_EXCEPTION_IF_NULL(kernel_info); - const auto &origin_kernel_build_info = kernel_info->GetMutableSelectKernelBuildInfo(); - auto origin_inputs_format = origin_kernel_build_info->GetAllInputFormats(); - auto origin_outputs_format = origin_kernel_build_info->GetAllOutputFormats(); - auto origin_inputs_type = origin_kernel_build_info->GetAllInputDeviceTypes(); - auto origin_outputs_type = origin_kernel_build_info->GetAllOutputDeviceTypes(); - auto origin_processor = origin_kernel_build_info->processor(); - - std::vector &new_inputs_format = origin_inputs_format; - std::vector &new_inputs_type = origin_inputs_type; - std::vector new_outputs_format; - std::vector new_outputs_type; - for (size_t i = 0; i < origin_outputs_format.size(); ++i) { - if (real_output_num_ > 1 && i == reduce_real_output_index_) { - continue; - } - new_outputs_format.push_back(origin_outputs_format[i]); - new_outputs_type.push_back(origin_outputs_type[i]); - } - - auto kernel_with_index = AnfAlgo::VisitKernel(new_input, 0); - new_inputs_format.push_back(AnfAlgo::GetOutputFormat(kernel_with_index.first, kernel_with_index.second)); - new_inputs_type.push_back(AnfAlgo::GetOutputDeviceDataType(kernel_with_index.first, kernel_with_index.second)); - - kernel::KernelBuildInfo::KernelBuildInfoBuilder new_info_builder; - new_info_builder.SetInputsFormat(new_inputs_format); - new_info_builder.SetInputsDeviceType(new_inputs_type); - new_info_builder.SetOutputsFormat(new_outputs_format); - new_info_builder.SetOutputsDeviceType(new_outputs_type); - new_info_builder.SetProcessor(origin_processor); - new_info_builder.SetKernelType(KernelType::AKG_KERNEL); - new_info_builder.SetFusionType(kernel::FusionType::OPAQUE); - auto new_selected_info = new_info_builder.Build(); - AnfAlgo::SetSelectKernelBuildInfo(new_selected_info, composite_node.get()); -} - -void AtomicCleanInsertter::CreateInplaceAssignNodeAndCorrectReturn(const FuncGraphPtr &sub_graph, - const AnfNodePtr &new_parameter) { - // add inplaceassign - AnfNodePtr out_node; - bool fake_out = false; - size_t replace_index = 0; - auto retrun_node = sub_graph->get_return()->input(kFirstDataInputIndex); - if (IsPrimitiveCNode(retrun_node, prim::kPrimMakeTuple)) { - const auto &outs = retrun_node->cast()->inputs(); - for (size_t i = 1; i < outs.size(); ++i) { - if (i != reduce_real_output_index_ + 1) { - out_node = outs[i]; - replace_index = i; - break; - } - } - } else { - out_node = atomic_add_node_; // Use result data itself, and set attr "fake_out" true. 
- fake_out = true; - } - - auto inplace_assign_node = - CreateCNode({NewValueNode(prim::kPrimInplaceAssign), new_parameter, atomic_add_node_, out_node}, sub_graph, - {.format = GetFormat(out_node), .shape = GetShape(out_node), .type = GetType(out_node)}); - SetNodeAttrSafely("fake_output", MakeValue(fake_out), inplace_assign_node); - - CNodePtr new_out_node; - if (real_output_num_ > 2) { - std::vector output_args = {NewValueNode(prim::kPrimMakeTuple)}; - const auto &outs = retrun_node->cast()->inputs(); - for (size_t i = 1; i < outs.size(); ++i) { - if (i == reduce_real_output_index_ + 1) { - continue; - } else if (i == replace_index) { - output_args.push_back(inplace_assign_node); - } else { - output_args.push_back(outs[i]); - } - } - // Set output for AnfGraph - new_out_node = sub_graph->NewCNode(output_args); - } else { - new_out_node = inplace_assign_node; - } - sub_graph->set_output(new_out_node); -} - -void AtomicCleanInsertter::CorrectAbstract(const AnfNodePtr &composite_node) { - // If there is only one output(ReduceSum), it should be a fake output with the same abstract with origin output. - if (real_output_num_ <= 1) { - return; - } - - // Change abstract. - auto origin_out_spec = composite_node->abstract()->cast(); - MS_EXCEPTION_IF_NULL(origin_out_spec); - const auto &origin_out_specs = origin_out_spec->elements(); - AbstractBasePtrList new_out_specs; - for (size_t i = 0; i < origin_out_specs.size(); ++i) { - if (i != reduce_real_output_index_) { - new_out_specs.push_back(origin_out_specs[i]); - } - } - composite_node->set_abstract(std::make_shared(new_out_specs)); -} - -void AtomicCleanInsertter::ProcessOriginCNode(const AnfNodePtr &composite_node, const AnfNodePtr &new_input, - const FuncGraphManagerPtr &mng) { - auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(composite_node); - auto mng_sub = sub_graph->manager(); - if (mng_sub == nullptr) { - mng_sub = Manage(sub_graph, false); - sub_graph->set_manager(mng_sub); - } - - // Add atomic attribute to reducesum node. - SetNodeAttrSafely("enable_atomic_add", MakeValue(true), atomic_add_node_); - - // add input - auto inputs = composite_node->cast()->inputs(); - inputs.push_back(new_input); - composite_node->cast()->set_inputs(inputs); - - // add parameter - auto parameter = sub_graph->add_parameter(); - parameter->set_abstract(new_input->abstract()); - parameter->set_kernel_info(new_input->kernel_info_ptr()); - - CreateInplaceAssignNodeAndCorrectReturn(sub_graph, parameter); - - CorrectAbstract(composite_node); - CorrectKernelBuildInfo(composite_node, new_input); - - auto old_graph_name = GetValue(sub_graph->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)); - auto new_graph_name = ExtractGraphKernelName(TopoSort(sub_graph->get_return()), "", "atomic_add"); - sub_graph->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, MakeValue(new_graph_name)); - MS_LOG(INFO) << "Convert " << old_graph_name << " to atomic add graph " << new_graph_name; -} - -void AtomicCleanInsertter::AddDepend(const FuncGraphPtr &main_graph, const AnfNodePtr &clean_node, - const AnfNodePtr &composite_node, const AnfNodePtr &user_node, int index) { - // Create depend node to hold new control depend node. 
- AnfNodePtrList d_inputs = {NewValueNode(prim::kPrimDepend), clean_node, composite_node}; - auto depend_cnode = main_graph->NewCNode(d_inputs); - depend_cnode->set_abstract(clean_node->abstract()); - main_graph->AddNode(depend_cnode); - - auto user_cnode = user_node->cast(); - MS_EXCEPTION_IF_NULL(user_cnode); - user_cnode->set_input(index, depend_cnode); -} - -CNodePtr AtomicCleanInsertter::InsertUpdateState(const KernelGraphPtr &main_graph, const CNodePtr &composite_node) { - // Insert update_state_node, need mount a monad node. - auto u = NewValueNode(kUMonad); - u->set_abstract(kUMonad->ToAbstract()); - AnfNodePtrList update_state_inputs = {NewValueNode(prim::kPrimUpdateState), u, composite_node}; - auto update_state_cnode = main_graph->NewCNode(update_state_inputs); - main_graph->AddNode(update_state_cnode); - return update_state_cnode; -} - -CNodePtr AtomicCleanInsertter::CreateAtomicCleanCompositeNode(const KernelGraphPtr &main_graph, TypeId dst_type) { - std::set data_support = {kNumberTypeFloat16, kNumberTypeFloat32, kNumberTypeFloat64}; - - if (!std::any_of(data_support.cbegin(), data_support.cend(), [&dst_type](TypeId type) { return dst_type == type; })) { - MS_LOG(EXCEPTION) << "Atomic add not support data type " << dst_type; - } - - // Create zero value which will be broadcast to target shape. - auto format = GetFormat(atomic_add_node_); - auto dtype = (dst_type == kNumberTypeFloat16) ? kNumberTypeFloat32 : dst_type; - ValueNodePtr value_node; - if (dtype == kNumberTypeFloat32) { - value_node = CreateScalarTensorValueNode({.format = format, .shape = {1}, .type = TypeIdToType(dtype)}, - static_cast(0), sizeof(float)); - } else { - value_node = CreateScalarTensorValueNode({.format = format, .shape = {1}, .type = TypeIdToType(dtype)}, - static_cast(0), sizeof(double)); - } - - // Create composite op's sub-graph. - auto new_sub_graph = std::make_shared(); - - AnfNodePtr broadcast_input_node = value_node; - if (dst_type == kNumberTypeFloat16) { - AnfNodePtrList cast_inputs = {NewValueNode(prim::kPrimCast), value_node}; - auto cast_node_inner = - CreateCNode(cast_inputs, new_sub_graph, {.format = format, .shape = {1}, .type = TypeIdToType(dst_type)}); - SetNodeAttrSafely("dst_type", MakeValue("float32"), cast_node_inner); - broadcast_input_node = cast_node_inner; - } - - // Create broadcast basic op. - auto dst_shape_vec = GetShape(atomic_add_node_); - AnfNodePtrList atomic_clean_inputs = {NewValueNode(prim::kPrimBroadcastTo), broadcast_input_node}; - auto broadcast_to_node_inner = CreateCNode( - atomic_clean_inputs, new_sub_graph, {.format = format, .shape = dst_shape_vec, .type = GetType(atomic_add_node_)}); - SetNodeAttrSafely("shape", MakeValue(GetDeviceShape(atomic_add_node_)), broadcast_to_node_inner); - - // Makeup sub-graph. 
- new_sub_graph->set_output(broadcast_to_node_inner); - auto broadcast_to_composite_node = main_graph->NewCNode({NewValueNode(new_sub_graph)}); - broadcast_to_composite_node->set_abstract(broadcast_to_node_inner->abstract()); - SetNewKernelInfo(broadcast_to_composite_node, new_sub_graph, {}, {broadcast_to_node_inner}, - AnfAlgo::GetProcessor(atomic_add_node_)); - auto graph_attr = ExtractGraphKernelName(TopoSort(new_sub_graph->get_return()), "", "atomic_clean"); - new_sub_graph->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, MakeValue(graph_attr)); - new_sub_graph->set_attr("composite_type", MakeValue("atomic_clean")); - - return broadcast_to_composite_node; -} - -std::vector > AtomicCleanInsertter::FindOriginCNodeUsers(const KernelGraphPtr &main_graph, - const AnfNodePtr &composite_node, - const FuncGraphManagerPtr &mng, - bool correct_index) { - std::vector > reduce_user_nodes; - if (real_output_num_ <= 1) { - auto users = mng->node_users()[composite_node]; - std::transform(users.cbegin(), users.cend(), std::back_inserter(reduce_user_nodes), - [](const std::pair &pair) { return pair; }); - } else { - std::vector > getitem_user_nodes; - auto users = mng->node_users()[composite_node]; - for (const auto &node_index : users) { - const auto &user_node = node_index.first; - if (!IsPrimitiveCNode(user_node, prim::kPrimTupleGetItem)) { - continue; - } - auto get_item_cnode = user_node->cast(); - auto value_input = get_item_cnode->input(kInputNodeOutputIndexInTupleGetItem); - MS_EXCEPTION_IF_NULL(value_input); - auto value_node = value_input->cast(); - MS_EXCEPTION_IF_NULL(value_node); - auto item_idx = GetValue(value_node->value()); - if (item_idx == static_cast(reduce_real_output_index_)) { - getitem_user_nodes.push_back(node_index); - } else if (correct_index) { - if (real_output_num_ > 2) { - // Recorrect other getitem index. - int64_t new_item_idx = CalNewIndex(item_idx, reduce_real_output_index_); - AnfNodePtrList new_inputs = {NewValueNode(prim::kPrimTupleGetItem), composite_node, - NewValueNode(new_item_idx)}; - auto new_out = main_graph->NewCNode(new_inputs); - new_out->set_abstract(get_item_cnode->abstract()); - for (const auto &[user, index] : mng->node_users()[get_item_cnode]) { - auto user_cnode = user->cast(); - MS_EXCEPTION_IF_NULL(user_cnode); - user_cnode->set_input(index, new_out); - } - } else { - for (const auto &[user, index] : mng->node_users()[node_index.first]) { - auto user_cnode = user->cast(); - MS_EXCEPTION_IF_NULL(user_cnode); - user_cnode->set_input(index, composite_node); - } - } - } - } - for (auto &pair : getitem_user_nodes) { - // Directory to find real user. - auto real_users = mng->node_users()[pair.first]; - reduce_user_nodes.insert(reduce_user_nodes.end(), real_users.begin(), real_users.end()); - } - } - - return reduce_user_nodes; -} - -void AtomicCleanInsertter::ProcessOriginCNodeUser(const KernelGraphPtr &main_graph, const AnfNodePtr &composite_node, - const AnfNodePtr &broadcast_to_node, - const AnfNodePtr &update_state_node, const FuncGraphManagerPtr &mng) { - // 1. find users, change getitem index if needed. - std::vector > reduce_user_nodes = - FindOriginCNodeUsers(main_graph, composite_node, mng, true); - for (const auto &[user_node, index] : reduce_user_nodes) { - // 2. Make sure modified composite node running first, So firstly, create load_node, then add edge to connect - // update_state_node, broadcat_node and load_node to keep order. 
- AnfNodePtrList load_inputs = {NewValueNode(prim::kPrimLoad), broadcast_to_node, update_state_node}; - auto load_node = main_graph->NewCNode(load_inputs); - main_graph->AddNode(load_node); - auto user_cnode = user_node->cast(); - MS_EXCEPTION_IF_NULL(user_cnode); - user_cnode->set_input(index, load_node); - to_process_order_.emplace_back(composite_node, user_node); - } -} - -void AtomicCleanInsertter::InsertAtomicClean(const KernelGraphPtr &main_graph, const AnfNodePtr &anf_node, - const FuncGraphManagerPtr &mng) { - auto origin_composite_node = anf_node->cast(); - MS_EXCEPTION_IF_NULL(origin_composite_node); - - // Create broadcst node. - auto out_type = GetType(atomic_add_node_)->cast(); - MS_EXCEPTION_IF_NULL(out_type); - auto broadcast_to_node = CreateAtomicCleanCompositeNode(main_graph, out_type->element()->type_id()); - - // Insert extra input(broadcast node output) to composite node, and make Reducesum inplaceassign to it. - // Note: if it's single output, this will increase total memory because of a fake out. - ProcessOriginCNode(origin_composite_node, broadcast_to_node, mng); - - // Insert update_state_node to keep execution order. - auto update_state_node = InsertUpdateState(main_graph, origin_composite_node); - - // Replace origin ReduceSum's user with atomic clean output - ProcessOriginCNodeUser(main_graph, origin_composite_node, broadcast_to_node, update_state_node, mng); - MS_LOG(INFO) << "Target node: " << origin_composite_node->fullname_with_scope() - << ", clean node: " << broadcast_to_node->fullname_with_scope(); -} - -bool AtomicCleanInsertter::IsExistStructuralObstacle(const KernelGraphPtr &main_graph, const AnfNodePtr &node, - const FuncGraphManagerPtr &mng) { - auto reduce_users = FindOriginCNodeUsers(main_graph, node, mng, false); - // If reduce user is MakeTuple and not last node, there is no cheap method to set right running order between reduce - // node and user node. If reduce is Depend or ControlDepend node, the origin node may be wrong! 
- return std::all_of(reduce_users.cbegin(), reduce_users.cend(), - [&main_graph](const std::pair &user_info) -> bool { - auto &user = user_info.first; - if ((IsPrimitiveCNode(user, prim::kPrimMakeTuple) || IsPrimitiveCNode(user, prim::kPrimDepend) || - IsPrimitiveCNode(user, prim::kPrimControlDepend)) && - !(IsPrimitiveCNode(user, prim::kPrimReturn) || user == main_graph->output())) { - return false; - } else { - return true; - } - }); -} - -bool AtomicCleanInsertter::Run(const FuncGraphPtr &func_graph) { - auto kernel_graph = std::dynamic_pointer_cast(func_graph); - MS_EXCEPTION_IF_NULL(kernel_graph); - auto mng = kernel_graph->manager(); - if (mng == nullptr) { - mng = Manage(kernel_graph, true); - kernel_graph->set_manager(mng); - } - - bool changed = false; - auto topo_nodes = TopoSort(kernel_graph->get_return()); - for (const auto &node : topo_nodes) { - if (!AnfAlgo::IsGraphKernel(node) || !CanActivateAtomicAdd(node) || - !IsExistStructuralObstacle(kernel_graph, node, mng)) { - continue; - } - InsertAtomicClean(kernel_graph, node, mng); - changed = true; - } - - if (changed) { - mng->RemoveRoots(); - mng->KeepRoots({func_graph}); - } - - return changed; -} -} // namespace opt -} // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean_gpu.h b/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean_gpu.h deleted file mode 100644 index be137919e8..0000000000 --- a/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean_gpu.h +++ /dev/null @@ -1,68 +0,0 @@ -/** - * Copyright 2020-2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_ADD_ATOMIC_CLEAN_GPU_H_ -#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_ADD_ATOMIC_CLEAN_GPU_H_ - -#include -#include -#include -#include -#include -#include "backend/optimizer/common/optimizer.h" -#include "backend/session/kernel_graph.h" - -namespace mindspore { -namespace opt { -class AtomicCleanInsertter : public Pass { - public: - explicit AtomicCleanInsertter(const std::string &name = "atomic_clean") : Pass(name) {} - ~AtomicCleanInsertter() override = default; - virtual bool Run(const FuncGraphPtr &func_graph); - - protected: - virtual void CorrectKernelBuildInfo(const AnfNodePtr &composite_node, const AnfNodePtr &new_input); - virtual void ProcessOriginCNode(const AnfNodePtr &composite_node, const AnfNodePtr &new_input, - const FuncGraphManagerPtr &mng); - void AddDepend(const FuncGraphPtr &main_graph, const AnfNodePtr &clean_node, const AnfNodePtr &composite_node, - const AnfNodePtr &user_node, int index); - void InsertAtomicClean(const KernelGraphPtr &main_graph, const AnfNodePtr &anf_node, const FuncGraphManagerPtr &mng); - CNodePtr InsertUpdateState(const KernelGraphPtr &main_graph, const CNodePtr &composite_node); - CNodePtr atomic_add_node_{nullptr}; - - private: - bool CanActivateAtomicAdd(const AnfNodePtr &anf_node); - void CorrectAbstract(const AnfNodePtr &composite_node); - CNodePtr CreateAtomicCleanCompositeNode(const KernelGraphPtr &main_graph, TypeId dst_type); - void CreateInplaceAssignNodeAndCorrectReturn(const FuncGraphPtr &sub_graph, const AnfNodePtr &new_parameter); - void ProcessOriginCNodeUser(const KernelGraphPtr &main_graph, const AnfNodePtr &composite_node, - const AnfNodePtr &broadcast_to_node, const AnfNodePtr &update_state_node, - const FuncGraphManagerPtr &mng); - std::vector> FindOriginCNodeUsers(const KernelGraphPtr &main_graph, - const AnfNodePtr &composite_node, - const FuncGraphManagerPtr &mng, bool correct_index); - bool IsExistStructuralObstacle(const KernelGraphPtr &main_graph, const AnfNodePtr &node, - const FuncGraphManagerPtr &mng); - - size_t reduce_real_output_index_{0}; - size_t real_output_num_{0}; - std::vector> to_process_order_; -}; -using AtomicCleanInsertterPtr = std::shared_ptr; -} // namespace opt -} // namespace mindspore - -#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_ADD_ATOMIC_CLEAN_GPU_H_ diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/add_stitch_atomic_clean_gpu.h b/mindspore/ccsrc/backend/optimizer/graph_kernel/add_stitch_atomic_clean_gpu.h index 85b5f4d690..4e8ca8d27d 100644 --- a/mindspore/ccsrc/backend/optimizer/graph_kernel/add_stitch_atomic_clean_gpu.h +++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/add_stitch_atomic_clean_gpu.h @@ -22,7 +22,7 @@ #include #include #include "backend/optimizer/common/optimizer.h" -#include "backend/optimizer/graph_kernel/add_atomic_clean_gpu.h" +#include "backend/optimizer/graph_kernel/add_atomic_clean.h" #include "backend/session/kernel_graph.h" namespace mindspore { diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc index 60fc96e4f4..6bfbb98f76 100644 --- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc +++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc @@ -584,7 +584,7 @@ std::vector GetFusibleOpList() { prim::kPrimExpandDims, prim::kPrimMul, prim::kPrimMinimum, prim::kPrimMaximum, prim::kPrimLog, prim::kPrimPow, prim::kPrimSub, 
prim::kPrimRsqrt, prim::kPrimSqrt, prim::kPrimAddN, prim::kPrimEqual, prim::kPrimReciprocal, prim::kPrimTanh, prim::kPrimReshape, prim::kPrimTranspose, - prim::kPrimCast, prim::kPrimRealDiv, prim::kPrimMatMul, prim::kPrimAssign}; + prim::kPrimCast, prim::kPrimRealDiv, prim::kPrimMatMul, prim::kPrimAssign, prim::kPrimReduceSum}; #elif ENABLE_GPU std::vector fusible_basic_ops = { prim::kPrimAbs, prim::kPrimRound, prim::kPrimNeg, prim::kPrimExp, prim::kPrimAdd, diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc index 7fbdbaeb8b..c999a155f0 100644 --- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc +++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc @@ -22,7 +22,6 @@ #include "ir/func_graph.h" #include "utils/ms_context.h" #include "backend/optimizer/graph_kernel/add_atomic_clean.h" -#include "backend/optimizer/graph_kernel/add_atomic_clean_gpu.h" #include "backend/optimizer/graph_kernel/add_stitch_atomic_clean_gpu.h" #include "backend/optimizer/graph_kernel/arithmetic_simplify.h" #include "backend/optimizer/graph_kernel/basic_ops_fusion.h" @@ -132,11 +131,9 @@ PassManagerPtr GraphKernelOptimizer::Split() { PassManagerPtr GraphKernelOptimizer::HighLevelOpt2() { auto pm = std::make_shared("graphkernel_stage5_highlevelopt2"); // Enable atomic add + pm->AddPass(std::make_shared()); if (is_gpu) { - pm->AddPass(std::make_shared()); pm->AddPass(std::make_shared()); - } else /* if (is_ascend) */ { - pm->AddPass(std::make_shared()); } return pm; } diff --git a/tests/st/ops/graph_kernel/test_atomic_add.py b/tests/st/ops/graph_kernel/test_atomic_add.py index b8786f46b1..fcc2b59207 100644 --- a/tests/st/ops/graph_kernel/test_atomic_add.py +++ b/tests/st/ops/graph_kernel/test_atomic_add.py @@ -122,3 +122,30 @@ def test_atomic_add_single_output_gpu(): def test_atomic_add_multi_output_gpu(): context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target="GPU") atomic_add_multi_output() + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.env_onecard +def test_atomic_add_sum_output_ascend(): + context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target="Ascend") + atomic_add_sum_output() + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.env_onecard +def test_atomic_add_single_output_ascend(): + context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target="Ascend") + atomic_add_single_output() + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.env_onecard +def test_atomic_add_multi_output_ascend(): + context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target="Ascend") + atomic_add_multi_output()
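
Note: the helpers invoked above (atomic_add_sum_output, atomic_add_single_output, atomic_add_multi_output) are defined earlier in test_atomic_add.py and are not part of this hunk. As a rough, illustrative sketch only -- assuming the public MindSpore Python API already used in these tests, with hypothetical names SumOutNet and run_single_output -- a network that exercises this pass looks like:

# Illustrative sketch, not the repository's helper: one fused Square + ReduceSum whose
# ReduceSum result is a graph output, the pattern AtomicCleanInsertter rewrites into
# BroadcastTo(0) + InplaceAssign with atomic add enabled.
import numpy as np
import mindspore.context as context
from mindspore import Tensor, nn
import mindspore.ops.operations as P


class SumOutNet(nn.Cell):
    def __init__(self):
        super(SumOutNet, self).__init__()
        self.square = P.Square()
        self.sum = P.ReduceSum()

    def construct(self, x):
        # Reduce over all axes: an all-reduce is accepted by both checkers
        # (reduce size >= 1024 on GPU, all axes reduced on Ascend).
        return self.sum(self.square(x), ())


def run_single_output():
    context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True)
    x = Tensor(np.random.normal(0, 1, [1024, 1024]).astype(np.float32))
    return SumOutNet()(x)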