!14314 [GraphKernel] unify graph kernel pass add_atomic_clean on Ascend and GPU back-end

From: @looop5
Reviewed-by: @gaoxiong1, @dylangeng
Signed-off-by: @dylangeng
pull/14314/MERGE
Committed by mindspore-ci-bot via Gitee
commit 69526df01e

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,23 +14,96 @@
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_CLEAN_ADD_ATOMIC_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_CLEAN_ADD_ATOMIC_H_
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_ADD_ATOMIC_CLEAN_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_ADD_ATOMIC_CLEAN_H_
#include <memory>
#include <tuple>
#include <utility>
#include <vector>
#include <string>
#include "backend/optimizer/common/optimizer.h"
#include "backend/session/kernel_graph.h"
namespace mindspore {
namespace opt {
class CleanAddAtomic : public Pass {
struct AtomicAddInfo {
CNodePtr atomic_add_node{nullptr};
size_t reduce_real_output_index{0};
size_t real_output_num{0};
};
class AtomicAddChecker {
public:
AtomicAddChecker() = default;
virtual ~AtomicAddChecker() = default;
static std::shared_ptr<AtomicAddChecker> Init();
bool Check(const AnfNodePtr &node);
AtomicAddInfo GetAtomicAddInfo() { return atomic_add_info_; }
protected:
virtual bool SuitableForAtomicAdd(const AnfNodePtr &node) { return false; }
virtual bool FindCandidate(const AnfNodePtr &anf_node);
virtual bool CanActivateAtomicAdd(const AnfNodePtr &anf_node);
private:
AtomicAddInfo atomic_add_info_;
};
class AtomicAddCheckerGPU : public AtomicAddChecker {
public:
CleanAddAtomic() : Pass("clean_add_atomic") {}
~CleanAddAtomic() override = default;
bool Run(const FuncGraphPtr &func_graph) override;
AtomicAddCheckerGPU() = default;
~AtomicAddCheckerGPU() = default;
protected:
bool SuitableForAtomicAdd(const AnfNodePtr &node) override;
};
class AtomicAddCheckerAscend : public AtomicAddChecker {
public:
AtomicAddCheckerAscend() = default;
~AtomicAddCheckerAscend() = default;
protected:
bool SuitableForAtomicAdd(const AnfNodePtr &node) override;
};
class AtomicCleanInsertter : public Pass {
public:
explicit AtomicCleanInsertter(const std::string &name = "atomic_clean") : Pass(name) {}
~AtomicCleanInsertter() override = default;
virtual bool Run(const FuncGraphPtr &func_graph);
protected:
virtual void CorrectKernelBuildInfo(const AnfNodePtr &composite_node, const AnfNodePtr &new_input);
virtual void ProcessOriginCNode(const AnfNodePtr &composite_node, const AnfNodePtr &new_input,
const FuncGraphManagerPtr &mng);
void AddDepend(const FuncGraphPtr &main_graph, const AnfNodePtr &clean_node, const AnfNodePtr &composite_node,
const AnfNodePtr &user_node, int index);
void InsertAtomicClean(const KernelGraphPtr &main_graph, const AnfNodePtr &anf_node, const FuncGraphManagerPtr &mng);
CNodePtr InsertUpdateState(const KernelGraphPtr &main_graph, const CNodePtr &composite_node);
CNodePtr atomic_add_node_{nullptr};
private:
void CorrectAbstract(const AnfNodePtr &composite_node);
CNodePtr CreateAtomicCleanCompositeNode(const KernelGraphPtr &main_graph, TypeId dst_type);
void CreateInplaceAssignNodeAndCorrectReturn(const FuncGraphPtr &sub_graph, const AnfNodePtr &new_parameter);
void ProcessOriginCNodeUser(const KernelGraphPtr &main_graph, const AnfNodePtr &composite_node,
const AnfNodePtr &broadcast_to_node, const AnfNodePtr &update_state_node,
const FuncGraphManagerPtr &mng);
std::vector<std::pair<AnfNodePtr, int>> FindOriginCNodeUsers(const KernelGraphPtr &main_graph,
const AnfNodePtr &composite_node,
const FuncGraphManagerPtr &mng, bool correct_index);
bool IsExistStructuralObstacle(const KernelGraphPtr &main_graph, const AnfNodePtr &node,
const FuncGraphManagerPtr &mng);
size_t reduce_real_output_index_{0};
size_t real_output_num_{0};
std::vector<std::pair<AnfNodePtr, AnfNodePtr>> to_process_order_;
};
using CleanAddAtomicPtr = std::shared_ptr<CleanAddAtomic>;
using AtomicCleanInsertterPtr = std::shared_ptr<AtomicCleanInsertter>;
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_CLEAN_ADD_ATOMIC_H_
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_ADD_ATOMIC_CLEAN_H_

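The new AtomicAddChecker::Init() factory is what makes a single pass serve both back-ends: it inspects the device target and hands back the GPU or Ascend checker. A minimal sketch of that dispatch and of the Check() driver, assuming the usual MsContext device-target query (plausible bodies only; the real ones live in the matching .cc file):

// Sketch of the factory and driver declared above; hypothetical bodies,
// assuming the standard MsContext device-target query.
#include "utils/ms_context.h"

std::shared_ptr<AtomicAddChecker> AtomicAddChecker::Init() {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  auto target = context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  if (target == kGPUDevice) {
    return std::make_shared<AtomicAddCheckerGPU>();
  }
  if (target == kAscendDevice) {
    return std::make_shared<AtomicAddCheckerAscend>();
  }
  return nullptr;  // other back-ends: atomic clean is skipped
}

bool AtomicAddChecker::Check(const AnfNodePtr &node) {
  // Plausible driver: a node qualifies only if a reduction candidate is
  // found and the back-end-specific checker says atomic add is activatable.
  return FindCandidate(node) && CanActivateAtomicAdd(node);
}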
@@ -1,68 +0,0 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_ADD_ATOMIC_CLEAN_GPU_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_ADD_ATOMIC_CLEAN_GPU_H_
#include <memory>
#include <tuple>
#include <utility>
#include <vector>
#include <string>
#include "backend/optimizer/common/optimizer.h"
#include "backend/session/kernel_graph.h"
namespace mindspore {
namespace opt {
class AtomicCleanInsertter : public Pass {
public:
explicit AtomicCleanInsertter(const std::string &name = "atomic_clean") : Pass(name) {}
~AtomicCleanInsertter() override = default;
virtual bool Run(const FuncGraphPtr &func_graph);
protected:
virtual void CorrectKernelBuildInfo(const AnfNodePtr &composite_node, const AnfNodePtr &new_input);
virtual void ProcessOriginCNode(const AnfNodePtr &composite_node, const AnfNodePtr &new_input,
const FuncGraphManagerPtr &mng);
void AddDepend(const FuncGraphPtr &main_graph, const AnfNodePtr &clean_node, const AnfNodePtr &composite_node,
const AnfNodePtr &user_node, int index);
void InsertAtomicClean(const KernelGraphPtr &main_graph, const AnfNodePtr &anf_node, const FuncGraphManagerPtr &mng);
CNodePtr InsertUpdateState(const KernelGraphPtr &main_graph, const CNodePtr &composite_node);
CNodePtr atomic_add_node_{nullptr};
private:
bool CanActivateAtomicAdd(const AnfNodePtr &anf_node);
void CorrectAbstract(const AnfNodePtr &composite_node);
CNodePtr CreateAtomicCleanCompositeNode(const KernelGraphPtr &main_graph, TypeId dst_type);
void CreateInplaceAssignNodeAndCorrectReturn(const FuncGraphPtr &sub_graph, const AnfNodePtr &new_parameter);
void ProcessOriginCNodeUser(const KernelGraphPtr &main_graph, const AnfNodePtr &composite_node,
const AnfNodePtr &broadcast_to_node, const AnfNodePtr &update_state_node,
const FuncGraphManagerPtr &mng);
std::vector<std::pair<AnfNodePtr, int>> FindOriginCNodeUsers(const KernelGraphPtr &main_graph,
const AnfNodePtr &composite_node,
const FuncGraphManagerPtr &mng, bool correct_index);
bool IsExistStructuralObstacle(const KernelGraphPtr &main_graph, const AnfNodePtr &node,
const FuncGraphManagerPtr &mng);
size_t reduce_real_output_index_{0};
size_t real_output_num_{0};
std::vector<std::pair<AnfNodePtr, AnfNodePtr>> to_process_order_;
};
using AtomicCleanInsertterPtr = std::shared_ptr<AtomicCleanInsertter>;
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_ADD_ATOMIC_CLEAN_GPU_H_

@@ -22,7 +22,7 @@
#include <utility>
#include <vector>
#include "backend/optimizer/common/optimizer.h"
#include "backend/optimizer/graph_kernel/add_atomic_clean_gpu.h"
#include "backend/optimizer/graph_kernel/add_atomic_clean.h"
#include "backend/session/kernel_graph.h"
namespace mindspore {

@@ -584,7 +584,7 @@ std::vector<PrimitivePtr> GetFusibleOpList() {
prim::kPrimExpandDims, prim::kPrimMul, prim::kPrimMinimum, prim::kPrimMaximum, prim::kPrimLog,
prim::kPrimPow, prim::kPrimSub, prim::kPrimRsqrt, prim::kPrimSqrt, prim::kPrimAddN,
prim::kPrimEqual, prim::kPrimReciprocal, prim::kPrimTanh, prim::kPrimReshape, prim::kPrimTranspose,
prim::kPrimCast, prim::kPrimRealDiv, prim::kPrimMatMul, prim::kPrimAssign};
prim::kPrimCast, prim::kPrimRealDiv, prim::kPrimMatMul, prim::kPrimAssign, prim::kPrimReduceSum};
#elif ENABLE_GPU
std::vector<PrimitivePtr> fusible_basic_ops = {
prim::kPrimAbs, prim::kPrimRound, prim::kPrimNeg, prim::kPrimExp, prim::kPrimAdd,

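Adding prim::kPrimReduceSum to the Ascend fusible-op list is what lets reductions be fused and then guarded by the unified atomic-clean pass. The contract the pass establishes is back-end-neutral: first zero the output buffer with a "clean" kernel, then let each worker accumulate its partial sum atomically, so execution order cannot corrupt the result. A stand-alone C++ illustration of that contract (not MindSpore code):

// Stand-alone illustration of the clean-then-atomically-accumulate
// contract; not MindSpore code. Clean() zeroes the output, after which
// the reduce workers may add their partial sums in any order.
#include <atomic>
#include <numeric>
#include <thread>
#include <vector>

void Clean(std::atomic<float> *out) { out->store(0.0f); }

void ReduceChunk(const std::vector<float> &chunk, std::atomic<float> *out) {
  float partial = std::accumulate(chunk.begin(), chunk.end(), 0.0f);
  float expected = out->load();
  // CAS loop: std::atomic<float>::fetch_add only exists since C++20.
  while (!out->compare_exchange_weak(expected, expected + partial)) {
  }
}

int main() {
  std::atomic<float> out;
  Clean(&out);
  std::vector<float> a{1, 2, 3}, b{4, 5, 6};
  std::thread t1(ReduceChunk, std::cref(a), &out);
  std::thread t2(ReduceChunk, std::cref(b), &out);
  t1.join();
  t2.join();
  return out.load() == 21.0f ? 0 : 1;  // 1 + 2 + ... + 6
}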
@@ -22,7 +22,6 @@
#include "ir/func_graph.h"
#include "utils/ms_context.h"
#include "backend/optimizer/graph_kernel/add_atomic_clean.h"
#include "backend/optimizer/graph_kernel/add_atomic_clean_gpu.h"
#include "backend/optimizer/graph_kernel/add_stitch_atomic_clean_gpu.h"
#include "backend/optimizer/graph_kernel/arithmetic_simplify.h"
#include "backend/optimizer/graph_kernel/basic_ops_fusion.h"
@@ -132,11 +131,9 @@ PassManagerPtr GraphKernelOptimizer::Split() {
PassManagerPtr GraphKernelOptimizer::HighLevelOpt2() {
auto pm = std::make_shared<PassManager>("graphkernel_stage5_highlevelopt2");
// Enable atomic add
pm->AddPass(std::make_shared<AtomicCleanInsertter>());
if (is_gpu) {
pm->AddPass(std::make_shared<AtomicCleanInsertter>());
pm->AddPass(std::make_shared<StitchAtomicCleanInsertter>());
} else /* if (is_ascend) */ {
pm->AddPass(std::make_shared<CleanAddAtomic>());
}
return pm;
}

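Keeping CorrectKernelBuildInfo and ProcessOriginCNode virtual in the unified header is what lets the GPU-only StitchAtomicCleanInsertter remain a thin derived pass, so the optimizer above can register the base pass unconditionally and keep only the stitch variant behind the is_gpu branch. A sketch of that relationship, assuming the stitch pass overrides just those two hooks (its actual declaration is in add_stitch_atomic_clean_gpu.h and may differ in detail; the pass name string here is illustrative):

// Sketch: the stitch variant reuses the unified insertion machinery and
// overrides only the protected virtual hooks declared in the new header.
class StitchAtomicCleanInsertter : public AtomicCleanInsertter {
 public:
  StitchAtomicCleanInsertter() : AtomicCleanInsertter("stitch_atomic_clean") {}
  ~StitchAtomicCleanInsertter() override = default;

 protected:
  void CorrectKernelBuildInfo(const AnfNodePtr &composite_node, const AnfNodePtr &new_input) override;
  void ProcessOriginCNode(const AnfNodePtr &composite_node, const AnfNodePtr &new_input,
                          const FuncGraphManagerPtr &mng) override;
};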
@@ -122,3 +122,30 @@ def test_atomic_add_single_output_gpu():
def test_atomic_add_multi_output_gpu():
    context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target="GPU")
    atomic_add_multi_output()


@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
def test_atomic_add_sum_output_ascend():
    context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target="Ascend")
    atomic_add_sum_output()


@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
def test_atomic_add_single_output_ascend():
    context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target="Ascend")
    atomic_add_single_output()


@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
def test_atomic_add_multi_output_ascend():
    context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target="Ascend")
    atomic_add_multi_output()
