!12667 【GraphKernel】Combine the GraphKernelOptimization of Gpu and Ascend

From: @dayschan
Reviewed-by: @lingyunli63
Signed-off-by:
pull/12667/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit 4e1931481a

@@ -0,0 +1,171 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
#include <vector>
#include <string>
#include <memory>
#include "ir/func_graph.h"
#include "utils/ms_context.h"
#include "backend/optimizer/graph_kernel/add_atomic_clean.h"
#include "backend/optimizer/graph_kernel/add_atomic_clean_gpu.h"
#include "backend/optimizer/graph_kernel/add_stitch_atomic_clean_gpu.h"
#include "backend/optimizer/graph_kernel/arithmetic_simplify.h"
#include "backend/optimizer/graph_kernel/basic_ops_fusion.h"
#include "backend/optimizer/graph_kernel/clean_all_in_once.h"
#include "backend/optimizer/graph_kernel/depend_formater.h"
#include "backend/optimizer/graph_kernel/eliminate_redundant_output.h"
#include "backend/optimizer/graph_kernel/tensor_promotion.h"
#include "backend/optimizer/graph_kernel/graph_kernel_splitter.h"
#include "backend/optimizer/graph_kernel/graph_kernel_expander.h"
#include "backend/optimizer/graph_kernel/raise_reduction_precision.h"
#include "backend/optimizer/graph_kernel/graph_kernel_cse.h"
#include "backend/optimizer/graph_kernel/shape_ops_splitter.h"
#include "backend/optimizer/graph_kernel/value_graph_binder.h"
#include "backend/optimizer/graph_kernel/parallel_fusion.h"
#include "backend/optimizer/graph_kernel/optimize_assign.h"
#include "backend/optimizer/graph_kernel/split_assign.h"
#include "backend/optimizer/graph_kernel/reorder_ops.h"
#include "backend/optimizer/pass/getitem_tuple.h"
namespace mindspore {
namespace opt {
PassManagerPtr GraphKernelOptimizer::PreProcess() {
auto pm = std::make_shared<PassManager>("graphkernel_stage1_preprocess");
// Change Assign(p, a, U) to Assign(Depend(p, U), a)
pm->AddPass(std::make_shared<SplitAssign>());
// Move the Depend nodes to the bottom of the graph
pm->AddPass(std::make_shared<DependFormater>());
// Reorder TransData-Cast to Cast-TransData
if (is_ascend) {
pm->AddPass(std::make_shared<ReorderOps>());
}
return pm;
}
PassManagerPtr GraphKernelOptimizer::Cluster() {
auto pm = std::make_shared<PassManager>("graphkernel_stage2_cluster");
// Expand complex basic kernels to composite kernels
pm->AddPass(std::make_shared<GraphKernelExpander>());
// Fuse basic kernels and composite kernels
pm->AddPass(std::make_shared<BasicOpsFusion>());
// Eliminate outputs that have no external user
pm->AddPass(std::make_shared<EliminateRedundantOutput>());
return pm;
}
PassManagerPtr GraphKernelOptimizer::HighLevelOpt1() {
auto pm = std::make_shared<PassManager>("graphkernel_stage3_highlevelopt1");
// Replace Assign with InplaceAssign, and replace the original outputs with the overridden parameters
pm->AddPass(std::make_shared<OptimizeAssign>());
pm->AddPass(std::make_shared<EliminateRedundantOutput>());
// Cast the input of ReduceSum from float16 to float32 for higher precision
pm->AddPass(std::make_shared<RaiseReductionPrecision>());
// Universal arithmetic simplification
if (is_gpu) {
pm->AddPass(std::make_shared<ArithmeticSimplify>());
}
// Common subexpression elimination
pm->AddPass(std::make_shared<GraphKernelCSE>());
return pm;
}
PassManagerPtr GraphKernelOptimizer::Split() {
auto pm = std::make_shared<PassManager>("graphkernel_stage4_split");
// Move non-scalar tensors (in composite nodes) to the parameter list
pm->AddPass(std::make_shared<TensorPromotion>());
// Duplicate certain nodes so that each copy is used by only one user,
// which avoids unnecessary inputs/outputs and gives better performance.
if (is_gpu) {
std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape, prim::kPrimExpandDims, prim::kPrimCast};
pm->AddPass(std::make_shared<ShapeOpsSplitter>(duplicated_ops));
}
// Split kernel according to costmodel
pm->AddPass(std::make_shared<GraphKernelSplitter>());
// Eliminate the redundant nodes that were copied above but not handled by GraphKernelSplitter
if (is_gpu) {
pm->AddPass(std::make_shared<GraphKernelCSE>());
pm->AddPass(std::make_shared<EliminateRedundantOutput>());
}
// After Simplify and Splitter, a lot of redundant getitem/maketuple
// nodes are exposed; use the GetitemTuple pass to delete them.
pm->AddPass(std::make_shared<GetitemTuple>());
return pm;
}
PassManagerPtr GraphKernelOptimizer::HighLevelOpt2() {
auto pm = std::make_shared<PassManager>("graphkernel_stage5_highlevelopt2");
// Enable atomic add
if (is_gpu) {
pm->AddPass(std::make_shared<AtomicCleanInsertter>());
pm->AddPass(std::make_shared<StitchAtomicCleanInsertter>());
} else /* if (is_ascend) */ {
pm->AddPass(std::make_shared<CleanAddAtomic>());
}
return pm;
}
PassManagerPtr GraphKernelOptimizer::Combine() {
auto pm = std::make_shared<PassManager>("graphkernel_stage6_combine");
// Enable parallel fusion
if (is_gpu) {
// Prevent fake loop in parallel fusion
pm->AddPass(std::make_shared<DependFormater>());
// Do parallel fusion for the GPU device
pm->AddPass(std::make_shared<ParallelOpFusion>(kGPUDevice, ParallelConfig(7)));
}
return pm;
}
PassManagerPtr GraphKernelOptimizer::PostProcess() {
auto pm = std::make_shared<PassManager>("graphkernel_stage7_postprocess");
// Add the new tensors to the kernel_graph
pm->AddPass(std::make_shared<BindValueToGraph>());
return pm;
}
void GraphKernelOptimizer::Run(const KernelGraphPtr &kernel_graph) {
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
is_gpu = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice);
is_ascend = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice);
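// These flags select the device-specific passes added in the stage builders above.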
auto optimizer = std::make_shared<GraphOptimizer>("graph_kernel_optimizer");
optimizer->AddPassManager(PreProcess());
optimizer->AddPassManager(Cluster());
optimizer->AddPassManager(HighLevelOpt1());
optimizer->AddPassManager(Split());
optimizer->AddPassManager(HighLevelOpt2());
optimizer->AddPassManager(Combine());
optimizer->AddPassManager(PostProcess());
(void)optimizer->Optimize(kernel_graph);
}
void GraphKernelOptimize(const KernelGraphPtr &kernel_graph) { GraphKernelOptimizer().Run(kernel_graph); }
} // namespace opt
} // namespace mindspore

@@ -0,0 +1,54 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_OPTIMIZATION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_OPTIMIZATION_H_
#include "ir/anf.h"
#include "ir/func_graph.h"
#include "backend/session/kernel_graph.h"
#include "backend/optimizer/common/optimizer.h"
#include "backend/optimizer/common/pass_manager.h"
namespace mindspore {
namespace opt {
class GraphKernelOptimizer {
public:
void Run(const KernelGraphPtr &kernel_graph);
private:
// Pre-process
PassManagerPtr PreProcess();
// Cluster kernels
PassManagerPtr Cluster();
// High level optimize 1
PassManagerPtr HighLevelOpt1();
// Split kernels
PassManagerPtr Split();
// High level optimize 2
PassManagerPtr HighLevelOpt2();
// Combine kernels
PassManagerPtr Combine();
// Post-process
PassManagerPtr PostProcess();
bool is_gpu{false};
bool is_ascend{false};
};
void GraphKernelOptimize(const KernelGraphPtr &kernel_graph);
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_OPTIMIZATION_H_
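For context, the new header exposes a single entry point that both session backends below switch to. A minimal sketch of the call site, assuming graph kernel fusion is already enabled in the context and that KernelGraphPtr is the usual std::shared_ptr<session::KernelGraph> alias; RunGraphKernelPhase is a hypothetical wrapper name, the real call sites are the sessions' GraphKernelOptimize methods shown further down:

#include <memory>
#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
#include "backend/session/kernel_graph.h"

namespace mindspore {
// Hypothetical wrapper for illustration only.
void RunGraphKernelPhase(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
  opt::GraphKernelOptimize(kernel_graph);  // runs the seven pass-manager stages built in GraphKernelOptimizer::Run
  kernel_graph->SetExecOrderByDefault();   // refresh the execution order after the graph has been rewritten
}
}  // namespace mindspore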

@@ -17,6 +17,7 @@
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_SHAPE_OPS_SPLITTER_H_
#include <memory>
#include <vector>
#include <utility>
#include "ir/func_graph.h"
#include "backend/optimizer/common/pass.h"
@@ -24,15 +25,15 @@ namespace mindspore {
namespace opt {
class ShapeOpsSplitter : public Pass {
public:
explicit ShapeOpsSplitter(const std::vector<PrimitivePtr> &shape_ops)
: Pass("shape_ops_splitter"), shape_ops_(shape_ops) {}
explicit ShapeOpsSplitter(std::vector<PrimitivePtr> shape_ops)
: Pass("shape_ops_splitter"), shape_ops_(std::move(shape_ops)) {}
~ShapeOpsSplitter() override = default;
bool Run(const FuncGraphPtr &func_graph);
private:
bool Process(const FuncGraphPtr &func_graph);
bool IsMultiUserShapeOps(const AnfNodePtr &node, const FuncGraphManagerPtr &mng);
const std::vector<PrimitivePtr> &shape_ops_;
std::vector<PrimitivePtr> shape_ops_;
};
using ShapeOpsSplitterPtr = std::shared_ptr<ShapeOpsSplitter>;
} // namespace opt
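A note on the ShapeOpsSplitter change above: the pass used to hold a const std::vector<PrimitivePtr> & member, which was safe while the caller's vector outlived the optimizer run, but in the new GraphKernelOptimizer::Split() the duplicated_ops vector is a local that is destroyed before the pass actually executes in Run(). Taking the vector by value and moving it into a value member lets the pass own its data. A minimal standalone sketch of the idiom; OwningPass and FakeOp are illustrative stand-ins, not MindSpore types:

#include <memory>
#include <string>
#include <utility>
#include <vector>

using FakeOp = std::string;  // stand-in for PrimitivePtr

class OwningPass {
 public:
  // Pass by value + std::move: the pass keeps its own copy, so the data
  // stays valid even after the caller's local vector has been destroyed.
  explicit OwningPass(std::vector<FakeOp> ops) : ops_(std::move(ops)) {}
  size_t NumOps() const { return ops_.size(); }

 private:
  std::vector<FakeOp> ops_;  // owned copy, not a (possibly dangling) reference
};

std::shared_ptr<OwningPass> MakePass() {
  std::vector<FakeOp> local_ops{"Reshape", "ExpandDims", "Cast"};
  // With a reference member, the pass would still point at local_ops,
  // which dies when this function returns.
  return std::make_shared<OwningPass>(std::move(local_ops));
}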

@@ -51,16 +51,7 @@
#include "debug/data_dump/dump_json_parser.h"
#include "debug/tensor_load.h"
#include "debug/anf_ir_utils.h"
#include "backend/optimizer/graph_kernel/reorder_ops.h"
#include "backend/optimizer/graph_kernel/basic_ops_fusion.h"
#include "backend/optimizer/graph_kernel/eliminate_redundant_output.h"
#include "backend/optimizer/graph_kernel/tensor_promotion.h"
#include "backend/optimizer/graph_kernel/graph_kernel_splitter.h"
#include "backend/optimizer/graph_kernel/graph_kernel_expander.h"
#include "backend/optimizer/graph_kernel/graph_kernel_cse.h"
#include "backend/optimizer/graph_kernel/value_graph_binder.h"
#include "backend/optimizer/graph_kernel/add_atomic_clean.h"
#include "backend/optimizer/pass/getitem_tuple.h"
#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
#include "backend/session/ascend_auto_monad.h"
#include "debug/data_dump/e2e_dump_util.h"
#include "debug/anf_ir_dump.h"
@@ -843,22 +834,8 @@ void AscendSession::GraphKernelOptimize(const std::shared_ptr<KernelGraph> &kern
if (!(context_ptr->get_param<bool>(MS_CTX_ENABLE_GRAPH_KERNEL))) {
return;
}
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>("graph_kernel_pm");
pm->AddPass(std::make_shared<opt::ReorderOps>());
pm->AddPass(std::make_shared<opt::GraphKernelExpander>());
pm->AddPass(std::make_shared<opt::BasicOpsFusion>());
pm->AddPass(std::make_shared<opt::EliminateRedundantOutput>());
pm->AddPass(std::make_shared<opt::GraphKernelCSE>());
pm->AddPass(std::make_shared<opt::TensorPromotion>());
pm->AddPass(std::make_shared<opt::GraphKernelSplitter>());
// After Simplify and Splitter, a lot of redundant getitem/maketuple
// will be exposed, use GetitemTuple Pass to delete them.
pm->AddPass(std::make_shared<opt::GetitemTuple>());
pm->AddPass(std::make_shared<opt::BindValueToGraph>());
pm->AddPass(std::make_shared<opt::CleanAddAtomic>());
optimizer->AddPassManager(pm);
(void)optimizer->Optimize(kernel_graph);
opt::GraphKernelOptimize(kernel_graph);
kernel_graph->SetExecOrderByDefault();
}
void AscendSession::AdjustKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {

@@ -42,23 +42,7 @@
#include "backend/optimizer/gpu/relu_v2_pass.h"
#include "backend/optimizer/gpu/add_relu_v2_fusion.h"
#include "backend/optimizer/gpu/add_relu_grad_v2_fusion.h"
#include "backend/optimizer/graph_kernel/add_atomic_clean_gpu.h"
#include "backend/optimizer/graph_kernel/add_stitch_atomic_clean_gpu.h"
#include "backend/optimizer/graph_kernel/arithmetic_simplify.h"
#include "backend/optimizer/graph_kernel/basic_ops_fusion.h"
#include "backend/optimizer/graph_kernel/clean_all_in_once.h"
#include "backend/optimizer/graph_kernel/depend_formater.h"
#include "backend/optimizer/graph_kernel/eliminate_redundant_output.h"
#include "backend/optimizer/graph_kernel/tensor_promotion.h"
#include "backend/optimizer/graph_kernel/graph_kernel_splitter.h"
#include "backend/optimizer/graph_kernel/graph_kernel_expander.h"
#include "backend/optimizer/graph_kernel/raise_reduction_precision.h"
#include "backend/optimizer/graph_kernel/graph_kernel_cse.h"
#include "backend/optimizer/graph_kernel/shape_ops_splitter.h"
#include "backend/optimizer/graph_kernel/value_graph_binder.h"
#include "backend/optimizer/graph_kernel/parallel_fusion.h"
#include "backend/optimizer/graph_kernel/optimize_assign.h"
#include "backend/optimizer/graph_kernel/split_assign.h"
#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
#include "backend/optimizer/pass/communication_op_fusion.h"
#include "backend/optimizer/pass/getitem_tuple.h"
#include "common/trans.h"
@@ -197,36 +181,7 @@ void GPUSession::GraphKernelOptimize(const std::shared_ptr<KernelGraph> &kernel_
if (!(context_ptr->get_param<bool>(MS_CTX_ENABLE_GRAPH_KERNEL))) {
return;
}
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>("graph_kernel_pm");
std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape, prim::kPrimExpandDims, prim::kPrimCast};
pm->AddPass(std::make_shared<opt::SplitAssign>());
pm->AddPass(std::make_shared<opt::DependFormater>()); // Make more fusion opportunity.
pm->AddPass(std::make_shared<opt::GraphKernelExpander>());
pm->AddPass(std::make_shared<opt::BasicOpsFusion>());
pm->AddPass(std::make_shared<opt::EliminateRedundantOutput>());
pm->AddPass(std::make_shared<opt::OptimizeAssign>());
pm->AddPass(std::make_shared<opt::EliminateRedundantOutput>());
pm->AddPass(std::make_shared<opt::RaiseReductionPrecision>());
pm->AddPass(std::make_shared<opt::GraphKernelCSE>());
pm->AddPass(std::make_shared<opt::ArithmeticSimplify>());
pm->AddPass(std::make_shared<opt::GraphKernelCSE>());
pm->AddPass(std::make_shared<opt::TensorPromotion>());
pm->AddPass(std::make_shared<opt::ShapeOpsSplitter>(duplicated_ops));
pm->AddPass(std::make_shared<opt::GraphKernelSplitter>());
pm->AddPass(std::make_shared<opt::GraphKernelCSE>());
// The CSE may output a graph with repeated outputs.
pm->AddPass(std::make_shared<opt::EliminateRedundantOutput>());
// After Simplify and Splitter, a lot of redundant getitem/maketuple
// will be exposed, use GetitemTuple Pass to delete them.
pm->AddPass(std::make_shared<opt::GetitemTuple>());
pm->AddPass(std::make_shared<opt::AtomicCleanInsertter>());
pm->AddPass(std::make_shared<opt::StitchAtomicCleanInsertter>());
pm->AddPass(std::make_shared<opt::DependFormater>()); // Prevent fake loop in parallel fusion.
pm->AddPass(std::make_shared<opt::ParallelOpFusion>(kGPUDevice, opt::ParallelConfig(7)));
pm->AddPass(std::make_shared<opt::BindValueToGraph>());
optimizer->AddPassManager(pm);
(void)optimizer->Optimize(kernel_graph);
opt::GraphKernelOptimize(kernel_graph);
kernel_graph->SetExecOrderByDefault();
}
