!865 profiling training trace

From: @zhengyuanhua
Reviewed-by: @youui,@xchu42
Signed-off-by: @youui
pull/865/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit d595f9d770

@ -302,6 +302,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Profilin
}
data.append(" model_id:").append(std::to_string(model_id));
data.append(" task_id:").append(std::to_string(graph.task_id));
data.append(" stream_id:").append(std::to_string(graph.stream_id));
data.append("\n");
GraphDescReport(device_id, data);

@ -480,6 +480,9 @@ REGISTER_OPTYPE_DEFINE(HVDWAIT, "HorovodWait");
// aicpu op for online_infer dynamic_dims
REGISTER_OPTYPE_DEFINE(GETDYNAMICDIMS, "GetDynamicDims");
// profiling training trace node
REGISTER_OPTYPE_DEFINE(PROFILINGTRAININGTRACE, "ProfilingTrainingTrace");
const std::string MODEL_ATTR_TASKS = "tasks";
const std::string MODEL_ATTR_TASK_GEN_BASE_ADDR = "task_gen_base_addr";
const std::string MODEL_ATTR_TASK_GEN_WEIGHT_ADDR = "task_gen_weight_addr";

@ -421,6 +421,52 @@ static Status GenerateTaskForConstant(const std::shared_ptr<ComputeGraph> &graph
return SUCCESS;
}
// Tags the nodes of a dynamic-shape graph that need profiling trace tasks.
//
// The graph's unknown-shape flag is cleared while TaskGenerator::FindProfilingNodeIndex runs and is
// restored right afterwards. The indices it reports are then matched against a 1-based counter over
// com_graph->GetAllNodes(), and the matching op descs receive:
//   - ATTR_NAME_INSERT_FP_PROFILILNG_TASK  for the fp node,
//   - ATTR_NAME_INSERT_BP_PROFILILNG_TASK  for the bp node and for every all-reduce node,
//   - ATTR_NAME_INSERT_END_PROFILILNG_TASK for every end node.
//
// @param com_graph graph whose nodes are tagged; must not be null.
// @return SUCCESS when tagging finished or was not needed; the status produced by GE_CHECK_NOTNULL
//         when the graph or the op desc of one of its nodes is null.
Status GraphBuilder::MarkFpBpProfilingTaskAttr(ComputeGraphPtr &com_graph) {
  GE_CHECK_NOTNULL(com_graph);
  bool original_unknown_shape_flag = com_graph->GetGraphUnknownFlag();
  com_graph->SetGraphUnknownFlag(false);

  GELOGD("Start to mark profiling task attr for fp and bp.");
  TaskGenerator task_generator;
  ProfilingPoint profiling_point;
  std::vector<uint32_t> all_reduce_node_index;
  Status ret = task_generator.FindProfilingNodeIndex(com_graph, profiling_point, all_reduce_node_index);
  // Restore the flag before any early return so the caller always sees the graph unchanged.
  com_graph->SetGraphUnknownFlag(original_unknown_shape_flag);
  if (ret != SUCCESS) {
    // A failed lookup is only reported; whatever was collected so far is still evaluated below.
    GELOGW("Find profiling node index failed.");
  }
  if (profiling_point.fp_index == 0 || profiling_point.bp_index == 0 || profiling_point.end_index.empty()) {
    GELOGD("No need to mark fp bp profiling task attr.");
    return SUCCESS;
  }

  // mark profiling task attr for node
  uint32_t node_index = 0;
  for (const auto &node : com_graph->GetAllNodes()) {
    OpDescPtr op_desc = node->GetOpDesc();
    // Check the local that is actually used below instead of fetching the op desc a second time.
    GE_CHECK_NOTNULL(op_desc);
    // The counter is incremented before comparing, so node indices are 1-based here.
    node_index++;
    if (profiling_point.fp_index == node_index) {
      GELOGI("The first fp node of dynamic graph is %s, idx %u", op_desc->GetName().c_str(), node_index);
      (void)ge::AttrUtils::SetBool(op_desc, ATTR_NAME_INSERT_FP_PROFILILNG_TASK, true);
    }
    if (profiling_point.bp_index == node_index) {
      GELOGI("The bp node of dynamic graph is %s, idx %u", op_desc->GetName().c_str(), node_index);
      (void)ge::AttrUtils::SetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, true);
    }
    for (const uint32_t all_reduce_index : all_reduce_node_index) {
      if (all_reduce_index == node_index) {
        GELOGI("The all reduce node of dynamic graph is %s, idx %u", op_desc->GetName().c_str(), node_index);
        (void)ge::AttrUtils::SetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, true);
        // One match is enough: the attribute is already set, so stop scanning.
        break;
      }
    }
    if (profiling_point.end_index.find(node_index) != profiling_point.end_index.end()) {
      GELOGI("The end node of dynamic graph is %s, idx %u", op_desc->GetName().c_str(), node_index);
      (void)ge::AttrUtils::SetBool(op_desc, ATTR_NAME_INSERT_END_PROFILILNG_TASK, true);
    }
  }
  return SUCCESS;
}
Status GraphBuilder::BuildForDynamicShapeGraph(ComputeGraphPtr &comp_graph,
std::vector<SubGraphInfoPtr> &subgraph_ptr_list,
GeRootModelPtr &ge_root_model_ptr, GeModelPtr &ge_model_ptr,
@ -437,6 +483,12 @@ Status GraphBuilder::BuildForDynamicShapeGraph(ComputeGraphPtr &comp_graph,
}
}
// Set fp bp profiling task attr for graph
if (MarkFpBpProfilingTaskAttr(comp_graph) != SUCCESS) {
GELOGE(FAILED, "Set fp bp profiling task attr for graph.");
return FAILED;
}
auto all_graphs = comp_graph->GetAllSubgraphs();
if (all_graphs.empty()) {
all_graphs.push_back(comp_graph);

@ -60,6 +60,7 @@ class GraphBuilder {
Status UpdateParentNodeOutputSize(const ge::ComputeGraphPtr &graph, ge::NodePtr &parent_node_ptr);
Status CalcDynShapeRootGraphDataSize(const ge::OpDescPtr &op_desc);
Status SecondPartition(ge::ComputeGraphPtr &comp_graph, vector<ge::SubGraphInfoPtr> &subgraph_ptr_list);
Status MarkFpBpProfilingTaskAttr(ComputeGraphPtr &com_graph);
Status BuildForDynamicShapeGraph(ComputeGraphPtr &comp_graph, std::vector<SubGraphInfoPtr> &subgraph_ptr_list,
GeRootModelPtr &ge_root_model_ptr, GeModelPtr &ge_model_ptr,
uint64_t session_id = INVALID_SESSION_ID);

@ -274,6 +274,7 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra
};
GE_MAKE_GUARD(release, callback);
uint64_t all_reduce_node_idx = 0;
for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) {
OpDescPtr op_desc = node->GetOpDesc();
GE_CHECK_NOTNULL(op_desc);
@ -292,7 +293,7 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra
// Part2: Call
auto fusion_task_info =
FusionTaskInfo{run_context, graph, node, op_desc, node_index, ge_lib,
ops_kernel_manager, task_def_list, op_name_map, profiling_point, all_reduce_nodes};
ops_kernel_manager, task_def_list, op_name_map, profiling_point, all_reduce_nodes, all_reduce_node_idx};
GE_CHK_STATUS_RET(GenerateTaskForFusionNode(fusion_task_info, fusion_nodes, fusion_nodes_seen),
"Call GenerateTaskForFusionNode node:%s(%s) failed", name.c_str(), type.c_str());
// continue directly
@ -316,7 +317,8 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra
type.c_str());
// Profiling task
size_t task_list_size_before = task_def_list.size();
GE_CHK_STATUS_RET(InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list));
GE_CHK_STATUS_RET(InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes,
node_index, task_def_list, all_reduce_node_idx));
int64_t op_id = op_desc->GetId();
// Compatible with dynamic shape scenes, the default is 0
int64_t stream_id = 0;
@ -336,8 +338,8 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra
return ret;
}
// Profiling task
GE_CHK_STATUS_RET(InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list));
GE_CHK_STATUS_RET(InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes,
node_index, task_def_list, all_reduce_node_idx));
size_t task_list_size_after = task_def_list.size();
// If tasks is reduced
if (task_list_size_after < task_list_size_before) {
@ -380,6 +382,7 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info
auto &op_name_map = fusion_task_info.op_name_map;
auto &profiling_point = fusion_task_info.profiling_point;
auto &all_reduce_nodes = fusion_task_info.all_reduce_nodes;
auto &all_reduce_idx = fusion_task_info.all_reduce_node_idx;
// If op_desc have this attr, call nodes with same group key in a stream together
if (ge::AttrUtils::GetInt(fusion_op_desc, ATTR_NAME_FUSION_GROUP_KEY, group_key) &&
(fusion_nodes_seen.count(node.get()) == 0)) {
@ -426,7 +429,8 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info
return INTERNAL_ERROR;
}
// profiling task
(void)InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list);
(void)InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes,
node_index, task_def_list, all_reduce_idx);
run_context.stream = run_context.graphStreamList[stream_id];
GELOGI("Fusion: Call %s to generate fusion_node:[fusion_node_name:%s(%s), id:%ld, stream_id:%ld] task.",
op_kernel_lib_name.c_str(), fusion_node_name.c_str(), fusion_node_type.c_str(), op_id, stream_id);
@ -439,7 +443,8 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info
return ret;
}
// profiling task
(void)InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list);
(void)InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes,
node_index, task_def_list, all_reduce_idx);
size_t task_list_size_after = task_def_list.size();
// if tasks is reduced
if (task_list_size_after < task_list_size_before) {
@ -830,6 +835,11 @@ Status TaskGenerator::GetFpBpIndex(const ComputeGraphPtr &graph, ProfilingPoint
return SUCCESS;
}
// Externally callable wrapper around FindProfilingTaskIndex.
//
// @param graph             graph to inspect.
// @param profiling_point   receives the profiling indices reported by FindProfilingTaskIndex.
// @param all_reduce_nodes  receives the all-reduce node indices reported by FindProfilingTaskIndex.
// @return the status of FindProfilingTaskIndex, unchanged.
Status TaskGenerator::FindProfilingNodeIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point,
                                             std::vector<uint32_t> &all_reduce_nodes) {
  const Status find_status = FindProfilingTaskIndex(graph, profiling_point, all_reduce_nodes);
  return find_status;
}
Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point,
vector<uint32_t> &all_reduce_nodes) const {
GE_CHECK_NOTNULL(graph);
@ -840,7 +850,6 @@ Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, Profi
GELOGD("Profiling is not open.");
return SUCCESS;
}
GELOGI("Start get FP/BP index.");
std::string fp_point_str;
std::string bp_point_str;
@ -878,18 +887,27 @@ Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, Profi
return SUCCESS;
}
Status TaskGenerator::InsertProfilingTaskBefore(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point,
vector<uint32_t> &all_reduce_nodes, uint32_t node_index,
vector<domi::TaskDef> &task_def_list) {
vector<domi::TaskDef> &task_def_list, uint64_t &all_reduce_node_idx) {
const char *profiling_mode = std::getenv(kProfilingMode);
bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() ||
ProfilingManager::Instance().ProfilingTrainingTraceOn();
if (!is_profiling || (profiling_point.fp_index == 0) || (profiling_point.bp_index == 0) ||
(profiling_point.end_index.empty())) {
bool is_insert_fp_profiling_task = false;
(void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_FP_PROFILILNG_TASK, is_insert_fp_profiling_task);
bool is_insert_bp_profiling_task = false;
(void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, is_insert_bp_profiling_task);
bool no_insert_profiling_task = ((profiling_point.fp_index == 0) || (profiling_point.bp_index == 0) ||
(profiling_point.end_index.empty())) &&
(!(is_insert_fp_profiling_task || is_insert_bp_profiling_task));
if (!is_profiling || no_insert_profiling_task) {
return SUCCESS;
}
if (profiling_point.fp_index == node_index) {
GELOGD("Insert fp profiling task: %d, insert bp profiling task: %d, fp index: %u, bp index: %u, end index size: %zu",
is_insert_fp_profiling_task, is_insert_bp_profiling_task, profiling_point.fp_index, profiling_point.bp_index,
profiling_point.end_index.size());
if ((profiling_point.fp_index == node_index) || is_insert_fp_profiling_task) {
uint64_t jobid_log_id = ge::GetContext().TraceId();
GELOGI("The first FP operator is %s, idx %u, job_id %lu", op_desc->GetName().c_str(), node_index, jobid_log_id);
@ -913,22 +931,40 @@ Status TaskGenerator::InsertProfilingTaskBefore(const OpDescPtr &op_desc, const
task_def_list.emplace_back(fp_task_def);
}
for (size_t i = 0; i < all_reduce_nodes.size(); i++) {
if (all_reduce_nodes[i] != node_index) {
continue;
bool is_all_reduce = (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE);
uint64_t all_reduce_task_idx = 0;
bool is_insert_all_reduce_task = false;
if (is_all_reduce && is_insert_bp_profiling_task) {
all_reduce_task_idx = all_reduce_node_idx;
is_insert_all_reduce_task = true;
}
if (is_all_reduce) {
all_reduce_node_idx++;
}
if (!is_insert_all_reduce_task) {
for (size_t i = 0; i < all_reduce_nodes.size(); i++) {
if (all_reduce_nodes[i] == node_index) {
all_reduce_task_idx = i;
is_insert_all_reduce_task = true;
break;
}
}
}
if (is_insert_all_reduce_task) {
GELOGI("The start allreduce operator is %s, idx %u", op_desc->GetName().c_str(), node_index);
TaskDef ar_task_def;
ar_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE);
ar_task_def.set_stream_id(op_desc->GetStreamId());
LogTimeStampDef *ar_log_def = ar_task_def.mutable_log_timestamp();
if (ar_log_def != nullptr) {
GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(i, kProfilingArStep),
GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(all_reduce_task_idx, kProfilingArStep),
GELOGE(FAILED, "Multiply result is out of range.");
return FAILED);
auto log_id = i * kProfilingArStep + kProfilingArStartLogid;
auto log_id = all_reduce_task_idx * kProfilingArStep + kProfilingArStartLogid;
ar_log_def->set_logid(log_id);
ar_log_def->set_notify(false);
(void)ge::AttrUtils::SetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, log_id);
}
task_def_list.push_back(ar_task_def);
}
@ -937,16 +973,27 @@ Status TaskGenerator::InsertProfilingTaskBefore(const OpDescPtr &op_desc, const
Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point,
vector<uint32_t> &all_reduce_nodes, uint32_t node_index,
vector<domi::TaskDef> &task_def_list) {
vector<domi::TaskDef> &task_def_list, uint64_t all_reduce_node_idx) {
GE_CHECK_NOTNULL(op_desc);
const char *profiling_mode = std::getenv(kProfilingMode);
bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() ||
ProfilingManager::Instance().ProfilingTrainingTraceOn();
if (!is_profiling || (profiling_point.fp_index == 0) || (profiling_point.bp_index == 0) ||
(profiling_point.end_index.empty())) {
bool is_insert_bp_profiling_task = false;
(void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, is_insert_bp_profiling_task);
bool is_insert_end_profiling_task = false;
(void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_END_PROFILILNG_TASK, is_insert_end_profiling_task);
bool no_insert_profiling_task = ((profiling_point.fp_index == 0) || (profiling_point.bp_index == 0) ||
(profiling_point.end_index.empty())) &&
(!(is_insert_bp_profiling_task || is_insert_end_profiling_task));
if (!is_profiling || no_insert_profiling_task) {
return SUCCESS;
}
if (profiling_point.bp_index == node_index) {
GELOGD("Insert bp profiling task: %d, insert end profiling task: %d, fp index: %u, bp index: %u, end index size: %zu",
is_insert_bp_profiling_task, is_insert_end_profiling_task, profiling_point.fp_index, profiling_point.bp_index,
profiling_point.end_index.size() );
bool is_all_reduce = (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE);
if ((profiling_point.bp_index == node_index) || (!is_all_reduce && is_insert_bp_profiling_task)) {
GELOGI("The last BP operator is %s, idx %u", op_desc->GetName().c_str(), node_index);
TaskDef bp_task_def;
bp_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE);
@ -957,7 +1004,9 @@ Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const P
bp_log_def->set_notify(false);
task_def_list.emplace_back(bp_task_def);
}
if (profiling_point.end_index.find(node_index) != profiling_point.end_index.end()) {
if (profiling_point.end_index.find(node_index) != profiling_point.end_index.end() ||
is_insert_end_profiling_task) {
GELOGI("The iteration end operator is %s, idx %u", op_desc->GetName().c_str(), node_index);
TaskDef end_task_def;
end_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE);
@ -969,20 +1018,32 @@ Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const P
task_def_list.emplace_back(end_task_def);
}
uint32_t all_reduce_task_idx = 0;
bool is_insert_all_reduce_task = false;
if (is_all_reduce && is_insert_bp_profiling_task) {
all_reduce_task_idx = all_reduce_node_idx;
is_insert_all_reduce_task = true;
}
for (size_t i = 0; i < all_reduce_nodes.size(); i++) {
if (all_reduce_nodes[i] != node_index) {
continue;
if (all_reduce_nodes[i] == node_index) {
all_reduce_task_idx = i;
is_insert_all_reduce_task = true;
break;
}
}
if (is_insert_all_reduce_task) {
GELOGI("The end allreduce operator is %s, idx %u", op_desc->GetName().c_str(), node_index);
TaskDef ar_task_def;
ar_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE);
ar_task_def.set_stream_id(op_desc->GetStreamId());
LogTimeStampDef *ar_log_def = ar_task_def.mutable_log_timestamp();
GE_CHECK_NOTNULL(ar_log_def);
GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(i, kProfilingArStep),
GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(all_reduce_task_idx, kProfilingArStep),
GELOGE(FAILED, "Multiply result is out of range.");
return FAILED);
auto log_id = i * kProfilingArStep + kProfilingArEndLogid;
auto log_id = all_reduce_task_idx * kProfilingArStep + kProfilingArEndLogid;
ar_log_def->set_logid(log_id);
ar_log_def->set_notify(false);
task_def_list.emplace_back(ar_task_def);

@ -51,6 +51,7 @@ struct FusionTaskInfo {
std::map<uint32_t, string> &op_name_map;
ProfilingPoint &profiling_point;
vector<uint32_t> all_reduce_nodes;
uint64_t all_reduce_node_idx;
};
class TaskGenerator {
@ -76,6 +77,8 @@ class TaskGenerator {
///
Status GetTaskInfo(Model &model, ComputeGraphPtr &graph, uint64_t session_id, RunContext &run_context);
Status FindProfilingNodeIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point,
std::vector<uint32_t> &all_reduce_nodes);
private:
Status UpdateAnchorStatus(const NodePtr &node);
@ -126,10 +129,10 @@ class TaskGenerator {
std::vector<uint32_t> &all_reduce_nodes) const;
Status InsertProfilingTaskBefore(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point,
std::vector<uint32_t> &all_reduce_nodes, uint32_t node_index,
std::vector<domi::TaskDef> &task_def_list);
std::vector<domi::TaskDef> &task_def_list, uint64_t &all_reduce_node_idx);
Status InsertProfilingTaskAfter(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point,
std::vector<uint32_t> &all_reduce_nodes, uint32_t node_index,
std::vector<domi::TaskDef> &task_def_list);
std::vector<domi::TaskDef> &task_def_list, uint64_t all_reduce_node_idx);
static bool IsProfPoint(const OpDescPtr &op, const std::string &name);

@ -3102,6 +3102,8 @@ Status DavinciModel::DistributeTask() {
task_desc_info.stream_id = task->GetStreamId();
task_desc_info.shape_type = "static";
task_desc_info.cur_iter_num = 0;
profiler_report_op_info_[task_desc_info.op_name] =
std::pair<uint32_t, uint32_t>(task_desc_info.task_id, task_desc_info.stream_id);
task_desc_info_.emplace_back(task_desc_info);
if (flag) {
if (task->GetSktTaskID() != 0xFFFFFFFF) {
@ -3109,6 +3111,8 @@ Status DavinciModel::DistributeTask() {
string op_name = "super_kernel_" + to_string(task_index);
task_desc_info.op_name = op_name;
task_desc_info.task_id = task->GetSktTaskID();
profiler_report_op_info_[task_desc_info.op_name] =
std::pair<uint32_t, uint32_t>(task_desc_info.task_id, task_desc_info.stream_id);
task_desc_info_.emplace_back(task_desc_info);
}
}
@ -3980,7 +3984,15 @@ Status DavinciModel::GetComputeGraphInfo(vector<ComputeGraphDescInfo> &graph_des
compute_graph_info.output_format = op_desc.output_format;
compute_graph_info.output_shape = op_desc.output_shape;
compute_graph_info.output_data_type = op_desc.output_data_type;
uint32_t task_id = 0;
uint32_t stream_id = 0;
auto iter = profiler_report_op_info_.find(op_desc.op_name);
if (iter != profiler_report_op_info_.end()) {
task_id = iter->second.first;
stream_id = iter->second.second;
}
compute_graph_info.task_id = task_id;
compute_graph_info.stream_id = stream_id;
graph_desc_info.emplace_back(compute_graph_info);
}
return SUCCESS;

@ -962,6 +962,8 @@ class DavinciModel {
// for profiling task and graph info
vector<TaskDescInfo> task_desc_info_;
std::map<std::string, std::pair<uint32_t, uint32_t>> profiler_report_op_info_;
int64_t maxDumpOpNum_;
// for data dump
DataDumper data_dumper_;

@ -221,6 +221,8 @@ Status NodeDoneCallback::GetGraphDescInfo(const NodePtr node, const HybridModel
tmp_compute_graph_info.output_shape.emplace_back(output_desc.GetShape().GetDims());
tmp_compute_graph_info.output_data_type.emplace_back(output_desc.GetDataType());
}
tmp_compute_graph_info.task_id = context_->GetTaskId();
tmp_compute_graph_info.stream_id = context_->GetStreamId();
compute_graph_info.emplace_back(tmp_compute_graph_info);
GELOGD("GetComputeGraphInfo of node [%s] end.", node->GetName().c_str());
}

@ -35,11 +35,22 @@
namespace ge {
namespace hybrid {
using domi::LogTimeStampDef;
using domi::TaskDef;
namespace {
const uint32_t kSubgraphIndex = 0U;
const uint32_t kVarOutputIndex = 0U;
// Log ids written into the trace tasks built by GenerateFpProfilingTask, GenerateBpProfilingTask and
// GenerateEndProfilingTask respectively.
const uint64_t kProfilingFpStartLogid = 1U;
const uint64_t kProfilingBpEndLogid = 2U;
const uint64_t kProfilingIterEndLogid = 65535U;
const int kBytes = 8;
const char *const kOwnerGraphIsUnknown = "OwnerGraphIsUnknown";
// Name of the ComputeGraph that CreateProfilingNodeBefore/After create to hold the profiling nodes.
const char *const kProfilingGraph = "ProfilingGraph";
// Op names given to the profiling nodes; the all-reduce name is used as a prefix, with the log id appended.
const char *const kProfilingFpNode = "ProfilingFpNode";
const char *const kProfilingBpNode = "ProfilingBpNode";
const char *const kProfilingEndNode = "ProfilingEndNode";
const char *const kProfilingArNode = "ProfilingAllReduceNode";
// Kernel lib name set on every profiling op desc.
const char *const kEngineNameRts = "DNN_VM_RTS_OP_STORE";
Status SetOutputNameAttr(ComputeGraph &graph) {
vector<string> output_names;
@ -1531,6 +1542,188 @@ Status HybridModelBuilder::RecoverGraphUnknownFlag() {
return SUCCESS;
}
// Appends the two profiler-trace tasks that mark the start of forward propagation.
//
// The first task carries the current trace id from ge::GetContext(), the second carries
// kProfilingFpStartLogid. Both use the stream id of op_desc and have notify disabled.
//
// @param op_desc        op the trace tasks are generated for; dereferenced without a null check.
// @param task_def_list  list the two tasks are appended to, trace-id task first.
// @return always SUCCESS.
Status HybridModelBuilder::GenerateFpProfilingTask(const OpDescPtr &op_desc, vector<domi::TaskDef> &task_def_list) {
  const uint64_t trace_id = ge::GetContext().TraceId();
  GELOGD("The first FP operator is %s,, job_id %lu", op_desc->GetName().c_str(), trace_id);

  // Builds one profiler-trace task with the given log id and appends it to the output list.
  const auto append_trace_task = [&op_desc, &task_def_list](uint64_t log_id) {
    TaskDef trace_task;
    trace_task.set_type(RT_MODEL_TASK_PROFILER_TRACE);
    trace_task.set_stream_id(op_desc->GetStreamId());
    LogTimeStampDef *stamp = trace_task.mutable_log_timestamp();
    if (stamp != nullptr) {
      stamp->set_logid(log_id);
      stamp->set_notify(false);
    }
    task_def_list.emplace_back(trace_task);
  };

  append_trace_task(trace_id);
  append_trace_task(kProfilingFpStartLogid);
  return SUCCESS;
}
// Appends one profiler-trace task for an all-reduce node.
//
// @param op_desc        op whose stream id is stamped on the task; dereferenced without a null check.
// @param log_id         log id written into the task's log timestamp.
// @param task_def_list  list the task is appended to.
// @return always SUCCESS.
Status HybridModelBuilder::GenerateArProfilingTask(const OpDescPtr &op_desc, int64_t log_id,
                                                   vector<domi::TaskDef> &task_def_list) {
  TaskDef trace_task;
  trace_task.set_type(RT_MODEL_TASK_PROFILER_TRACE);
  trace_task.set_stream_id(op_desc->GetStreamId());

  if (LogTimeStampDef *const stamp = trace_task.mutable_log_timestamp()) {
    stamp->set_logid(log_id);
    stamp->set_notify(false);
  }

  task_def_list.push_back(trace_task);
  return SUCCESS;
}
// Appends the profiler-trace task that marks the end of back propagation.
//
// The task carries kProfilingBpEndLogid, uses the stream id of op_desc and has notify disabled.
//
// @param op_desc        op whose stream id is stamped on the task; dereferenced without a null check.
// @param task_def_list  list the task is appended to.
// @return SUCCESS, or the status produced by GE_CHECK_NOTNULL if the log timestamp is null
//         (nothing is appended in that case).
Status HybridModelBuilder::GenerateBpProfilingTask(const OpDescPtr &op_desc, vector<domi::TaskDef> &task_def_list) {
  TaskDef trace_task;
  trace_task.set_type(RT_MODEL_TASK_PROFILER_TRACE);
  trace_task.set_stream_id(op_desc->GetStreamId());

  LogTimeStampDef *stamp = trace_task.mutable_log_timestamp();
  GE_CHECK_NOTNULL(stamp);
  stamp->set_logid(kProfilingBpEndLogid);
  stamp->set_notify(false);

  task_def_list.push_back(trace_task);
  return SUCCESS;
}
// Appends the profiler-trace task that marks the end of an iteration.
//
// The task carries kProfilingIterEndLogid and uses the stream id of op_desc. Unlike the other
// profiling tasks built in this file, notify is set to true.
//
// @param op_desc        op whose stream id is stamped on the task; dereferenced without a null check.
// @param task_def_list  list the task is appended to.
// @return SUCCESS, or the status produced by GE_CHECK_NOTNULL if the log timestamp is null
//         (nothing is appended in that case).
Status HybridModelBuilder::GenerateEndProfilingTask(const OpDescPtr &op_desc, vector<domi::TaskDef> &task_def_list) {
  TaskDef trace_task;
  trace_task.set_type(RT_MODEL_TASK_PROFILER_TRACE);
  trace_task.set_stream_id(op_desc->GetStreamId());

  LogTimeStampDef *stamp = trace_task.mutable_log_timestamp();
  GE_CHECK_NOTNULL(stamp);
  stamp->set_logid(kProfilingIterEndLogid);
  stamp->set_notify(true);

  task_def_list.push_back(trace_task);
  return SUCCESS;
}
// Creates the profiling node item that has to run before `node`, if `node` is tagged for one.
//
// Two attributes on the op desc are consulted:
//   - ATTR_NAME_INSERT_FP_PROFILILNG_TASK: generate the fp-start trace tasks;
//   - ATTR_NAME_INSERT_BP_PROFILILNG_TASK on an HCOMALLREDUCE / HVDCALLBACKALLREDUCE op: generate the
//     all-reduce start trace task, using the log id stored in ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID.
// All generated tasks are stored in hybrid_model_.task_defs_ under the profiling node that was
// created last, and that node's item is appended to graph_item.node_items_.
//
// @param graph_item  graph item that receives the profiling node item.
// @param node        node being loaded; it and its op desc must not be null.
// @return SUCCESS when a node item was appended or none was needed; otherwise the failing status of
//         a null check or of GetOrCreateNodeItem.
Status HybridModelBuilder::CreateProfilingNodeBefore(GraphItem &graph_item, const NodePtr &node) {
  GE_CHECK_NOTNULL(node);
  const OpDescPtr &op_desc = node->GetOpDesc();
  GE_CHECK_NOTNULL(op_desc);
  const auto &compute_graph = MakeShared<ComputeGraph>(kProfilingGraph);
  GE_CHECK_NOTNULL(compute_graph);

  NodePtr profiling_node = nullptr;
  vector<domi::TaskDef> trace_tasks;

  // create fp node
  bool need_fp_task = false;
  (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_FP_PROFILILNG_TASK, need_fp_task);
  if (need_fp_task) {
    (void)GenerateFpProfilingTask(op_desc, trace_tasks);
    auto fp_desc = MakeShared<OpDesc>(kProfilingFpNode, PROFILINGTRAININGTRACE);
    GE_CHECK_NOTNULL(fp_desc);
    fp_desc->SetOpKernelLibName(kEngineNameRts);
    profiling_node = compute_graph->AddNode(fp_desc);
    GELOGD("Create fp profiling node success before.");
  }

  // create all reduce start node
  bool need_bp_task = false;
  (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, need_bp_task);
  const auto &op_type = op_desc->GetType();
  const bool is_all_reduce = (op_type == HCOMALLREDUCE) || (op_type == HVDCALLBACKALLREDUCE);
  if (is_all_reduce && need_bp_task) {
    int64_t log_id = 0;
    (void)ge::AttrUtils::GetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, log_id);
    GELOGD("All reduce node profiling task log id: %ld before", log_id);
    (void)GenerateArProfilingTask(op_desc, log_id, trace_tasks);
    string ar_node_name = string(kProfilingArNode) + std::to_string(log_id);
    auto ar_start_desc = MakeShared<OpDesc>(ar_node_name, PROFILINGTRAININGTRACE);
    GE_CHECK_NOTNULL(ar_start_desc);
    ar_start_desc->SetOpKernelLibName(kEngineNameRts);
    // Overwrites a previously created fp node: only the last node receives the tasks below.
    profiling_node = compute_graph->AddNode(ar_start_desc);
    GELOGD("Create all reduce start profiling node success before.");
  }

  if (profiling_node == nullptr) {
    GELOGD("No need to create profiling node before.");
    return SUCCESS;
  }

  for (const auto &trace_task : trace_tasks) {
    hybrid_model_.task_defs_[profiling_node].emplace_back(trace_task);
  }
  NodeItem *profiling_item = nullptr;
  GE_CHK_STATUS_RET_NOLOG(GetOrCreateNodeItem(profiling_node, &profiling_item));
  profiling_item->input_start = 0;
  profiling_item->output_start = 0;
  graph_item.node_items_.emplace_back(profiling_item);
  return SUCCESS;
}
// Creates the profiling node item that has to run after `node`, if `node` is tagged for one.
//
// Attributes consulted on the op desc:
//   - ATTR_NAME_INSERT_BP_PROFILILNG_TASK: on an HCOMALLREDUCE / HVDCALLBACKALLREDUCE op an
//     all-reduce end trace task is generated with the stored log id plus one; on any other op the
//     bp-end trace task is generated;
//   - ATTR_NAME_INSERT_END_PROFILILNG_TASK: the iteration-end trace task is generated.
// All generated tasks are stored in hybrid_model_.task_defs_ under the profiling node that was
// created last, and that node's item is appended to graph_item.node_items_.
//
// @param graph_item  graph item that receives the profiling node item.
// @param node        node being loaded; it and its op desc must not be null.
// @return SUCCESS when a node item was appended or none was needed; otherwise the failing status of
//         a null check or of GetOrCreateNodeItem.
Status HybridModelBuilder::CreateProfilingNodeAfter(GraphItem &graph_item, const NodePtr &node) {
  GE_CHECK_NOTNULL(node);
  const OpDescPtr &op_desc = node->GetOpDesc();
  GE_CHECK_NOTNULL(op_desc);
  const auto &compute_graph = MakeShared<ComputeGraph>(kProfilingGraph);
  GE_CHECK_NOTNULL(compute_graph);

  NodePtr profiling_node = nullptr;
  vector<domi::TaskDef> trace_tasks;

  bool need_bp_task = false;
  (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, need_bp_task);
  const auto &op_type = op_desc->GetType();
  const bool is_all_reduce = (op_type == HCOMALLREDUCE) || (op_type == HVDCALLBACKALLREDUCE);
  if (need_bp_task) {
    if (is_all_reduce) {
      // Create all reduce end node
      int64_t log_id = 0;
      (void)ge::AttrUtils::GetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, log_id);
      GELOGD("All reduce node profiling task log id: %ld after", log_id);
      // The end task uses the log id that follows the stored one.
      const int64_t end_log_id = log_id + 1;
      (void)GenerateArProfilingTask(op_desc, end_log_id, trace_tasks);
      string ar_node_name = string(kProfilingArNode) + std::to_string(end_log_id);
      auto ar_end_desc = MakeShared<OpDesc>(ar_node_name, PROFILINGTRAININGTRACE);
      GE_CHECK_NOTNULL(ar_end_desc);
      ar_end_desc->SetOpKernelLibName(kEngineNameRts);
      profiling_node = compute_graph->AddNode(ar_end_desc);
      GELOGD("Create all reduce end profiling node success after.");
    } else {
      // create bp node
      (void)GenerateBpProfilingTask(op_desc, trace_tasks);
      auto bp_desc = MakeShared<OpDesc>(kProfilingBpNode, PROFILINGTRAININGTRACE);
      GE_CHECK_NOTNULL(bp_desc);
      bp_desc->SetOpKernelLibName(kEngineNameRts);
      profiling_node = compute_graph->AddNode(bp_desc);
      GELOGD("Create bp profiling node success after.");
    }
  }

  // create end node
  bool need_end_task = false;
  (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_END_PROFILILNG_TASK, need_end_task);
  if (need_end_task) {
    (void)GenerateEndProfilingTask(op_desc, trace_tasks);
    auto end_desc = MakeShared<OpDesc>(kProfilingEndNode, PROFILINGTRAININGTRACE);
    GE_CHECK_NOTNULL(end_desc);
    end_desc->SetOpKernelLibName(kEngineNameRts);
    // Overwrites a previously created node: only the last node receives the tasks below.
    profiling_node = compute_graph->AddNode(end_desc);
    GELOGD("Create end profiling node success after.");
  }

  if (profiling_node == nullptr) {
    GELOGD("No need to create profiling node after.");
    return SUCCESS;
  }

  for (const auto &trace_task : trace_tasks) {
    hybrid_model_.task_defs_[profiling_node].emplace_back(trace_task);
  }
  NodeItem *profiling_item = nullptr;
  GE_CHK_STATUS_RET_NOLOG(GetOrCreateNodeItem(profiling_node, &profiling_item));
  profiling_item->input_start = 0;
  profiling_item->output_start = 0;
  graph_item.node_items_.emplace_back(profiling_item);
  return SUCCESS;
}
Status HybridModelBuilder::LoadDynamicSubgraph(ComputeGraph &graph, bool is_root_graph) {
GELOGD("Start to load subgraph [%s]", graph.GetName().c_str());
// for known partitioned call, load all nodes
@ -1567,8 +1760,9 @@ Status HybridModelBuilder::LoadDynamicSubgraph(ComputeGraph &graph, bool is_root
graph_item->output_node_ = node_item;
GE_CHK_STATUS_RET_NOLOG(BuildOutputMapping(*graph_item, *node_item, is_root_graph));
}
GE_CHK_STATUS_RET_NOLOG(CreateProfilingNodeBefore(*graph_item, node));
graph_item->node_items_.emplace_back(node_item);
GE_CHK_STATUS_RET_NOLOG(CreateProfilingNodeAfter(*graph_item, node));
// parse var outputs
GE_CHK_STATUS_RET_NOLOG(ParseVarOutputs(*node_item));
GELOGD("NodeItem created: %s", node_item->DebugString().c_str());

@ -79,6 +79,12 @@ class HybridModelBuilder {
Status LoadKnownShapedSubgraph(ComputeGraph &graph, NodeItem *parent_node_item);
Status RecoverGraphUnknownFlag();
Status CheckAicpuOpList();
Status CreateProfilingNodeBefore(GraphItem &graph_item, const NodePtr &node);
Status CreateProfilingNodeAfter(GraphItem &graph_item, const NodePtr &node);
Status GenerateFpProfilingTask(const OpDescPtr &op_desc, vector<domi::TaskDef> &task_def_list);
Status GenerateBpProfilingTask(const OpDescPtr &op_desc, vector<domi::TaskDef> &task_def_list);
Status GenerateEndProfilingTask(const OpDescPtr &op_desc, vector<domi::TaskDef> &task_def_list);
Status GenerateArProfilingTask(const OpDescPtr &op_desc, int64_t log_id, vector<domi::TaskDef> &task_def_list);
const char* GetGraphName() const {
return hybrid_model_.model_name_.c_str();

@ -18,6 +18,7 @@
#include "common/debug/log.h"
#include "common/ge/ge_util.h"
#include "graph/utils/tensor_utils.h"
#include "hybrid/model/hybrid_model.h"
#include "runtime/rt.h"
namespace ge {
@ -79,12 +80,44 @@ Status IdentityNNodeTask::ExecuteAsync(TaskContext &context, std::function<void(
return SUCCESS;
}
// Nothing to update for a profiling trace task; the context is not used.
//
// @return always SUCCESS.
Status ProfilingTraceNodeTask::UpdateArgs(TaskContext &context) {
  (void)context;
  return SUCCESS;
}
// Issues one rtProfilerTrace call per stored task def on the stream of the task context.
//
// The log id, notify flag and flat value are read from each task def's log timestamp. Processing
// stops at the first runtime failure.
//
// @param context        supplies the stream and the node name used for logging.
// @param done_callback  not used by this function.
//                       NOTE(review): the callback is neither invoked nor registered here; confirm
//                       the executor does not depend on it for this node type.
// @return SUCCESS when every trace call succeeded, otherwise the GE status converted from the
//         failing runtime error.
Status ProfilingTraceNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
  for (const auto &task_def : task_defs_) {
    // Bind by reference: copying the message on every iteration is unnecessary.
    const auto &log_time_stamp_def = task_def.log_timestamp();
    const uint64_t log_id = log_time_stamp_def.logid();
    const bool notify = log_time_stamp_def.notify();
    const uint32_t flat = log_time_stamp_def.flat();

    GELOGD("ProfilingTraceTask execute async start. logid = %lu, notify = %d.", log_id, notify);
    rtError_t rt_ret = rtProfilerTrace(log_id, notify, flat, context.GetStream());
    if (rt_ret != RT_ERROR_NONE) {
      GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
      return RT_ERROR_TO_GE_STATUS(rt_ret);
    }
    GELOGD("[%s] ProfilingTraceTask[%lu] execute success.", context.GetNodeName(), log_id);
  }

  return SUCCESS;
}
Status RtsNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node, shared_ptr<NodeTask> &task) const {
GE_CHECK_NOTNULL(node);
auto op_type = node->GetType();
if (op_type == IDENTITY) {
task = MakeShared<IdentityNodeTask>();
} else if (op_type == IDENTITYN) {
task = MakeShared<IdentityNNodeTask>();
} else if (op_type == PROFILINGTRAININGTRACE) {
auto *task_defs = model.GetTaskDefs(node);
if (task_defs == nullptr || task_defs->empty()) {
GELOGE(INTERNAL_ERROR, "Profiling node has no task to execute.");
return INTERNAL_ERROR;
}
task = MakeShared<ProfilingTraceNodeTask>(*task_defs);
} else {
GELOGE(INTERNAL_ERROR, "[%s] Unsupported RTS op type: %s", node->GetName().c_str(), op_type.c_str());
return INTERNAL_ERROR;

@ -18,6 +18,7 @@
#define GE_HYBRID_NODE_EXECUTOR_RTS_RTS_NODE_EXECUTOR_H_
#include "hybrid/node_executor/node_executor.h"
#include "proto/task.pb.h"
namespace ge {
namespace hybrid {
@ -35,6 +36,18 @@ class IdentityNNodeTask : public IdentityNodeTask {
Status ExecuteAsync(TaskContext &context, std::function<void()> done_callback) override;
};
// Node task that replays a list of profiler-trace task defs; ExecuteAsync issues one
// rtProfilerTrace call per stored task def.
class ProfilingTraceNodeTask : public NodeTask {
public:
// Stores a copy of task_defs, so the task does not depend on the lifetime of the caller's vector.
explicit ProfilingTraceNodeTask(const std::vector<domi::TaskDef> &task_defs) : task_defs_(task_defs) {}
~ProfilingTraceNodeTask() override = default;
// Returns SUCCESS without doing anything.
Status UpdateArgs(TaskContext &context) override;
// Issues the stored trace tasks on the context's stream and returns the first failure, if any.
Status ExecuteAsync(TaskContext &context, std::function<void()> done_callback) override;
private:
// Trace task defs to replay, in execution order.
std::vector<domi::TaskDef> task_defs_;
};
class RtsNodeExecutor : public NodeExecutor {
public:
Status LoadTask(const HybridModel &model, const NodePtr &node, shared_ptr<NodeTask> &task) const override;

@ -123,7 +123,7 @@ class TaskContext {
Status status_ = SUCCESS;
std::vector<void *> workspaces_;
uint64_t iteration_ = 0;
uint32_t task_id_= 0;
uint32_t task_id_ = 0;
uint32_t stream_id_ = 0;
};
} // namespace hybrid

@ -263,6 +263,8 @@ struct ComputeGraphDescInfo {
std::vector<Format> output_format;
std::vector<std::vector<int64_t>> output_shape;
std::vector<DataType> output_data_type;
uint32_t task_id;
uint32_t stream_id;
};
struct OpDescInfo {

@ -529,6 +529,9 @@ REGISTER_OPTYPE_DECLARE(HVDWAIT, "HorovodWait");
// aicpu op for online_infer dynamic_dims
REGISTER_OPTYPE_DECLARE(GETDYNAMICDIMS, "GetDynamicDims");
// profiling training trace node
REGISTER_OPTYPE_DECLARE(PROFILINGTRAININGTRACE, "ProfilingTrainingTrace");
enum InputMode { INPUT = 0, CONST_INPUT };
// Definition of the processing status enum of the process module

Loading…
Cancel
Save