!8787 Fix the bug of gpu step trace feature did not get all the op name

From: @gzhcv
Reviewed-by: @wangyue01,@jonyguo
Signed-off-by: @jonyguo
pull/8787/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit a354cd90d3

@ -275,7 +275,6 @@ void DataSaver::WriteStepTrace(const std::string &saver_base_dir) {
// write step trace time info into file // write step trace time info into file
uint32_t factor = 10; uint32_t factor = 10;
std::vector<std::string> op_name_arr; std::vector<std::string> op_name_arr;
op_name_arr.push_back(step_trace_op_name.trace_iter_start);
op_name_arr.push_back(step_trace_op_name.trace_fp_start); op_name_arr.push_back(step_trace_op_name.trace_fp_start);
op_name_arr.push_back(step_trace_op_name.trace_bp_end); op_name_arr.push_back(step_trace_op_name.trace_bp_end);
op_name_arr.push_back(step_trace_op_name.trace_iter_end); op_name_arr.push_back(step_trace_op_name.trace_iter_end);

@ -27,11 +27,10 @@ namespace gpu {
constexpr char kFpStartNode[] = "PROFILING_FP_START"; constexpr char kFpStartNode[] = "PROFILING_FP_START";
constexpr char kBpEndNode[] = "PROFILING_BP_END"; constexpr char kBpEndNode[] = "PROFILING_BP_END";
constexpr char kIterEndNode[] = "PROFILING_ITER_END"; constexpr char kIterEndNode[] = "PROFILING_ITER_END";
constexpr int fpExistGraphId = 2; constexpr auto kInitDatasetQueueOpName = "InitDataSetQueue";
uint32_t ProfilingUtils::last_graph_id = 0;
bool ProfilingUtils::have_communication_op = false; bool ProfilingUtils::have_communication_op = false;
ProfilingTraceInfo ProfilingUtils::profiling_trace = {"", "", "", "", false}; ProfilingTraceInfo ProfilingUtils::profiling_trace = {"", "", ""};
ProfilingTraceInfo ProfilingUtils::GetProfilingTraceFromEnv(NotNull<const session::KernelGraph *> graph_ptr) { ProfilingTraceInfo ProfilingUtils::GetProfilingTraceFromEnv(NotNull<const session::KernelGraph *> graph_ptr) {
MS_LOG(INFO) << "get current subgraph op name start."; MS_LOG(INFO) << "get current subgraph op name start.";
@ -39,30 +38,21 @@ ProfilingTraceInfo ProfilingUtils::GetProfilingTraceFromEnv(NotNull<const sessio
if (cnode_exec_order.empty()) { if (cnode_exec_order.empty()) {
return profiling_trace; return profiling_trace;
} }
uint32_t current_graph_id = graph_ptr->graph_id();
// current graph id less than last graph id indicates all subgraph have called.
if (current_graph_id < last_graph_id) {
profiling_trace.IsFirstStepEnd = true;
OutputStepTraceOpNameStatus();
return profiling_trace;
}
SetTraceIterStart(cnode_exec_order);
SetTraceIterEnd(cnode_exec_order); SetTraceIterEnd(cnode_exec_order);
SetTraceFpStart(cnode_exec_order, current_graph_id); SetTraceFpStart(cnode_exec_order);
SetTraceBpEnd(cnode_exec_order); SetTraceBpEnd(cnode_exec_order);
GetTraceHccl(cnode_exec_order); GetTraceHccl(cnode_exec_order);
last_graph_id = current_graph_id; OutputStepTraceOpNameStatus();
return profiling_trace; return profiling_trace;
} }
void ProfilingUtils::OutputStepTraceOpNameStatus() { void ProfilingUtils::OutputStepTraceOpNameStatus() {
if (!profiling_trace.IsValid()) { if (profiling_trace.IsValid()) {
MS_LOG(ERROR) << "Did not get all the step_trace op name."; MS_LOG(INFO) << "Get all the step_trace op name.";
} }
MS_LOG(INFO) << "[profiling]trace_iter_start: " << profiling_trace.trace_iter_start MS_LOG(INFO) << "[profiling]trace_fp_start: " << profiling_trace.trace_fp_start
<< "trace_fp_start: " << profiling_trace.trace_fp_start
<< "trace_bp_end: " << profiling_trace.trace_bp_end << "trace_bp_end: " << profiling_trace.trace_bp_end
<< "trace_iter_end: " << profiling_trace.trace_iter_end; << "trace_iter_end: " << profiling_trace.trace_iter_end;
MS_LOG(INFO) << "get step_trace op name end."; MS_LOG(INFO) << "get step_trace op name end.";
@ -81,19 +71,7 @@ void ProfilingUtils::GetTraceHccl(const std::vector<CNodePtr> &cnode_exec_order)
} }
} }
void ProfilingUtils::SetTraceIterStart(const std::vector<CNodePtr> &cnode_exec_order) { void ProfilingUtils::SetTraceFpStart(const std::vector<CNodePtr> &cnode_exec_order) {
if (!profiling_trace.trace_iter_start.empty()) {
return;
}
auto first_node = cnode_exec_order.front();
MS_EXCEPTION_IF_NULL(first_node);
if (AnfAlgo::GetCNodeName(first_node) == kGetNextOpName) {
profiling_trace.trace_iter_start = first_node->fullname_with_scope();
}
}
void ProfilingUtils::SetTraceFpStart(const std::vector<CNodePtr> &cnode_exec_order, uint32_t graph_id) {
if (!profiling_trace.trace_fp_start.empty()) { if (!profiling_trace.trace_fp_start.empty()) {
return; return;
} }
@ -105,9 +83,20 @@ void ProfilingUtils::SetTraceFpStart(const std::vector<CNodePtr> &cnode_exec_ord
return; return;
} }
if (graph_id == fpExistGraphId) { auto first_node = cnode_exec_order.front();
auto first_node = cnode_exec_order.front(); MS_EXCEPTION_IF_NULL(first_node);
MS_EXCEPTION_IF_NULL(first_node); auto node_name = AnfAlgo::GetCNodeName(first_node);
if (node_name == kInitDatasetQueueOpName) {
return;
}
if (node_name == kGetNextOpName) {
if (cnode_exec_order.size() > 1) {
profiling_trace.trace_fp_start = cnode_exec_order.at(1)->fullname_with_scope();
} else {
MS_LOG(ERROR) << "No Op Behind the GetNext Op" << std::endl;
}
} else {
profiling_trace.trace_fp_start = first_node->fullname_with_scope(); profiling_trace.trace_fp_start = first_node->fullname_with_scope();
} }
} }

@ -30,23 +30,17 @@ namespace profiler {
namespace gpu { namespace gpu {
struct ProfilingTraceInfo { struct ProfilingTraceInfo {
// support get all the op name from environment variable // support get all the op name from environment variable
// iteration start op is GetNext // fp start op is the first op in all subgraph except data related op
std::string trace_iter_start;
// fp start op is the first op of Graph that ID is 2
std::string trace_fp_start; std::string trace_fp_start;
// bp end op is the input node op of the last communication op (if exist) // bp end op is the input node op of the last communication op (if exist)
std::string trace_bp_end; std::string trace_bp_end;
// iteration end op is the last executed op // iteration end op is the last executed op
std::string trace_iter_end; std::string trace_iter_end;
bool IsFirstStepEnd;
// profiling specific op, such as AllReduce; // profiling specific op, such as AllReduce;
std::vector<std::string> trace_custom_node; std::vector<std::string> trace_custom_node;
bool IsValid() const { bool IsValid() const { return !(trace_fp_start.empty() || trace_bp_end.empty() || trace_iter_end.empty()); }
return !(trace_iter_start.empty() || trace_fp_start.empty() || trace_bp_end.empty() || trace_iter_end.empty());
}
}; };
class ProfilingUtils { class ProfilingUtils {
@ -61,13 +55,11 @@ class ProfilingUtils {
static ProfilingTraceInfo GetProfilingTraceFromEnv(NotNull<const session::KernelGraph *> graph_ptr); static ProfilingTraceInfo GetProfilingTraceFromEnv(NotNull<const session::KernelGraph *> graph_ptr);
static void OutputStepTraceOpNameStatus(); static void OutputStepTraceOpNameStatus();
static uint32_t last_graph_id;
static bool have_communication_op; static bool have_communication_op;
static ProfilingTraceInfo profiling_trace; static ProfilingTraceInfo profiling_trace;
private: private:
static void SetTraceIterStart(const std::vector<CNodePtr> &cnode_exec_order); static void SetTraceFpStart(const std::vector<CNodePtr> &cnode_exec_order);
static void SetTraceFpStart(const std::vector<CNodePtr> &cnode_exec_order, uint32_t graph_id);
static void SetTraceBpEnd(const std::vector<CNodePtr> &cnode_exec_order); static void SetTraceBpEnd(const std::vector<CNodePtr> &cnode_exec_order);
static void SetTraceIterEnd(const std::vector<CNodePtr> &cnode_exec_order); static void SetTraceIterEnd(const std::vector<CNodePtr> &cnode_exec_order);
static std::string GetGraphSecondLastKernelName(const std::vector<CNodePtr> &cnode_exec_order); static std::string GetGraphSecondLastKernelName(const std::vector<CNodePtr> &cnode_exec_order);

@ -587,14 +587,10 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance(); auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance();
MS_EXCEPTION_IF_NULL(profiler_inst); MS_EXCEPTION_IF_NULL(profiler_inst);
static bool FlagGetStepTraceOpName = false; if (is_first_step_map_[graph->graph_id()]) {
if (!FlagGetStepTraceOpName) {
profiler::gpu::ProfilingTraceInfo profiling_trace = profiler::gpu::ProfilingTraceInfo profiling_trace =
profiler::gpu::ProfilingUtils::GetProfilingTraceFromEnv(NOT_NULL(graph)); profiler::gpu::ProfilingUtils::GetProfilingTraceFromEnv(NOT_NULL(graph));
if (profiling_trace.IsFirstStepEnd) { profiler_inst->SetStepTraceOpName(profiling_trace);
FlagGetStepTraceOpName = true;
profiler_inst->SetStepTraceOpName(profiling_trace);
}
} }
for (const auto &kernel : kernels) { for (const auto &kernel : kernels) {

@ -43,6 +43,7 @@ class BaseStepTraceParser:
job_id (int): The job id used to define the start of new step. Default: 0. job_id (int): The job id used to define the start of new step. Default: 0.
skip_first_step (bool): Whether skip the first step or not. skip_first_step (bool): Whether skip the first step or not.
""" """
def __init__(self, input_dir, output_file_path, job_id=0, skip_first_step=False): def __init__(self, input_dir, output_file_path, job_id=0, skip_first_step=False):
self._input_dir = input_dir self._input_dir = input_dir
self._output_path = output_file_path self._output_path = output_file_path
@ -333,6 +334,7 @@ class BaseStepTraceParser:
class GpuStepTraceParser(BaseStepTraceParser): class GpuStepTraceParser(BaseStepTraceParser):
"""The parser for gpu step trace data.""" """The parser for gpu step trace data."""
def record_point_info(self, source_file, output_path): def record_point_info(self, source_file, output_path):
""" """
Record point info into json. Record point info into json.
@ -344,7 +346,7 @@ class GpuStepTraceParser(BaseStepTraceParser):
Returns: Returns:
dict, parsed point info. dict, parsed point info.
""" """
fp_start, bp_end = 1, 2 fp_start, bp_end = 0, 1
try: try:
with open(source_file, 'r') as f: with open(source_file, 'r') as f:
lines = f.readlines() lines = f.readlines()
@ -374,7 +376,7 @@ class GpuStepTraceParser(BaseStepTraceParser):
def _parse(self, source_file): def _parse(self, source_file):
"""Parse source step trace files.""" """Parse source step trace files."""
log.info("Start to parse step trace file.") log.info("Start to parse step trace file.")
iter_start, fp_start, bp_end, iter_end = 0, 1, 2, 3 fp_start, bp_end, iter_end, iter_start = 0, 1, 2, 3
reduce_start = 4 reduce_start = 4
start_time, end_time = 0, 1 start_time, end_time = 0, 1
@ -382,15 +384,16 @@ class GpuStepTraceParser(BaseStepTraceParser):
try: try:
with open(source_file, 'r') as f: with open(source_file, 'r') as f:
lines = f.readlines() lines = f.readlines()
step_trace_info_all = [line.strip().split() for line in lines] step_trace_info_all = [line.strip().split()[1:] for line in lines]
num_of_step = len(step_trace_info_all[0]) num_of_step = len(step_trace_info_all[0])
# in callback mode that set the profiling step range, each op count is not equal iter_start_info = [step_trace_info_all[fp_start][0]] + \
step_trace_info_all = [line[-num_of_step:] for line in step_trace_info_all] step_trace_info_all[iter_end][:num_of_step]
step_trace_info_all.insert(iter_start, iter_start_info)
except (IOError, OSError) as err: except (IOError, OSError) as err:
log.warning(f'Failed to read {source_file}', err) log.warning(f'Failed to read {source_file}', err)
raise ProfilerIOException raise ProfilerIOException
for step_num in range(1, num_of_step): for step_num in range(num_of_step):
step_trace = { step_trace = {
'start': int(step_trace_info_all[iter_start][step_num].split(',')[start_time]), 'start': int(step_trace_info_all[iter_start][step_num].split(',')[start_time]),
'fp': int(step_trace_info_all[fp_start][step_num].split(',')[start_time]), 'fp': int(step_trace_info_all[fp_start][step_num].split(',')[start_time]),

Loading…
Cancel
Save