From 3d6c1fc5f6fe6c35b31ded926196ec9f4cfb0074 Mon Sep 17 00:00:00 2001 From: gzhcv Date: Thu, 19 Nov 2020 16:49:05 +0800 Subject: [PATCH] fix the bug of step_trace --- .../ccsrc/profiler/device/gpu/data_saver.cc | 1 - .../device/gpu/gpu_profiling_utils.cc | 55 ++++++++----------- .../profiler/device/gpu/gpu_profiling_utils.h | 14 +---- .../runtime/device/gpu/gpu_kernel_runtime.cc | 8 +-- .../profiler/parser/step_trace_parser.py | 15 +++-- 5 files changed, 36 insertions(+), 57 deletions(-) diff --git a/mindspore/ccsrc/profiler/device/gpu/data_saver.cc b/mindspore/ccsrc/profiler/device/gpu/data_saver.cc index cddff4938f..5ccb57cc24 100644 --- a/mindspore/ccsrc/profiler/device/gpu/data_saver.cc +++ b/mindspore/ccsrc/profiler/device/gpu/data_saver.cc @@ -275,7 +275,6 @@ void DataSaver::WriteStepTrace(const std::string &saver_base_dir) { // write step trace time info into file uint32_t factor = 10; std::vector op_name_arr; - op_name_arr.push_back(step_trace_op_name.trace_iter_start); op_name_arr.push_back(step_trace_op_name.trace_fp_start); op_name_arr.push_back(step_trace_op_name.trace_bp_end); op_name_arr.push_back(step_trace_op_name.trace_iter_end); diff --git a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling_utils.cc b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling_utils.cc index 52a1dfa31b..b8a7a59a5a 100644 --- a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling_utils.cc +++ b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling_utils.cc @@ -27,11 +27,10 @@ namespace gpu { constexpr char kFpStartNode[] = "PROFILING_FP_START"; constexpr char kBpEndNode[] = "PROFILING_BP_END"; constexpr char kIterEndNode[] = "PROFILING_ITER_END"; -constexpr int fpExistGraphId = 2; +constexpr auto kInitDatasetQueueOpName = "InitDataSetQueue"; -uint32_t ProfilingUtils::last_graph_id = 0; bool ProfilingUtils::have_communication_op = false; -ProfilingTraceInfo ProfilingUtils::profiling_trace = {"", "", "", "", false}; +ProfilingTraceInfo ProfilingUtils::profiling_trace = {"", "", ""}; ProfilingTraceInfo ProfilingUtils::GetProfilingTraceFromEnv(NotNull graph_ptr) { MS_LOG(INFO) << "get current subgraph op name start."; @@ -39,30 +38,21 @@ ProfilingTraceInfo ProfilingUtils::GetProfilingTraceFromEnv(NotNullgraph_id(); - // current graph id less than last graph id indicates all subgraph have called. - if (current_graph_id < last_graph_id) { - profiling_trace.IsFirstStepEnd = true; - OutputStepTraceOpNameStatus(); - return profiling_trace; - } - SetTraceIterStart(cnode_exec_order); SetTraceIterEnd(cnode_exec_order); - SetTraceFpStart(cnode_exec_order, current_graph_id); + SetTraceFpStart(cnode_exec_order); SetTraceBpEnd(cnode_exec_order); GetTraceHccl(cnode_exec_order); - last_graph_id = current_graph_id; + OutputStepTraceOpNameStatus(); return profiling_trace; } void ProfilingUtils::OutputStepTraceOpNameStatus() { - if (!profiling_trace.IsValid()) { - MS_LOG(ERROR) << "Did not get all the step_trace op name."; + if (profiling_trace.IsValid()) { + MS_LOG(INFO) << "Get all the step_trace op name."; } - MS_LOG(INFO) << "[profiling]trace_iter_start: " << profiling_trace.trace_iter_start - << "trace_fp_start: " << profiling_trace.trace_fp_start + MS_LOG(INFO) << "[profiling]trace_fp_start: " << profiling_trace.trace_fp_start << "trace_bp_end: " << profiling_trace.trace_bp_end << "trace_iter_end: " << profiling_trace.trace_iter_end; MS_LOG(INFO) << "get step_trace op name end."; @@ -81,19 +71,7 @@ void ProfilingUtils::GetTraceHccl(const std::vector &cnode_exec_order) } } -void ProfilingUtils::SetTraceIterStart(const std::vector &cnode_exec_order) { - if (!profiling_trace.trace_iter_start.empty()) { - return; - } - - auto first_node = cnode_exec_order.front(); - MS_EXCEPTION_IF_NULL(first_node); - if (AnfAlgo::GetCNodeName(first_node) == kGetNextOpName) { - profiling_trace.trace_iter_start = first_node->fullname_with_scope(); - } -} - -void ProfilingUtils::SetTraceFpStart(const std::vector &cnode_exec_order, uint32_t graph_id) { +void ProfilingUtils::SetTraceFpStart(const std::vector &cnode_exec_order) { if (!profiling_trace.trace_fp_start.empty()) { return; } @@ -105,9 +83,20 @@ void ProfilingUtils::SetTraceFpStart(const std::vector &cnode_exec_ord return; } - if (graph_id == fpExistGraphId) { - auto first_node = cnode_exec_order.front(); - MS_EXCEPTION_IF_NULL(first_node); + auto first_node = cnode_exec_order.front(); + MS_EXCEPTION_IF_NULL(first_node); + auto node_name = AnfAlgo::GetCNodeName(first_node); + if (node_name == kInitDatasetQueueOpName) { + return; + } + + if (node_name == kGetNextOpName) { + if (cnode_exec_order.size() > 1) { + profiling_trace.trace_fp_start = cnode_exec_order.at(1)->fullname_with_scope(); + } else { + MS_LOG(ERROR) << "No Op Behind the GetNext Op" << std::endl; + } + } else { profiling_trace.trace_fp_start = first_node->fullname_with_scope(); } } diff --git a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling_utils.h b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling_utils.h index 58a1c79572..df2ae9ae20 100644 --- a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling_utils.h +++ b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling_utils.h @@ -30,23 +30,17 @@ namespace profiler { namespace gpu { struct ProfilingTraceInfo { // support get all the op name from environment variable - // iteration start op is GetNext - std::string trace_iter_start; - // fp start op is the first op of Graph that ID is 2 + // fp start op is the first op in all subgraph except data related op std::string trace_fp_start; // bp end op is the input node op of the last communication op (if exist) std::string trace_bp_end; // iteration end op is the last executed op std::string trace_iter_end; - bool IsFirstStepEnd; - // profiling specific op, such as AllReduce; std::vector trace_custom_node; - bool IsValid() const { - return !(trace_iter_start.empty() || trace_fp_start.empty() || trace_bp_end.empty() || trace_iter_end.empty()); - } + bool IsValid() const { return !(trace_fp_start.empty() || trace_bp_end.empty() || trace_iter_end.empty()); } }; class ProfilingUtils { @@ -61,13 +55,11 @@ class ProfilingUtils { static ProfilingTraceInfo GetProfilingTraceFromEnv(NotNull graph_ptr); static void OutputStepTraceOpNameStatus(); - static uint32_t last_graph_id; static bool have_communication_op; static ProfilingTraceInfo profiling_trace; private: - static void SetTraceIterStart(const std::vector &cnode_exec_order); - static void SetTraceFpStart(const std::vector &cnode_exec_order, uint32_t graph_id); + static void SetTraceFpStart(const std::vector &cnode_exec_order); static void SetTraceBpEnd(const std::vector &cnode_exec_order); static void SetTraceIterEnd(const std::vector &cnode_exec_order); static std::string GetGraphSecondLastKernelName(const std::vector &cnode_exec_order); diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc index 3296746b52..b4aed04819 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc @@ -587,14 +587,10 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance(); MS_EXCEPTION_IF_NULL(profiler_inst); - static bool FlagGetStepTraceOpName = false; - if (!FlagGetStepTraceOpName) { + if (is_first_step_map_[graph->graph_id()]) { profiler::gpu::ProfilingTraceInfo profiling_trace = profiler::gpu::ProfilingUtils::GetProfilingTraceFromEnv(NOT_NULL(graph)); - if (profiling_trace.IsFirstStepEnd) { - FlagGetStepTraceOpName = true; - profiler_inst->SetStepTraceOpName(profiling_trace); - } + profiler_inst->SetStepTraceOpName(profiling_trace); } for (const auto &kernel : kernels) { diff --git a/mindspore/profiler/parser/step_trace_parser.py b/mindspore/profiler/parser/step_trace_parser.py index 471c3ed5d8..ddad5367cb 100644 --- a/mindspore/profiler/parser/step_trace_parser.py +++ b/mindspore/profiler/parser/step_trace_parser.py @@ -43,6 +43,7 @@ class BaseStepTraceParser: job_id (int): The job id used to define the start of new step. Default: 0. skip_first_step (bool): Whether skip the first step or not. """ + def __init__(self, input_dir, output_file_path, job_id=0, skip_first_step=False): self._input_dir = input_dir self._output_path = output_file_path @@ -333,6 +334,7 @@ class BaseStepTraceParser: class GpuStepTraceParser(BaseStepTraceParser): """The parser for gpu step trace data.""" + def record_point_info(self, source_file, output_path): """ Record point info into json. @@ -344,7 +346,7 @@ class GpuStepTraceParser(BaseStepTraceParser): Returns: dict, parsed point info. """ - fp_start, bp_end = 1, 2 + fp_start, bp_end = 0, 1 try: with open(source_file, 'r') as f: lines = f.readlines() @@ -374,7 +376,7 @@ class GpuStepTraceParser(BaseStepTraceParser): def _parse(self, source_file): """Parse source step trace files.""" log.info("Start to parse step trace file.") - iter_start, fp_start, bp_end, iter_end = 0, 1, 2, 3 + fp_start, bp_end, iter_end, iter_start = 0, 1, 2, 3 reduce_start = 4 start_time, end_time = 0, 1 @@ -382,15 +384,16 @@ class GpuStepTraceParser(BaseStepTraceParser): try: with open(source_file, 'r') as f: lines = f.readlines() - step_trace_info_all = [line.strip().split() for line in lines] + step_trace_info_all = [line.strip().split()[1:] for line in lines] num_of_step = len(step_trace_info_all[0]) - # in callback mode that set the profiling step range, each op count is not equal - step_trace_info_all = [line[-num_of_step:] for line in step_trace_info_all] + iter_start_info = [step_trace_info_all[fp_start][0]] + \ + step_trace_info_all[iter_end][:num_of_step] + step_trace_info_all.insert(iter_start, iter_start_info) except (IOError, OSError) as err: log.warning(f'Failed to read {source_file}', err) raise ProfilerIOException - for step_num in range(1, num_of_step): + for step_num in range(num_of_step): step_trace = { 'start': int(step_trace_info_all[iter_start][step_num].split(',')[start_time]), 'fp': int(step_trace_info_all[fp_start][step_num].split(',')[start_time]),