From de3a653c57611c340349ec8fc53a8c5b3a9752eb Mon Sep 17 00:00:00 2001
From: gzhcv
Date: Mon, 7 Dec 2020 10:27:59 +0800
Subject: [PATCH] fix incorrect step_trace content in the inference scenario

---
 .../ccsrc/profiler/device/gpu/data_saver.cc   |  2 +-
 .../device/gpu/gpu_profiling_utils.cc         |  1 -
 .../runtime/device/gpu/gpu_kernel_runtime.cc  |  2 +-
 mindspore/profiler/common/util.py             | 26 ++++++---
 mindspore/profiler/parser/integrator.py       | 20 ++++++-
 .../profiler/parser/step_trace_parser.py      | 58 ++++++++++++++-----
 mindspore/profiler/profiling.py               | 16 +++--
 7 files changed, 92 insertions(+), 33 deletions(-)

diff --git a/mindspore/ccsrc/profiler/device/gpu/data_saver.cc b/mindspore/ccsrc/profiler/device/gpu/data_saver.cc
index fb74820538..5703c9e0e8 100644
--- a/mindspore/ccsrc/profiler/device/gpu/data_saver.cc
+++ b/mindspore/ccsrc/profiler/device/gpu/data_saver.cc
@@ -273,7 +273,7 @@ void DataSaver::WriteStepTrace(const std::string &saver_base_dir) {
   }
 
   // write step trace time info into file
-  uint32_t factor = 10;
+  const uint32_t factor = 10;
   std::vector<std::string> op_name_arr;
   op_name_arr.push_back(step_trace_op_name.trace_fp_start);
   op_name_arr.push_back(step_trace_op_name.trace_bp_end);
diff --git a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling_utils.cc b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling_utils.cc
index b8a7a59a5a..b0bfb40a0c 100644
--- a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling_utils.cc
+++ b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling_utils.cc
@@ -170,7 +170,6 @@ std::string ProfilingUtils::GetGraphSecondLastKernelName(const std::vectorgraph_id()]) {
+  if (profiler_inst->GetEnableFlag() && is_first_step_map_[graph->graph_id()]) {
     profiler::gpu::ProfilingTraceInfo profiling_trace =
       profiler::gpu::ProfilingUtils::GetProfilingTraceFromEnv(NOT_NULL(graph));
     profiler_inst->SetStepTraceOpName(profiling_trace);
diff --git a/mindspore/profiler/common/util.py b/mindspore/profiler/common/util.py
index 180d163ff2..3860597069 100644
--- a/mindspore/profiler/common/util.py
+++ b/mindspore/profiler/common/util.py
@@ -233,24 +233,34 @@ def query_step_trace_file(profiler_dir):
     return None
 
 
-def get_summary_for_step_trace(average_info, header):
+def get_summary_for_step_trace(average_info, header, is_training_mode=True):
     """The property of summary info."""
     if not average_info or not header:
         return {}
     total_time = get_field_value(average_info, 'total', header)
     iteration_interval = get_field_value(average_info, 'iteration_interval', header)
-    fp_and_bp = get_field_value(average_info, 'fp_and_bp', header)
-    tail = get_field_value(average_info, 'tail', header)
-    summary = {
+    summary_part = {
         'total_time': total_time,
         'iteration_interval': iteration_interval,
         'iteration_interval_percent': calculate_percent(iteration_interval, total_time),
-        'fp_and_bp': fp_and_bp,
-        'fp_and_bp_percent': calculate_percent(fp_and_bp, total_time),
-        'tail': tail,
-        'tail_percent': calculate_percent(tail, total_time)
     }
+    if is_training_mode:
+        fp_and_bp = get_field_value(average_info, 'fp_and_bp', header)
+        tail = get_field_value(average_info, 'tail', header)
+        summary = {
+            'fp_and_bp': fp_and_bp,
+            'fp_and_bp_percent': calculate_percent(fp_and_bp, total_time),
+            'tail': tail,
+            'tail_percent': calculate_percent(tail, total_time)
+        }
+    else:
+        fp = get_field_value(average_info, 'fp', header)
+        summary = {
+            'fp': fp,
+            'fp_percent': calculate_percent(fp, total_time)
+        }
+    summary.update(summary_part)
     return summary
 
 
diff --git a/mindspore/profiler/parser/integrator.py b/mindspore/profiler/parser/integrator.py
index ce96f77b51..92767d2a6b 100644
--- a/mindspore/profiler/parser/integrator.py
+++ b/mindspore/profiler/parser/integrator.py
@@ -21,7 +21,7 @@ from decimal import Decimal
 
 from mindspore import log as logger
 from mindspore.profiler.common.exceptions.exceptions import ProfilerIOException, \
-    ProfilerFileNotFoundException, ProfilerRawFileException
+    ProfilerFileNotFoundException, ProfilerRawFileException, ProfilerParamValueErrorException
 from mindspore.profiler.common.util import query_latest_trace_time_file, to_int, to_millisecond
 from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path
 from mindspore.profiler.parser.container import TimelineContainer
@@ -776,6 +776,24 @@ class GpuTimelineGenerator(BaseTimelineGenerator):
         # Update timeline summary info
         self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys())
 
+    def check_op_name(self, op_name):
+        """
+        Check whether the operator name exists.
+
+        Args:
+            op_name (str): The operator name or operator name prefix.
+
+        Returns:
+            bool, `True` if the operator name does exist, else `False`.
+        """
+        if not op_name:
+            raise ProfilerParamValueErrorException('The op_name should exist.')
+        for op_time_info in self._timeline_meta:
+            full_op_name = op_time_info['name']
+            if full_op_name and full_op_name.startswith(op_name):
+                return True
+        return False
+
 class AscendTimelineGenerator(BaseTimelineGenerator):
     """Generate ascend Timeline data from file."""
     _display_filename = 'ascend_timeline_display_{}.json'
diff --git a/mindspore/profiler/parser/step_trace_parser.py b/mindspore/profiler/parser/step_trace_parser.py
index 22875fb6ad..a61537bea6 100644
--- a/mindspore/profiler/parser/step_trace_parser.py
+++ b/mindspore/profiler/parser/step_trace_parser.py
@@ -42,9 +42,10 @@ class BaseStepTraceParser:
         output_file_path (str): The output file path.
         job_id (int): The job id used to define the start of new step. Default: 0.
         skip_first_step (bool): Whether skip the first step or not.
+        is_training_mode (bool): Whether in training mode or not.
     """
 
-    def __init__(self, input_dir, output_file_path, job_id=0, skip_first_step=False):
+    def __init__(self, input_dir, output_file_path, job_id=0, skip_first_step=False, is_training_mode=True):
         self._input_dir = input_dir
         self._output_path = output_file_path
         self._job_id = job_id
@@ -53,6 +54,7 @@ class BaseStepTraceParser:
         self._header = []
         self._step_num = 0
         self._tag_map = {}
+        self._is_training_mode = is_training_mode
 
     @property
     def output_file(self):
@@ -64,7 +66,7 @@ class BaseStepTraceParser:
         """The property of step trace info."""
         summary_info = {}
         if self._result:
-            summary_info = get_summary_for_step_trace(self._result[-1], self._header)
+            summary_info = get_summary_for_step_trace(self._result[-1], self._header, self._is_training_mode)
             summary_info['total_steps'] = len(self._result) - 1
         print('\nStep trace summary info (unit: syscnt):')
         print(summary_info)
@@ -321,15 +323,27 @@ class BaseStepTraceParser:
         log.info("Finish add average info for step trace.")
 
     def _save(self):
+        """Save step trace file."""
+        BP_POINT, TAIL, FP_DURATION = 5, -1, -2
         log.info("Start to save step trace file.")
         if not self._header:
             return
-        with open(self._output_path, 'w') as file_handle:
-            csv_writer = csv.writer(file_handle)
-            csv_writer.writerow(self._header)
-            for row_data in self._result:
-                csv_writer.writerow(row_data)
-        os.chmod(self._output_path, stat.S_IRUSR)
+        try:
+            with open(self._output_path, 'w') as file_handle:
+                csv_writer = csv.writer(file_handle)
+                if not self._is_training_mode:
+                    self._header[FP_DURATION] = 'fp'
+                    self._header = self._header[:BP_POINT] + self._header[BP_POINT+1:TAIL]
+                csv_writer.writerow(self._header)
+                for row_data in self._result:
+                    if not self._is_training_mode:
+                        row_data[FP_DURATION] += row_data[TAIL]
+                        row_data = row_data[:BP_POINT] + row_data[BP_POINT+1:TAIL]
+                    csv_writer.writerow(row_data)
+            os.chmod(self._output_path, stat.S_IRUSR)
+        except (IOError, OSError) as err:
+            log.warning('Failed to save step trace raw info. %s', err)
+            raise ProfilerIOException
 
 
 class GpuStepTraceParser(BaseStepTraceParser):
@@ -356,10 +370,16 @@ class GpuStepTraceParser(BaseStepTraceParser):
             log.warning(f'Failed to read {source_file}', err)
             raise ProfilerIOException
 
-        points = {
-            'fp_start': fp_start_name,
-            'bp_end': bp_end_name
-        }
+        if self._is_training_mode:
+            points = {
+                'fp_start': fp_start_name,
+                'bp_end': bp_end_name
+            }
+        else:
+            points = {
+                'fp_start': fp_start_name,
+            }
+
         try:
             with open(output_path, 'w') as json_file:
                 json.dump(points, json_file)
@@ -456,10 +476,16 @@ class AscendStepTraceParser(BaseStepTraceParser):
         Returns:
             dict, parsed point info.
         """
-        points = {
-            'fp_start': point_info.get(self._fp_tag, ''),
-            'bp_end': point_info.get(self._bp_tag, '')
-        }
+        if self._is_training_mode:
+            points = {
+                'fp_start': point_info.get(self._fp_tag, ''),
+                'bp_end': point_info.get(self._bp_tag, '')
+            }
+        else:
+            points = {
+                'fp_start': point_info.get(self._fp_tag, ''),
+            }
+
         try:
             with open(output_path, 'w') as json_file:
                 json.dump(points, json_file)
diff --git a/mindspore/profiler/profiling.py b/mindspore/profiler/profiling.py
index 2770bafcc7..e206fee748 100644
--- a/mindspore/profiler/profiling.py
+++ b/mindspore/profiler/profiling.py
@@ -151,7 +151,7 @@ class Profiler:
             logger.error('Please check the Profiler object initialized after set_auto_parallel_context() '
                          'and init(). Profiler should be initialized after these code.')
         self._gpu_profiler.stop()
-        self._generate_timeline()
+        timeline_generator = self._generate_timeline()
 
         # parse minddata pipeline operator and queue for GPU
         try:
@@ -162,7 +162,7 @@ class Profiler:
 
         # analyse step trace info
         try:
-            self._analyse_step_trace()
+            self._analyse_step_trace(is_training_mode_flag=timeline_generator.check_op_name('Gradients'))
         except ProfilerException as err:
             logger.warning(err.message)
 
@@ -239,13 +239,14 @@ class Profiler:
         os.environ['PROFILING_MODE'] = str("false")
         context.set_context(enable_profiling=False)
 
-    def _analyse_step_trace(self, source_path=None, framework_parser=None):
+    def _analyse_step_trace(self, source_path=None, framework_parser=None, is_training_mode_flag=True):
         """
         Analyse step trace data and save the result.
 
         Args:
             source_path (str): The directory that contains the step trace original data.
            framework_parser (FrameworkParser): The framework parse instance.
+            is_training_mode_flag (bool): Whether in training mode or not.
         """
         logger.info("Begin to parse step trace.")
         # construct output path
@@ -266,19 +267,23 @@ class Profiler:
                 f'step_trace_profiling_{self._dev_id}.txt'
             )
             parser = GpuStepTraceParser(input_dir=input_file_path,
-                                        output_file_path=step_trace_intermediate_file_path)
+                                        output_file_path=step_trace_intermediate_file_path,
+                                        is_training_mode=is_training_mode_flag)
             parser.parse_and_save()
             point_info = parser.record_point_info(input_file_path, point_info_file_path)
         else:
             # whether keep the first step
             skip_first_step_flag = framework_parser.check_op_name(INIT_OP_NAME)
             point_info = framework_parser.point_info
+            # recognize inference or training mode
+            is_training_mode_flag = framework_parser.check_op_name("Gradients")
             # parser the step trace files and save the result to disk
             source_path = validate_and_normalize_path(source_path)
             parser = AscendStepTraceParser(input_dir=source_path,
                                            output_file_path=step_trace_intermediate_file_path,
                                            job_id=self._job_id_env,
-                                           skip_first_step=skip_first_step_flag)
+                                           skip_first_step=skip_first_step_flag,
+                                           is_training_mode=is_training_mode_flag)
             parser.update_tag_op_type_map(point_info)
             parser.parse_and_save()
             point_info = parser.record_point_info(point_info, point_info_file_path)
@@ -332,6 +337,7 @@ class Profiler:
             timeline_generator.init_timeline()
             timeline_generator.write_timeline(size_limit)
             timeline_generator.write_timeline_summary()
+            return timeline_generator
         except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
             logger.warning('Fail to write timeline data: %s', err)
             raise RuntimeError('Fail to write timeline data.')
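
For illustration only (not part of the patch): a minimal standalone sketch of the inference-mode reshaping that BaseStepTraceParser._save() performs above. The header layout and the sample row below are hypothetical, chosen only so the column indices line up with the constants used in the patch (BP_POINT = 5, FP_DURATION = -2, TAIL = -1); the real header and rows are produced by the step trace parser itself.

# Hypothetical header/row, laid out so the indices match the patch's constants.
BP_POINT, TAIL, FP_DURATION = 5, -1, -2

header = ['step_num', 'start_point', 'end_point', 'total', 'fp_point',
          'bp_point', 'iteration_interval', 'fp_and_bp', 'tail']
row = [1, 100, 200, 100, 110, 180, 10, 70, 20]

# Inference has no backward pass: rename 'fp_and_bp' to 'fp', fold the tail
# duration into it, and drop the 'bp_point' and 'tail' columns.
header[FP_DURATION] = 'fp'
header = header[:BP_POINT] + header[BP_POINT + 1:TAIL]
row[FP_DURATION] += row[TAIL]
row = row[:BP_POINT] + row[BP_POINT + 1:TAIL]

print(header)  # ['step_num', 'start_point', 'end_point', 'total', 'fp_point', 'iteration_interval', 'fp']
print(row)     # [1, 100, 200, 100, 110, 10, 90]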