Fix the bug of incorrect step_trace content in the inference scene

pull/9665/head
gzhcv 4 years ago
parent 7f8c0cbcf8
commit de3a653c57

@ -273,7 +273,7 @@ void DataSaver::WriteStepTrace(const std::string &saver_base_dir) {
} }
// write step trace time info into file // write step trace time info into file
uint32_t factor = 10; const uint32_t factor = 10;
std::vector<std::string> op_name_arr; std::vector<std::string> op_name_arr;
op_name_arr.push_back(step_trace_op_name.trace_fp_start); op_name_arr.push_back(step_trace_op_name.trace_fp_start);
op_name_arr.push_back(step_trace_op_name.trace_bp_end); op_name_arr.push_back(step_trace_op_name.trace_bp_end);

@ -170,7 +170,6 @@ std::string ProfilingUtils::GetGraphSecondLastKernelName(const std::vector<CNode
return second_last_kernel_name; return second_last_kernel_name;
} }
} // namespace gpu } // namespace gpu
} // namespace profiler } // namespace profiler
} // namespace mindspore } // namespace mindspore

@ -592,7 +592,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance(); auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance();
MS_EXCEPTION_IF_NULL(profiler_inst); MS_EXCEPTION_IF_NULL(profiler_inst);
if (is_first_step_map_[graph->graph_id()]) { if (profiler_inst->GetEnableFlag() && is_first_step_map_[graph->graph_id()]) {
profiler::gpu::ProfilingTraceInfo profiling_trace = profiler::gpu::ProfilingTraceInfo profiling_trace =
profiler::gpu::ProfilingUtils::GetProfilingTraceFromEnv(NOT_NULL(graph)); profiler::gpu::ProfilingUtils::GetProfilingTraceFromEnv(NOT_NULL(graph));
profiler_inst->SetStepTraceOpName(profiling_trace); profiler_inst->SetStepTraceOpName(profiling_trace);

@ -233,24 +233,34 @@ def query_step_trace_file(profiler_dir):
return None return None
def get_summary_for_step_trace(average_info, header): def get_summary_for_step_trace(average_info, header, is_training_mode=True):
"""The property of summary info.""" """The property of summary info."""
if not average_info or not header: if not average_info or not header:
return {} return {}
total_time = get_field_value(average_info, 'total', header) total_time = get_field_value(average_info, 'total', header)
iteration_interval = get_field_value(average_info, 'iteration_interval', iteration_interval = get_field_value(average_info, 'iteration_interval',
header) header)
fp_and_bp = get_field_value(average_info, 'fp_and_bp', header) summary_part = {
tail = get_field_value(average_info, 'tail', header)
summary = {
'total_time': total_time, 'total_time': total_time,
'iteration_interval': iteration_interval, 'iteration_interval': iteration_interval,
'iteration_interval_percent': calculate_percent(iteration_interval, total_time), 'iteration_interval_percent': calculate_percent(iteration_interval, total_time),
'fp_and_bp': fp_and_bp,
'fp_and_bp_percent': calculate_percent(fp_and_bp, total_time),
'tail': tail,
'tail_percent': calculate_percent(tail, total_time)
} }
if is_training_mode:
fp_and_bp = get_field_value(average_info, 'fp_and_bp', header)
tail = get_field_value(average_info, 'tail', header)
summary = {
'fp_and_bp': fp_and_bp,
'fp_and_bp_percent': calculate_percent(fp_and_bp, total_time),
'tail': tail,
'tail_percent': calculate_percent(tail, total_time)
}
else:
fp = get_field_value(average_info, 'fp', header)
summary = {
'fp': fp,
'fp_percent': calculate_percent(fp, total_time)
}
summary.update(summary_part)
return summary return summary

@ -21,7 +21,7 @@ from decimal import Decimal
from mindspore import log as logger from mindspore import log as logger
from mindspore.profiler.common.exceptions.exceptions import ProfilerIOException, \ from mindspore.profiler.common.exceptions.exceptions import ProfilerIOException, \
ProfilerFileNotFoundException, ProfilerRawFileException ProfilerFileNotFoundException, ProfilerRawFileException, ProfilerParamValueErrorException
from mindspore.profiler.common.util import query_latest_trace_time_file, to_int, to_millisecond from mindspore.profiler.common.util import query_latest_trace_time_file, to_int, to_millisecond
from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path
from mindspore.profiler.parser.container import TimelineContainer from mindspore.profiler.parser.container import TimelineContainer
@ -776,6 +776,24 @@ class GpuTimelineGenerator(BaseTimelineGenerator):
# Update timeline summary info # Update timeline summary info
self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys()) self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys())
def check_op_name(self, op_name):
"""
Check whether the operator name exists.
Args:
op_name (str): The operator name or operator name prefix.
Returns:
bool, `True` if the operator name does exist, else `False`.
"""
if not op_name:
raise ProfilerParamValueErrorException('The op_name should exist.')
for op_time_info in self._timeline_meta:
full_op_name = op_time_info['name']
if full_op_name and full_op_name.startswith(op_name):
return True
return False
class AscendTimelineGenerator(BaseTimelineGenerator): class AscendTimelineGenerator(BaseTimelineGenerator):
"""Generate ascend Timeline data from file.""" """Generate ascend Timeline data from file."""
_display_filename = 'ascend_timeline_display_{}.json' _display_filename = 'ascend_timeline_display_{}.json'

@ -42,9 +42,10 @@ class BaseStepTraceParser:
output_file_path (str): The output file path. output_file_path (str): The output file path.
job_id (int): The job id used to define the start of new step. Default: 0. job_id (int): The job id used to define the start of new step. Default: 0.
skip_first_step (bool): Whether skip the first step or not. skip_first_step (bool): Whether skip the first step or not.
is_training_mode (bool): Whether in training mode or not.
""" """
def __init__(self, input_dir, output_file_path, job_id=0, skip_first_step=False): def __init__(self, input_dir, output_file_path, job_id=0, skip_first_step=False, is_training_mode=True):
self._input_dir = input_dir self._input_dir = input_dir
self._output_path = output_file_path self._output_path = output_file_path
self._job_id = job_id self._job_id = job_id
@ -53,6 +54,7 @@ class BaseStepTraceParser:
self._header = [] self._header = []
self._step_num = 0 self._step_num = 0
self._tag_map = {} self._tag_map = {}
self._is_training_mode = is_training_mode
@property @property
def output_file(self): def output_file(self):
@ -64,7 +66,7 @@ class BaseStepTraceParser:
"""The property of step trace info.""" """The property of step trace info."""
summary_info = {} summary_info = {}
if self._result: if self._result:
summary_info = get_summary_for_step_trace(self._result[-1], self._header) summary_info = get_summary_for_step_trace(self._result[-1], self._header, self._is_training_mode)
summary_info['total_steps'] = len(self._result) - 1 summary_info['total_steps'] = len(self._result) - 1
print('\nStep trace summary info (unit: syscnt):') print('\nStep trace summary info (unit: syscnt):')
print(summary_info) print(summary_info)
@ -321,15 +323,27 @@ class BaseStepTraceParser:
log.info("Finish add average info for step trace.") log.info("Finish add average info for step trace.")
def _save(self): def _save(self):
"""save step trace file."""
BP_POINT, TAIL, FP_DURATION = 5, -1, -2
log.info("Start to save step trace file.") log.info("Start to save step trace file.")
if not self._header: if not self._header:
return return
with open(self._output_path, 'w') as file_handle: try:
csv_writer = csv.writer(file_handle) with open(self._output_path, 'w') as file_handle:
csv_writer.writerow(self._header) csv_writer = csv.writer(file_handle)
for row_data in self._result: if not self._is_training_mode:
csv_writer.writerow(row_data) self._header[FP_DURATION] = 'fp'
os.chmod(self._output_path, stat.S_IRUSR) self._header = self._header[:BP_POINT] + self._header[BP_POINT+1:TAIL]
csv_writer.writerow(self._header)
for row_data in self._result:
if not self._is_training_mode:
row_data[FP_DURATION] += row_data[TAIL]
row_data = row_data[:BP_POINT] + row_data[BP_POINT+1:TAIL]
csv_writer.writerow(row_data)
os.chmod(self._output_path, stat.S_IRUSR)
except (IOError, OSError) as err:
log.warning('Failed to save step trace raw info. %s', err)
raise ProfilerIOException
class GpuStepTraceParser(BaseStepTraceParser): class GpuStepTraceParser(BaseStepTraceParser):
@ -356,10 +370,16 @@ class GpuStepTraceParser(BaseStepTraceParser):
log.warning(f'Failed to read {source_file}', err) log.warning(f'Failed to read {source_file}', err)
raise ProfilerIOException raise ProfilerIOException
points = { if self._is_training_mode:
'fp_start': fp_start_name, points = {
'bp_end': bp_end_name 'fp_start': fp_start_name,
} 'bp_end': bp_end_name
}
else:
points = {
'fp_start': fp_start_name,
}
try: try:
with open(output_path, 'w') as json_file: with open(output_path, 'w') as json_file:
json.dump(points, json_file) json.dump(points, json_file)
@ -456,10 +476,16 @@ class AscendStepTraceParser(BaseStepTraceParser):
Returns: Returns:
dict, parsed point info. dict, parsed point info.
""" """
points = { if self._is_training_mode:
'fp_start': point_info.get(self._fp_tag, ''), points = {
'bp_end': point_info.get(self._bp_tag, '') 'fp_start': point_info.get(self._fp_tag, ''),
} 'bp_end': point_info.get(self._bp_tag, '')
}
else:
points = {
'fp_start': point_info.get(self._fp_tag, ''),
}
try: try:
with open(output_path, 'w') as json_file: with open(output_path, 'w') as json_file:
json.dump(points, json_file) json.dump(points, json_file)

@ -151,7 +151,7 @@ class Profiler:
logger.error('Please check the Profiler object initialized after set_auto_parallel_context() ' logger.error('Please check the Profiler object initialized after set_auto_parallel_context() '
'and init(). Profiler should be initialized after these code. ') 'and init(). Profiler should be initialized after these code. ')
self._gpu_profiler.stop() self._gpu_profiler.stop()
self._generate_timeline() timeline_generator = self._generate_timeline()
# parse minddata pipeline operator and queue for GPU # parse minddata pipeline operator and queue for GPU
try: try:
@ -162,7 +162,7 @@ class Profiler:
# analyse step trace info # analyse step trace info
try: try:
self._analyse_step_trace() self._analyse_step_trace(is_training_mode_flag=timeline_generator.check_op_name('Gradients'))
except ProfilerException as err: except ProfilerException as err:
logger.warning(err.message) logger.warning(err.message)
@ -239,13 +239,14 @@ class Profiler:
os.environ['PROFILING_MODE'] = str("false") os.environ['PROFILING_MODE'] = str("false")
context.set_context(enable_profiling=False) context.set_context(enable_profiling=False)
def _analyse_step_trace(self, source_path=None, framework_parser=None): def _analyse_step_trace(self, source_path=None, framework_parser=None, is_training_mode_flag=True):
""" """
Analyse step trace data and save the result. Analyse step trace data and save the result.
Args: Args:
source_path (str): The directory that contains the step trace original data. source_path (str): The directory that contains the step trace original data.
framework_parser (FrameworkParser): The framework parse instance. framework_parser (FrameworkParser): The framework parse instance.
is_training_mode_flag (bool): Whether in training mode or not.
""" """
logger.info("Begin to parse step trace.") logger.info("Begin to parse step trace.")
# construct output path # construct output path
@ -266,19 +267,23 @@ class Profiler:
f'step_trace_profiling_{self._dev_id}.txt' f'step_trace_profiling_{self._dev_id}.txt'
) )
parser = GpuStepTraceParser(input_dir=input_file_path, parser = GpuStepTraceParser(input_dir=input_file_path,
output_file_path=step_trace_intermediate_file_path) output_file_path=step_trace_intermediate_file_path,
is_training_mode=is_training_mode_flag)
parser.parse_and_save() parser.parse_and_save()
point_info = parser.record_point_info(input_file_path, point_info_file_path) point_info = parser.record_point_info(input_file_path, point_info_file_path)
else: else:
# whether keep the first step # whether keep the first step
skip_first_step_flag = framework_parser.check_op_name(INIT_OP_NAME) skip_first_step_flag = framework_parser.check_op_name(INIT_OP_NAME)
point_info = framework_parser.point_info point_info = framework_parser.point_info
# recognize inference or training mode
is_traning_mode_flag = framework_parser.check_op_name("Gradients")
# parser the step trace files and save the result to disk # parser the step trace files and save the result to disk
source_path = validate_and_normalize_path(source_path) source_path = validate_and_normalize_path(source_path)
parser = AscendStepTraceParser(input_dir=source_path, parser = AscendStepTraceParser(input_dir=source_path,
output_file_path=step_trace_intermediate_file_path, output_file_path=step_trace_intermediate_file_path,
job_id=self._job_id_env, job_id=self._job_id_env,
skip_first_step=skip_first_step_flag) skip_first_step=skip_first_step_flag,
is_training_mode=is_traning_mode_flag)
parser.update_tag_op_type_map(point_info) parser.update_tag_op_type_map(point_info)
parser.parse_and_save() parser.parse_and_save()
point_info = parser.record_point_info(point_info, point_info_file_path) point_info = parser.record_point_info(point_info, point_info_file_path)
@ -332,6 +337,7 @@ class Profiler:
timeline_generator.init_timeline() timeline_generator.init_timeline()
timeline_generator.write_timeline(size_limit) timeline_generator.write_timeline(size_limit)
timeline_generator.write_timeline_summary() timeline_generator.write_timeline_summary()
return timeline_generator
except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err: except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
logger.warning('Fail to write timeline data: %s', err) logger.warning('Fail to write timeline data: %s', err)
raise RuntimeError('Fail to write timeline data.') raise RuntimeError('Fail to write timeline data.')

Loading…
Cancel
Save