Fix the bug of incorrect step_trace content in the inference scene

pull/9665/head
gzhcv 4 years ago
parent 7f8c0cbcf8
commit de3a653c57

@ -273,7 +273,7 @@ void DataSaver::WriteStepTrace(const std::string &saver_base_dir) {
} }
// write step trace time info into file // write step trace time info into file
uint32_t factor = 10; const uint32_t factor = 10;
std::vector<std::string> op_name_arr; std::vector<std::string> op_name_arr;
op_name_arr.push_back(step_trace_op_name.trace_fp_start); op_name_arr.push_back(step_trace_op_name.trace_fp_start);
op_name_arr.push_back(step_trace_op_name.trace_bp_end); op_name_arr.push_back(step_trace_op_name.trace_bp_end);

@ -170,7 +170,6 @@ std::string ProfilingUtils::GetGraphSecondLastKernelName(const std::vector<CNode
return second_last_kernel_name; return second_last_kernel_name;
} }
} // namespace gpu } // namespace gpu
} // namespace profiler } // namespace profiler
} // namespace mindspore } // namespace mindspore

@ -592,7 +592,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance(); auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance();
MS_EXCEPTION_IF_NULL(profiler_inst); MS_EXCEPTION_IF_NULL(profiler_inst);
if (is_first_step_map_[graph->graph_id()]) { if (profiler_inst->GetEnableFlag() && is_first_step_map_[graph->graph_id()]) {
profiler::gpu::ProfilingTraceInfo profiling_trace = profiler::gpu::ProfilingTraceInfo profiling_trace =
profiler::gpu::ProfilingUtils::GetProfilingTraceFromEnv(NOT_NULL(graph)); profiler::gpu::ProfilingUtils::GetProfilingTraceFromEnv(NOT_NULL(graph));
profiler_inst->SetStepTraceOpName(profiling_trace); profiler_inst->SetStepTraceOpName(profiling_trace);

@ -233,24 +233,34 @@ def query_step_trace_file(profiler_dir):
return None return None
def get_summary_for_step_trace(average_info, header): def get_summary_for_step_trace(average_info, header, is_training_mode=True):
"""The property of summary info.""" """The property of summary info."""
if not average_info or not header: if not average_info or not header:
return {} return {}
total_time = get_field_value(average_info, 'total', header) total_time = get_field_value(average_info, 'total', header)
iteration_interval = get_field_value(average_info, 'iteration_interval', iteration_interval = get_field_value(average_info, 'iteration_interval',
header) header)
fp_and_bp = get_field_value(average_info, 'fp_and_bp', header) summary_part = {
tail = get_field_value(average_info, 'tail', header)
summary = {
'total_time': total_time, 'total_time': total_time,
'iteration_interval': iteration_interval, 'iteration_interval': iteration_interval,
'iteration_interval_percent': calculate_percent(iteration_interval, total_time), 'iteration_interval_percent': calculate_percent(iteration_interval, total_time),
'fp_and_bp': fp_and_bp,
'fp_and_bp_percent': calculate_percent(fp_and_bp, total_time),
'tail': tail,
'tail_percent': calculate_percent(tail, total_time)
} }
if is_training_mode:
fp_and_bp = get_field_value(average_info, 'fp_and_bp', header)
tail = get_field_value(average_info, 'tail', header)
summary = {
'fp_and_bp': fp_and_bp,
'fp_and_bp_percent': calculate_percent(fp_and_bp, total_time),
'tail': tail,
'tail_percent': calculate_percent(tail, total_time)
}
else:
fp = get_field_value(average_info, 'fp', header)
summary = {
'fp': fp,
'fp_percent': calculate_percent(fp, total_time)
}
summary.update(summary_part)
return summary return summary

@ -21,7 +21,7 @@ from decimal import Decimal
from mindspore import log as logger from mindspore import log as logger
from mindspore.profiler.common.exceptions.exceptions import ProfilerIOException, \ from mindspore.profiler.common.exceptions.exceptions import ProfilerIOException, \
ProfilerFileNotFoundException, ProfilerRawFileException ProfilerFileNotFoundException, ProfilerRawFileException, ProfilerParamValueErrorException
from mindspore.profiler.common.util import query_latest_trace_time_file, to_int, to_millisecond from mindspore.profiler.common.util import query_latest_trace_time_file, to_int, to_millisecond
from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path
from mindspore.profiler.parser.container import TimelineContainer from mindspore.profiler.parser.container import TimelineContainer
@ -776,6 +776,24 @@ class GpuTimelineGenerator(BaseTimelineGenerator):
# Update timeline summary info # Update timeline summary info
self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys()) self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys())
def check_op_name(self, op_name):
"""
Check whether the operator name exists.
Args:
op_name (str): The operator name or operator name prefix.
Returns:
bool, `True` if the operator name does exist, else `False`.
"""
if not op_name:
raise ProfilerParamValueErrorException('The op_name should exist.')
for op_time_info in self._timeline_meta:
full_op_name = op_time_info['name']
if full_op_name and full_op_name.startswith(op_name):
return True
return False
class AscendTimelineGenerator(BaseTimelineGenerator): class AscendTimelineGenerator(BaseTimelineGenerator):
"""Generate ascend Timeline data from file.""" """Generate ascend Timeline data from file."""
_display_filename = 'ascend_timeline_display_{}.json' _display_filename = 'ascend_timeline_display_{}.json'

@ -42,9 +42,10 @@ class BaseStepTraceParser:
output_file_path (str): The output file path. output_file_path (str): The output file path.
job_id (int): The job id used to define the start of new step. Default: 0. job_id (int): The job id used to define the start of new step. Default: 0.
skip_first_step (bool): Whether skip the first step or not. skip_first_step (bool): Whether skip the first step or not.
is_training_mode (bool): Whether in training mode or not.
""" """
def __init__(self, input_dir, output_file_path, job_id=0, skip_first_step=False): def __init__(self, input_dir, output_file_path, job_id=0, skip_first_step=False, is_training_mode=True):
self._input_dir = input_dir self._input_dir = input_dir
self._output_path = output_file_path self._output_path = output_file_path
self._job_id = job_id self._job_id = job_id
@ -53,6 +54,7 @@ class BaseStepTraceParser:
self._header = [] self._header = []
self._step_num = 0 self._step_num = 0
self._tag_map = {} self._tag_map = {}
self._is_training_mode = is_training_mode
@property @property
def output_file(self): def output_file(self):
@ -64,7 +66,7 @@ class BaseStepTraceParser:
"""The property of step trace info.""" """The property of step trace info."""
summary_info = {} summary_info = {}
if self._result: if self._result:
summary_info = get_summary_for_step_trace(self._result[-1], self._header) summary_info = get_summary_for_step_trace(self._result[-1], self._header, self._is_training_mode)
summary_info['total_steps'] = len(self._result) - 1 summary_info['total_steps'] = len(self._result) - 1
print('\nStep trace summary info (unit: syscnt):') print('\nStep trace summary info (unit: syscnt):')
print(summary_info) print(summary_info)
@ -321,15 +323,27 @@ class BaseStepTraceParser:
log.info("Finish add average info for step trace.") log.info("Finish add average info for step trace.")
def _save(self): def _save(self):
"""save step trace file."""
BP_POINT, TAIL, FP_DURATION = 5, -1, -2
log.info("Start to save step trace file.") log.info("Start to save step trace file.")
if not self._header: if not self._header:
return return
with open(self._output_path, 'w') as file_handle: try:
csv_writer = csv.writer(file_handle) with open(self._output_path, 'w') as file_handle:
csv_writer.writerow(self._header) csv_writer = csv.writer(file_handle)
for row_data in self._result: if not self._is_training_mode:
csv_writer.writerow(row_data) self._header[FP_DURATION] = 'fp'
os.chmod(self._output_path, stat.S_IRUSR) self._header = self._header[:BP_POINT] + self._header[BP_POINT+1:TAIL]
csv_writer.writerow(self._header)
for row_data in self._result:
if not self._is_training_mode:
row_data[FP_DURATION] += row_data[TAIL]
row_data = row_data[:BP_POINT] + row_data[BP_POINT+1:TAIL]
csv_writer.writerow(row_data)
os.chmod(self._output_path, stat.S_IRUSR)
except (IOError, OSError) as err:
log.warning('Failed to save step trace raw info. %s', err)
raise ProfilerIOException
class GpuStepTraceParser(BaseStepTraceParser): class GpuStepTraceParser(BaseStepTraceParser):
@ -356,10 +370,16 @@ class GpuStepTraceParser(BaseStepTraceParser):
log.warning(f'Failed to read {source_file}', err) log.warning(f'Failed to read {source_file}', err)
raise ProfilerIOException raise ProfilerIOException
points = { if self._is_training_mode:
'fp_start': fp_start_name, points = {
'bp_end': bp_end_name 'fp_start': fp_start_name,
} 'bp_end': bp_end_name
}
else:
points = {
'fp_start': fp_start_name,
}
try: try:
with open(output_path, 'w') as json_file: with open(output_path, 'w') as json_file:
json.dump(points, json_file) json.dump(points, json_file)
@ -456,10 +476,16 @@ class AscendStepTraceParser(BaseStepTraceParser):
Returns: Returns:
dict, parsed point info. dict, parsed point info.
""" """
points = { if self._is_training_mode:
'fp_start': point_info.get(self._fp_tag, ''), points = {
'bp_end': point_info.get(self._bp_tag, '') 'fp_start': point_info.get(self._fp_tag, ''),
} 'bp_end': point_info.get(self._bp_tag, '')
}
else:
points = {
'fp_start': point_info.get(self._fp_tag, ''),
}
try: try:
with open(output_path, 'w') as json_file: with open(output_path, 'w') as json_file:
json.dump(points, json_file) json.dump(points, json_file)

@ -151,7 +151,7 @@ class Profiler:
logger.error('Please check the Profiler object initialized after set_auto_parallel_context() ' logger.error('Please check the Profiler object initialized after set_auto_parallel_context() '
'and init(). Profiler should be initialized after these code. ') 'and init(). Profiler should be initialized after these code. ')
self._gpu_profiler.stop() self._gpu_profiler.stop()
self._generate_timeline() timeline_generator = self._generate_timeline()
# parse minddata pipeline operator and queue for GPU # parse minddata pipeline operator and queue for GPU
try: try:
@ -162,7 +162,7 @@ class Profiler:
# analyse step trace info # analyse step trace info
try: try:
self._analyse_step_trace() self._analyse_step_trace(is_training_mode_flag=timeline_generator.check_op_name('Gradients'))
except ProfilerException as err: except ProfilerException as err:
logger.warning(err.message) logger.warning(err.message)
@ -239,13 +239,14 @@ class Profiler:
os.environ['PROFILING_MODE'] = str("false") os.environ['PROFILING_MODE'] = str("false")
context.set_context(enable_profiling=False) context.set_context(enable_profiling=False)
def _analyse_step_trace(self, source_path=None, framework_parser=None): def _analyse_step_trace(self, source_path=None, framework_parser=None, is_training_mode_flag=True):
""" """
Analyse step trace data and save the result. Analyse step trace data and save the result.
Args: Args:
source_path (str): The directory that contains the step trace original data. source_path (str): The directory that contains the step trace original data.
framework_parser (FrameworkParser): The framework parse instance. framework_parser (FrameworkParser): The framework parse instance.
is_training_mode_flag (bool): Whether in training mode or not.
""" """
logger.info("Begin to parse step trace.") logger.info("Begin to parse step trace.")
# construct output path # construct output path
@ -266,19 +267,23 @@ class Profiler:
f'step_trace_profiling_{self._dev_id}.txt' f'step_trace_profiling_{self._dev_id}.txt'
) )
parser = GpuStepTraceParser(input_dir=input_file_path, parser = GpuStepTraceParser(input_dir=input_file_path,
output_file_path=step_trace_intermediate_file_path) output_file_path=step_trace_intermediate_file_path,
is_training_mode=is_training_mode_flag)
parser.parse_and_save() parser.parse_and_save()
point_info = parser.record_point_info(input_file_path, point_info_file_path) point_info = parser.record_point_info(input_file_path, point_info_file_path)
else: else:
# whether keep the first step # whether keep the first step
skip_first_step_flag = framework_parser.check_op_name(INIT_OP_NAME) skip_first_step_flag = framework_parser.check_op_name(INIT_OP_NAME)
point_info = framework_parser.point_info point_info = framework_parser.point_info
# recognize inference or training mode
is_traning_mode_flag = framework_parser.check_op_name("Gradients")
# parser the step trace files and save the result to disk # parser the step trace files and save the result to disk
source_path = validate_and_normalize_path(source_path) source_path = validate_and_normalize_path(source_path)
parser = AscendStepTraceParser(input_dir=source_path, parser = AscendStepTraceParser(input_dir=source_path,
output_file_path=step_trace_intermediate_file_path, output_file_path=step_trace_intermediate_file_path,
job_id=self._job_id_env, job_id=self._job_id_env,
skip_first_step=skip_first_step_flag) skip_first_step=skip_first_step_flag,
is_training_mode=is_traning_mode_flag)
parser.update_tag_op_type_map(point_info) parser.update_tag_op_type_map(point_info)
parser.parse_and_save() parser.parse_and_save()
point_info = parser.record_point_info(point_info, point_info_file_path) point_info = parser.record_point_info(point_info, point_info_file_path)
@ -332,6 +337,7 @@ class Profiler:
timeline_generator.init_timeline() timeline_generator.init_timeline()
timeline_generator.write_timeline(size_limit) timeline_generator.write_timeline(size_limit)
timeline_generator.write_timeline_summary() timeline_generator.write_timeline_summary()
return timeline_generator
except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err: except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
logger.warning('Fail to write timeline data: %s', err) logger.warning('Fail to write timeline data: %s', err)
raise RuntimeError('Fail to write timeline data.') raise RuntimeError('Fail to write timeline data.')

Loading…
Cancel
Save