Fix the bug where the Profiler timestamp varies between processes in the multi-card scene

pull/11539/head
gzhcv 4 years ago
parent 27dc6e19a3
commit e823152929

@@ -80,7 +80,9 @@ class Profiler:
    def __init__(self, **kwargs):
        # get device_id and device_target
        self._get_devid_and_devtarget()
-       format_time = int(time.time())
+       # to avoid getting different timestamps between processes in multi-card training,
+       # set the timestamp to a value divisible by 3
+       format_time = int(time.time() - time.time() % 3)
        output_path = kwargs.pop("output_path", f"data-{format_time}")
        self._output_path = validate_and_normalize_path(output_path)
        self._output_path = os.path.join(self._output_path, f"profiler-{format_time}")
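Editor's note: the change above is easiest to see with a small standalone sketch (plain Python, no MindSpore needed; profiler_output_path is a hypothetical helper mirroring the two lines of the diff). Flooring each process's start time to a multiple of 3 makes every rank that starts inside the same 3-second window compute an identical output directory; ranks that happen to straddle a 3-second boundary would still disagree, so the fix assumes all processes launch within one window.

import os

def profiler_output_path(start_time: float) -> str:
    # Same arithmetic as the diff: floor the timestamp to a multiple of 3.
    format_time = int(start_time - start_time % 3)
    return os.path.join(f"data-{format_time}", f"profiler-{format_time}")

# Two ranks starting ~2.5 s apart inside one 3-second bucket agree:
print(profiler_output_path(1609999998.3))  # data-1609999998/profiler-1609999998
print(profiler_output_path(1610000000.8))  # data-1609999998/profiler-1609999998

# A rank just past the bucket boundary would still get a different path:
print(profiler_output_path(1610000001.1))  # data-1610000001/profiler-1610000001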
@@ -171,29 +173,13 @@ class Profiler:
        >>> profiler.analyse()
        """
        if self._device_target and self._device_target == "GPU":
-           if context.get_auto_parallel_context('device_num') > 1 and self._dev_id != get_rank():
-               self._dev_id = get_rank()
-               logger.error('Please check the Profiler object initialized after set_auto_parallel_context() '
-                            'and init(). Profiler should be initialized after these code. ')
-           self._gpu_profiler.stop()
-           timeline_generator = self._generate_timeline()
-           # parse minddata pipeline operator and queue for GPU
-           try:
-               pipeline_parser = MinddataPipelineParser(self._output_path, self._dev_id, self._output_path)
-               pipeline_parser.parse()
-           except ProfilerException as err:
-               logger.warning(err.message)
-           # analyse step trace info
-           try:
-               self._analyse_step_trace(is_training_mode_flag=timeline_generator.check_op_name('Gradients'))
-           except ProfilerException as err:
-               logger.warning(err.message)
-           os.environ['PROFILING_MODE'] = str("false")
+           self._gpu_analyse()
        elif self._device_target and self._device_target == "Ascend":
            self._ascend_analyse()

    def _ascend_analyse(self):
        """Collect and analyse ascend performance data"""
        release()
        job_id = self._get_profiling_job_id()
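Editor's note: the get_rank() check above (now moved into _gpu_analyse()) guards against creating the Profiler before the distributed context exists. A minimal usage sketch of the intended order follows; the import paths and context calls reflect the MindSpore 1.x API as I understand it, so treat the exact module layout as an assumption rather than part of this commit.

from mindspore import context
from mindspore.communication.management import init
from mindspore.profiler import Profiler

# 1. Configure the device and the parallel context first.
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
init()                                          # set up the communication group
context.set_auto_parallel_context(device_num=8)

# 2. Only then create the Profiler, so get_rank() already returns this
#    process's real rank by the time analyse() runs.
profiler = Profiler(output_path="./profiler_data")

# ... build and train the network ...

profiler.analyse()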
@@ -206,7 +192,7 @@ class Profiler:
        source_path = validate_and_normalize_path(source_path)
        hwts_output_filename = validate_and_normalize_path(hwts_output_filename)
        hwtslog_parser = HWTSLogParser(source_path, hwts_output_filename)
-       _ = hwtslog_parser.execute()
+       hwtslog_parser.execute()

        # parse Framework file, and get the relation of op and tasks
        framework_parser = FrameworkParser(job_id, self._dev_id, self._output_path)
@@ -271,6 +257,30 @@ class Profiler:
        os.environ['PROFILING_MODE'] = str("false")
        context.set_context(enable_profiling=False)

+   def _gpu_analyse(self):
+       """Collect and analyse gpu performance data"""
+       if context.get_auto_parallel_context('device_num') > 1 and self._dev_id != get_rank():
+           self._dev_id = get_rank()
+           logger.error('Please check the Profiler object initialized after set_auto_parallel_context() '
+                        'and init(). Profiler should be initialized after these code. ')
+
+       self._gpu_profiler.stop()
+       timeline_generator = self._generate_timeline()
+
+       # parse minddata pipeline operator and queue for GPU
+       try:
+           pipeline_parser = MinddataPipelineParser(self._output_path, self._dev_id, self._output_path)
+           pipeline_parser.parse()
+       except ProfilerException as err:
+           logger.warning(err.message)
+
+       # analyse step trace info
+       try:
+           self._analyse_step_trace(is_training_mode_flag=timeline_generator.check_op_name('Gradients'))
+       except ProfilerException as err:
+           logger.warning(err.message)
+
+       os.environ['PROFILING_MODE'] = str("false")
+
    def _analyse_step_trace(self, source_path=None, framework_parser=None, is_training_mode_flag=True):
        """
        Analyse step trace data and save the result.

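Editor's note: the new _gpu_analyse() decides between training and inference mode by scanning the generated timeline for the 'Gradients' prefix that backward-pass operators carry. A toy stand-in for that check (check_op_name here is a hypothetical standalone function mirroring the TimelineGenerator method, and the op names are invented):

def check_op_name(op_names, target='Gradients'):
    """Return True if any recorded op name contains the target string."""
    return any(target in name for name in op_names)

inference_ops = ['Default/network/Conv2D-op1', 'Default/network/ReLU-op2']
training_ops = inference_ops + ['Gradients/Default/network/Conv2D-op1_grad']

print(check_op_name(inference_ops))  # False -> step trace parsed in inference mode
print(check_op_name(training_ops))   # True  -> step trace parsed in training mode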