From 208b3fae1883af3ee29be1a199723dd91249b257 Mon Sep 17 00:00:00 2001 From: gzhcv Date: Wed, 10 Feb 2021 16:44:17 +0800 Subject: [PATCH] Fix the bug of Profiler cannot find corresponding job dir in Ascend scene --- mindspore/profiler/profiling.py | 64 ++++++++++++++++----------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/mindspore/profiler/profiling.py b/mindspore/profiler/profiling.py index 6353d734c5..98b180eaf0 100644 --- a/mindspore/profiler/profiling.py +++ b/mindspore/profiler/profiling.py @@ -22,6 +22,7 @@ from enum import Enum from mindspore import log as logger, context from mindspore.communication.management import release, get_rank +import mindspore._c_expression as c_expression from mindspore.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException, \ ProfilerIOException, ProfilerException, ProfilerRawFileException from mindspore.profiler.common.util import get_file_names, fwrite_format @@ -86,18 +87,18 @@ class Profiler: os.environ['MINDDATA_PROFILING_DIR'] = self._output_path if self._device_target: - from mindspore._c_expression import CPUProfiler + CPUProfiler = c_expression.CPUProfiler self._cpu_profiler = CPUProfiler.get_instance() self._cpu_profiler.init(self._output_path) self._cpu_profiler.step_profiling_enable(True) if self._device_target and self._device_target == "GPU": - from mindspore._c_expression import GPUProfiler + GPUProfiler = c_expression.GPUProfiler self._gpu_profiler = GPUProfiler.get_instance() self._gpu_profiler.init(self._output_path) self._gpu_profiler.step_profiling_enable(True) if context.get_auto_parallel_context('device_num') > 1: - self._dev_id = get_rank() - os.environ['DEVICE_ID'] = str(self._dev_id) + self._dev_id = str(get_rank()) + os.environ['DEVICE_ID'] = self._dev_id if kwargs: logger.warning("Params not be supported yet on GPU.") @@ -253,8 +254,8 @@ class Profiler: def _gpu_analyse(self): """Collect and analyse gpu performance data""" - if context.get_auto_parallel_context('device_num') > 1 and self._dev_id != get_rank(): - self._dev_id = get_rank() + if context.get_auto_parallel_context('device_num') > 1 and self._dev_id != str(get_rank()): + self._dev_id = str(get_rank()) logger.error('Please check the Profiler object initialized after set_auto_parallel_context() ' 'and init(). Profiler should be initialized after these code. ') self._gpu_profiler.stop() @@ -403,6 +404,7 @@ class Profiler: """ job_id = "" + for item in os.listdir(self._output_path): if item.startswith('JOB'): path = os.path.join(self._output_path, item) @@ -410,25 +412,23 @@ class Profiler: log_file = get_file_names(path, "host_start.log") if not log_file: logger.error("Profiling: job path %s, host_start.log not exist.", path) + continue + + training_device_id = log_file[0].split('.')[-1] + if self._dev_id == training_device_id: + log_file = os.path.join(path, log_file[0]) + job_start_time = self._parse_host_start_log(log_file) + if not job_start_time: + logger.error("Profiling: job path %s, fail to get job start info.", path) + break + job_id = item + if self._start_time > int(job_start_time): + logger.info("Profiling: job path %s, start_time %s, training start_time %d.", + path, job_start_time, self._start_time) break - - log_file = os.path.join(path, log_file[0]) - item_dict = self._parse_host_start_log(log_file) - - if not item_dict: - logger.error("Profiling: job path %s, fail to get job start info.", path) - break - - job_id = item - - if self._dev_id != item_dict["device_id"]: + else: logger.info("Profiling: job path %s, dev id %s, training device id %s.", - path, item_dict["device_id"], self._dev_id) - - if self._start_time > int(item_dict["start_time"]): - logger.info("Profiling: job path %s, start_time %s, training start_time %d.", - path, item_dict["start_time"], self._start_time) - break + path, training_device_id, self._dev_id) if not job_id: msg = "Fail to get profiling job, please check whether job dir was generated" @@ -438,23 +438,23 @@ class Profiler: def _parse_host_start_log(self, input_file): """ - Parse host start log file, get the device id and start time of the job. + Parse host start log file, get the start time of the job. Args: input_file (str): The file path of the host start log file. Returns: - dict, job start time and device id. + str, job start time. """ - item_dict = {} - for line in open(input_file): - if "Device" in line: - item_dict["device_id"] = line[7:len(line)-2] - elif "clock_realtime" in line: - item_dict["start_time"] = line[16:len(line)-3] + job_start_time = "" + with open(input_file) as f: + for line in f.readlines(): + if "clock_realtime" in line: + # 16 means the first digit of the timestamp, len(line)-3 means the last. + job_start_time = line[16:len(line)-3] - return item_dict + return job_start_time def _analyser_op_info(self): """Analyse the operator information."""