Fix the bug where the Profiler cannot find the corresponding job dir in the Ascend scenario

pull/12405/head
gzhcv 4 years ago
parent d1d03a8eff
commit 208b3fae18

@ -22,6 +22,7 @@ from enum import Enum
from mindspore import log as logger, context
from mindspore.communication.management import release, get_rank
import mindspore._c_expression as c_expression
from mindspore.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException, \
ProfilerIOException, ProfilerException, ProfilerRawFileException
from mindspore.profiler.common.util import get_file_names, fwrite_format
@ -86,18 +87,18 @@ class Profiler:
os.environ['MINDDATA_PROFILING_DIR'] = self._output_path
if self._device_target:
from mindspore._c_expression import CPUProfiler
CPUProfiler = c_expression.CPUProfiler
self._cpu_profiler = CPUProfiler.get_instance()
self._cpu_profiler.init(self._output_path)
self._cpu_profiler.step_profiling_enable(True)
if self._device_target and self._device_target == "GPU":
from mindspore._c_expression import GPUProfiler
GPUProfiler = c_expression.GPUProfiler
self._gpu_profiler = GPUProfiler.get_instance()
self._gpu_profiler.init(self._output_path)
self._gpu_profiler.step_profiling_enable(True)
if context.get_auto_parallel_context('device_num') > 1:
self._dev_id = get_rank()
os.environ['DEVICE_ID'] = str(self._dev_id)
self._dev_id = str(get_rank())
os.environ['DEVICE_ID'] = self._dev_id
if kwargs:
logger.warning("Params not be supported yet on GPU.")
@ -253,8 +254,8 @@ class Profiler:
def _gpu_analyse(self):
"""Collect and analyse gpu performance data"""
if context.get_auto_parallel_context('device_num') > 1 and self._dev_id != get_rank():
self._dev_id = get_rank()
if context.get_auto_parallel_context('device_num') > 1 and self._dev_id != str(get_rank()):
self._dev_id = str(get_rank())
logger.error('Please check the Profiler object initialized after set_auto_parallel_context() '
'and init(). Profiler should be initialized after these code. ')
self._gpu_profiler.stop()
@ -403,6 +404,7 @@ class Profiler:
"""
job_id = ""
for item in os.listdir(self._output_path):
if item.startswith('JOB'):
path = os.path.join(self._output_path, item)
@ -410,25 +412,23 @@ class Profiler:
log_file = get_file_names(path, "host_start.log")
if not log_file:
logger.error("Profiling: job path %s, host_start.log not exist.", path)
continue
training_device_id = log_file[0].split('.')[-1]
if self._dev_id == training_device_id:
log_file = os.path.join(path, log_file[0])
job_start_time = self._parse_host_start_log(log_file)
if not job_start_time:
logger.error("Profiling: job path %s, fail to get job start info.", path)
break
job_id = item
if self._start_time > int(job_start_time):
logger.info("Profiling: job path %s, start_time %s, training start_time %d.",
path, job_start_time, self._start_time)
break
log_file = os.path.join(path, log_file[0])
item_dict = self._parse_host_start_log(log_file)
if not item_dict:
logger.error("Profiling: job path %s, fail to get job start info.", path)
break
job_id = item
if self._dev_id != item_dict["device_id"]:
else:
logger.info("Profiling: job path %s, dev id %s, training device id %s.",
path, item_dict["device_id"], self._dev_id)
if self._start_time > int(item_dict["start_time"]):
logger.info("Profiling: job path %s, start_time %s, training start_time %d.",
path, item_dict["start_time"], self._start_time)
break
path, training_device_id, self._dev_id)
if not job_id:
msg = "Fail to get profiling job, please check whether job dir was generated"
@ -438,23 +438,23 @@ class Profiler:
def _parse_host_start_log(self, input_file):
"""
Parse host start log file, get the device id and start time of the job.
Parse host start log file, get the start time of the job.
Args:
input_file (str): The file path of the host start log file.
Returns:
dict, job start time and device id.
str, job start time.
"""
item_dict = {}
for line in open(input_file):
if "Device" in line:
item_dict["device_id"] = line[7:len(line)-2]
elif "clock_realtime" in line:
item_dict["start_time"] = line[16:len(line)-3]
job_start_time = ""
with open(input_file) as f:
for line in f.readlines():
if "clock_realtime" in line:
# 16 means the first digit of the timestamp, len(line)-3 means the last.
job_start_time = line[16:len(line)-3]
return item_dict
return job_start_time
def _analyser_op_info(self):
"""Analyse the operator information."""

Loading…
Cancel
Save