|
|
|
@ -1,4 +1,4 @@
|
|
|
|
|
# Copyright 2020 Huawei Technologies Co., Ltd
|
|
|
|
|
# Copyright 2020-2021 Huawei Technologies Co., Ltd
|
|
|
|
|
#
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
@ -83,7 +83,6 @@ class Profiler:
|
|
|
|
|
output_path = kwargs.pop("output_path", f"data-{format_time}")
|
|
|
|
|
self._output_path = validate_and_normalize_path(output_path)
|
|
|
|
|
self._output_path = os.path.join(self._output_path, f"profiler-{format_time}")
|
|
|
|
|
self._base_profiling_container_path = os.path.join(self._output_path, "container")
|
|
|
|
|
if not os.path.exists(self._output_path):
|
|
|
|
|
os.makedirs(self._output_path, exist_ok=True)
|
|
|
|
|
os.chmod(self._output_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
|
|
|
|
@ -109,7 +108,14 @@ class Profiler:
|
|
|
|
|
optypes_not_deal = kwargs.pop("optypes_not_deal", "Variable")
|
|
|
|
|
if not isinstance(optypes_not_deal, str):
|
|
|
|
|
raise TypeError("The parameter optypes_not_deal must be str.")
|
|
|
|
|
job_id = kwargs.pop("ascend_job_id", "")
|
|
|
|
|
job_dir = kwargs.pop("ascend_job_id", "")
|
|
|
|
|
if job_dir:
|
|
|
|
|
job_dir = validate_and_normalize_path(job_dir)
|
|
|
|
|
if not os.path.exists(job_dir):
|
|
|
|
|
msg = f"Invalid ascend_job_id: {job_dir}, Please pass the absolute path of the JOB dir"
|
|
|
|
|
logger.error(msg)
|
|
|
|
|
raise ValueError(msg)
|
|
|
|
|
self._output_path, _ = os.path.split(job_dir)
|
|
|
|
|
if kwargs:
|
|
|
|
|
logger.warning("There are invalid params which don't work.")
|
|
|
|
|
|
|
|
|
@ -130,18 +136,19 @@ class Profiler:
|
|
|
|
|
profiling_options = json.dumps(profiling_options)
|
|
|
|
|
# Characters beyond the first 2048 are ignored, which results in profiling option parsing errors
|
|
|
|
|
if len(profiling_options) > 2048:
|
|
|
|
|
raise ValueError("The parameter length exceeds the limit (2048), please input valid parameters.")
|
|
|
|
|
msg = "The parameter length exceeds the limit (2048), please input valid parameters."
|
|
|
|
|
logger.error(msg)
|
|
|
|
|
raise ValueError(msg)
|
|
|
|
|
# Use the context interface to enable profiling, for newer MindSpore versions (after 2020.5.21)
|
|
|
|
|
context.set_context(enable_profiling=True, profiling_options=profiling_options)
|
|
|
|
|
|
|
|
|
|
self._container_path = os.path.join(self._base_profiling_container_path, self._dev_id)
|
|
|
|
|
data_path = os.path.join(self._container_path, "data")
|
|
|
|
|
base_profiling_container_path = os.path.join(self._output_path, "container")
|
|
|
|
|
container_path = os.path.join(base_profiling_container_path, self._dev_id)
|
|
|
|
|
data_path = os.path.join(container_path, "data")
|
|
|
|
|
data_path = validate_and_normalize_path(data_path)
|
|
|
|
|
if not os.path.exists(data_path):
|
|
|
|
|
os.makedirs(data_path, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
self._filt_optype_names = optypes_not_deal.split(",") if optypes_not_deal else []
|
|
|
|
|
self._profiling_job_id = job_id
|
|
|
|
|
# add job id env through user input later
|
|
|
|
|
self._job_id_env = 0
|
|
|
|
|
self._start_time = int(time.time() * 10000000)
|
|
|
|
@ -362,27 +369,27 @@ class Profiler:
|
|
|
|
|
"""Get profiling job id, which was generated by ada service.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
str: profiling jon id.
|
|
|
|
|
str, profiling job id.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
if self._profiling_job_id:
|
|
|
|
|
return self._profiling_job_id
|
|
|
|
|
|
|
|
|
|
job_id = ""
|
|
|
|
|
for item in os.listdir(self._output_path):
|
|
|
|
|
if item.startswith('JOB'):
|
|
|
|
|
path = os.path.join(self._output_path, item)
|
|
|
|
|
job_id = item
|
|
|
|
|
|
|
|
|
|
log_file = get_file_names(path, "host_start.log")
|
|
|
|
|
if not log_file:
|
|
|
|
|
logger.error("Profiling: job path %s, host_start.log not exist.", path)
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
log_file = os.path.join(path, log_file[0])
|
|
|
|
|
item_dict = self._parse_host_start_log(log_file)
|
|
|
|
|
|
|
|
|
|
if not item_dict:
|
|
|
|
|
logger.error("Profiling: job path %s, fail to get job start info.", path)
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
job_id = item
|
|
|
|
|
|
|
|
|
|
if self._dev_id != item_dict["device_id"]:
|
|
|
|
|
logger.info("Profiling: job path %s, dev id %s, training device id %s.",
|
|
|
|
@ -391,7 +398,6 @@ class Profiler:
|
|
|
|
|
if self._start_time > int(item_dict["start_time"]):
|
|
|
|
|
logger.info("Profiling: job path %s, start_time %s, training start_time %d.",
|
|
|
|
|
path, item_dict["start_time"], self._start_time)
|
|
|
|
|
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if not job_id:
|
|
|
|
|