diff --git a/mindspore/context.py b/mindspore/context.py index d28fa91983..1d68375f93 100644 --- a/mindspore/context.py +++ b/mindspore/context.py @@ -214,11 +214,8 @@ class _Context: self.set_param(ms_ctx_param.max_call_depth, max_call_depth) def set_profiling_options(self, option): - options = ["training_trace", "task_trace", - "task_trace:training_trace", "training_trace:task_trace", "op_trace"] - if option not in options: - raise ValueError("Profiling options must be in 'training_trace' 'task_trace' " - "'task_trace:training_trace' 'training_trace:task_trace' or 'op_trace'.") + if not isinstance(option, str): + raise TypeError("The parameter option must be str.") self.set_param(ms_ctx_param.profiling_options, option) def set_variable_memory_max_size(self, variable_memory_max_size): diff --git a/mindspore/profiler/parser/framework_parser.py b/mindspore/profiler/parser/framework_parser.py index 7fe0569461..33b9ee6e89 100644 --- a/mindspore/profiler/parser/framework_parser.py +++ b/mindspore/profiler/parser/framework_parser.py @@ -174,7 +174,6 @@ class FrameworkParser: device_id (str): The device ID. output_path (str): The directory of the parsed file. Default: `./`. """ - _raw_data_dir = '/var/log/npu/profiling' _regex_framework = r'Framework\.(?P.+)\.(?P\d).+' _regex_framework_in_data = r'Framework\.(?P.+)\.' \ r'(?P\d)\.(?P[a-zA-Z0-9]+).+' @@ -193,6 +192,7 @@ class FrameworkParser: _task_id_threshold = 25000 def __init__(self, profiling_id, device_id, output_path='./'): + self._raw_data_dir = output_path self._profiling_path = self._get_raw_profiling_path(profiling_id) self._backend_type = None self._framework_path = {'graph': [], 'task': [], 'point': []} diff --git a/mindspore/profiler/profiling.py b/mindspore/profiler/profiling.py index 2770bafcc7..f899ba0b76 100644 --- a/mindspore/profiler/profiling.py +++ b/mindspore/profiler/profiling.py @@ -16,6 +16,7 @@ import os import stat import time +import json from enum import Enum from mindspore import log as logger, context @@ -37,7 +38,6 @@ from mindspore.profiler.parser.optime_parser import OPComputeTimeParser from mindspore.profiler.parser.step_trace_parser import GpuStepTraceParser, AscendStepTraceParser from mindspore.nn.cell import Cell -PROFILING_LOG_BASE_PATH = "/var/log/npu/profiling" INIT_OP_NAME = 'Default/InitDataSetQueue' class ProfileOption(Enum): @@ -72,7 +72,6 @@ class Profiler: >>> profiler.analyse() """ - _base_profiling_container_path = "/var/log/npu/profiling/container" _hwts_output_filename_target = "output_format_data_hwts_" _opcompute_output_filename_target = "output_op_compute_time_" _aicpu_op_output_filename_target = "output_data_preprocess_aicpu_" @@ -80,9 +79,11 @@ class Profiler: def __init__(self, **kwargs): # get device_id and device_target self._get_devid_and_devtarget() - output_path = kwargs.pop("output_path", "./data") + format_time = int(time.time()) + output_path = kwargs.pop("output_path", f"data-{format_time}") self._output_path = validate_and_normalize_path(output_path) - self._output_path = os.path.join(self._output_path, "profiler") + self._output_path = os.path.join(self._output_path, f"profiler-{format_time}") + self._base_profiling_container_path = os.path.join(self._output_path, "container") if not os.path.exists(self._output_path): os.makedirs(self._output_path, exist_ok=True) os.chmod(self._output_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR) @@ -113,10 +114,25 @@ class Profiler: logger.warning("There are invalid params which don't work.") os.environ['DEVICE_ID'] = self._dev_id - os.environ['AICPU_PROFILING_MODE'] = 'true' - + fp_point = os.environ.get("PROFILING_FP_START", "") + bp_point = os.environ.get("PROFILING_BP_END", "") + + profiling_options = { + "result_path": self._output_path, + "fp_point": fp_point, + "bp_point": bp_point, + "training_trace": "on", + "task_trace": "on", + "ai_core_metrics": "PipeUtilization", + "aicpu_trace": "on" + } + + profiling_options = json.dumps(profiling_options) + # Characters longer than 2048 are ignored, resulting in profiling option resolution errors + if len(profiling_options) > 2048: + raise ValueError("The parameter length exceeds the limit (2048)") # use context interface to open profiling, for the new mindspore version(after 2020.5.21) - context.set_context(enable_profiling=True, profiling_options="training_trace:task_trace") + context.set_context(enable_profiling=True, profiling_options=profiling_options) self._container_path = os.path.join(self._base_profiling_container_path, self._dev_id) data_path = os.path.join(self._container_path, "data") @@ -174,7 +190,7 @@ class Profiler: job_id = self._get_profiling_job_id() logger.info("Profiling: job id is %s ", job_id) - source_path = os.path.join(PROFILING_LOG_BASE_PATH, job_id) + source_path = os.path.join(self._output_path, job_id) # parse hwts.log.data.45.dev file, and get task profiling data hwts_output_filename = self._hwts_output_filename_target + self._dev_id + ".txt" hwts_output_filename = os.path.join(self._output_path, hwts_output_filename) @@ -347,12 +363,12 @@ class Profiler: return self._profiling_job_id job_id = "" - cmd = "ls -t " + PROFILING_LOG_BASE_PATH + "|grep JOB|awk '{print $1}'" + cmd = "ls -t " + self._output_path + "|grep JOB|awk '{print $1}'" r = os.popen(cmd) profiling_job_dirs = r.readlines() r.close() for item in profiling_job_dirs: - path = os.path.join(PROFILING_LOG_BASE_PATH, item.strip()) + path = os.path.join(self._output_path, item.strip()) log_file = get_file_names(path, "host_start.log") if not log_file: logger.error("Profiling: job path %s, host_start.log not exist.", path) diff --git a/tests/st/profiler/test_profiler.py b/tests/st/profiler/test_profiler.py index 9744033736..d75068014e 100644 --- a/tests/st/profiler/test_profiler.py +++ b/tests/st/profiler/test_profiler.py @@ -128,7 +128,6 @@ def cleanup(): class TestProfiler: device_id = int(os.getenv('DEVICE_ID')) if os.getenv('DEVICE_ID') else 0 mnist_path = '/home/workspace/mindspore_dataset/mnist' - profiler_path = os.path.join(os.getcwd(), 'data/profiler/') @classmethod def teardown_class(cls): @@ -140,7 +139,9 @@ class TestProfiler: @pytest.mark.env_onecard def test_gpu_profiler(self): context.set_context(mode=context.GRAPH_MODE, device_target="GPU") - profiler = Profiler() + profiler = Profiler(output_path='data') + profiler_name = os.listdir(os.path.join(os.getcwd(), 'data'))[0] + self.profiler_path = os.path.join(os.getcwd(), f'data/{profiler_name}/') ds_train = create_dataset(os.path.join(self.mnist_path, "train")) if ds_train.get_dataset_size() == 0: raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") diff --git a/tests/ut/python/profiler/parser/test_framework_parser.py b/tests/ut/python/profiler/parser/test_framework_parser.py index d37bd19dd9..8c0dd110b3 100644 --- a/tests/ut/python/profiler/parser/test_framework_parser.py +++ b/tests/ut/python/profiler/parser/test_framework_parser.py @@ -49,13 +49,15 @@ class TestFrameworkParser: """Test the class of `FrameworkParser`.""" def setup_method(self): """Initialization before test case execution.""" - with mock.patch.object(FrameworkParser, '_raw_data_dir', RAW_DATA_BASE): - self._output_path_1 = tempfile.mkdtemp(prefix='test_framework_parser_') - self._parser_1 = FrameworkParser('JOB1', '0', self._output_path_1) - self._output_path_2 = tempfile.mkdtemp(prefix='test_framework_parser_') - self._parser_2 = FrameworkParser('JOB2', '0', self._output_path_2) - self._output_path_4 = tempfile.mkdtemp(prefix='test_framework_parser_') - self._parser_4 = FrameworkParser('JOB4', '0', self._output_path_4) + self._output_path_1 = tempfile.NamedTemporaryFile(prefix='test_framework_parser_').name + shutil.copytree(RAW_DATA_BASE, self._output_path_1) + self._parser_1 = FrameworkParser('JOB1', '0', self._output_path_1) + self._output_path_2 = tempfile.NamedTemporaryFile(prefix='test_framework_parser_').name + shutil.copytree(RAW_DATA_BASE, self._output_path_2) + self._parser_2 = FrameworkParser('JOB2', '0', self._output_path_2) + self._output_path_4 = tempfile.NamedTemporaryFile(prefix='test_framework_parser_').name + shutil.copytree(RAW_DATA_BASE, self._output_path_4) + self._parser_4 = FrameworkParser('JOB4', '0', self._output_path_4) def teardown_method(self) -> None: """Clear up after test case execution.""" diff --git a/tests/ut/python/pynative_mode/test_context.py b/tests/ut/python/pynative_mode/test_context.py index 53d6e97f78..68ce4223b5 100644 --- a/tests/ut/python/pynative_mode/test_context.py +++ b/tests/ut/python/pynative_mode/test_context.py @@ -15,6 +15,7 @@ """ test_context """ import os import shutil +import json import pytest from mindspore import context @@ -94,14 +95,18 @@ def test_profiling_options(): context.set_context(profiling_options=True) with pytest.raises(TypeError): context.set_context(profiling_options=1) - with pytest.raises(ValueError): - context.set_context(profiling_options="training_") - with pytest.raises(ValueError): - context.set_context(profiling_options="training_trace:op_trace") - context.set_context(profiling_options="training_trace") - assert context.get_context("profiling_options") == "training_trace" - context.set_context(profiling_options="training_trace:task_trace") - assert context.get_context("profiling_options") == "training_trace:task_trace" + profiling_options = { + "result_path": "", + "fp_point": "", + "bp_point": "", + "training_trace": "on", + "task_trace": "on", + "ai_core_metrics": "PipeUtilization", + "aicpu_trace": "on" + } + profiling_options = json.dumps(profiling_options) + context.set_context(profiling_options=profiling_options) + assert context.get_context("profiling_options") == profiling_options def test_variable_memory_max_size():