From b6ef88c168ee73910257ab0ac46b025585afe756 Mon Sep 17 00:00:00 2001
From: ms_yan
Date: Wed, 24 Mar 2021 13:38:10 +0800
Subject: [PATCH] add stop_dataset_profiler option

---
 .../python/bindings/dataset/core/bindings.cc |  2 ++
 .../minddata/dataset/core/config_manager.cc  |  6 ++++++
 .../minddata/dataset/core/config_manager.h   | 19 +++++++++++++++++++
 .../dataset/engine/perf/cpu_sampling.cc      | 10 +++++-----
 .../minddata/dataset/engine/perf/monitor.cc  |  6 +++++-
 mindspore/dataset/core/config.py             | 16 ++++++++++++++++
 mindspore/profiler/profiling.py              |  2 ++
 7 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/core/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/core/bindings.cc
index cf47634bfb..ca06deb0ee 100644
--- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/core/bindings.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/core/bindings.cc
@@ -50,6 +50,8 @@ PYBIND_REGISTER(ConfigManager, 0, ([](const py::module *m) {
                     .def("set_auto_worker_config", &ConfigManager::set_auto_worker_config_)
                     .def("set_callback_timeout", &ConfigManager::set_callback_timeout)
                     .def("set_monitor_sampling_interval", &ConfigManager::set_monitor_sampling_interval)
+                    .def("stop_dataset_profiler", &ConfigManager::stop_dataset_profiler)
+                    .def("get_profiler_file_status", &ConfigManager::get_profiler_file_status)
                     .def("set_num_parallel_workers", &ConfigManager::set_num_parallel_workers)
                     .def("set_op_connector_size", &ConfigManager::set_op_connector_size)
                     .def("set_rows_per_buffer", &ConfigManager::set_rows_per_buffer)
diff --git a/mindspore/ccsrc/minddata/dataset/core/config_manager.cc b/mindspore/ccsrc/minddata/dataset/core/config_manager.cc
index 8073030262..87dd6e1345 100644
--- a/mindspore/ccsrc/minddata/dataset/core/config_manager.cc
+++ b/mindspore/ccsrc/minddata/dataset/core/config_manager.cc
@@ -39,6 +39,8 @@ ConfigManager::ConfigManager()
       seed_(kCfgDefaultSeed),
       numa_enable_(false),
       monitor_sampling_interval_(kCfgMonitorSamplingInterval),
+      stop_profiler_(false),
+      file_ready_(true),
       callback_timout_(kCfgCallbackTimeout),
       cache_host_(kCfgDefaultCacheHost),
       cache_port_(kCfgDefaultCachePort),
@@ -139,6 +141,10 @@ void ConfigManager::set_seed(uint32_t seed) { seed_ = seed; }
 
 void ConfigManager::set_monitor_sampling_interval(uint32_t interval) { monitor_sampling_interval_ = interval; }
 
+void ConfigManager::stop_dataset_profiler(bool stop_profiler) { stop_profiler_ = stop_profiler; }
+
+void ConfigManager::set_profiler_file_status(bool file_ready) { file_ready_ = file_ready; }
+
 void ConfigManager::set_callback_timeout(uint32_t timeout) { callback_timout_ = timeout; }
 
 void ConfigManager::set_cache_host(std::string cache_host) { cache_host_ = std::move(cache_host); }
diff --git a/mindspore/ccsrc/minddata/dataset/core/config_manager.h b/mindspore/ccsrc/minddata/dataset/core/config_manager.h
index 22d0d58875..21d2f06aa4 100644
--- a/mindspore/ccsrc/minddata/dataset/core/config_manager.h
+++ b/mindspore/ccsrc/minddata/dataset/core/config_manager.h
@@ -16,6 +16,7 @@
 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_CORE_CONFIG_MANAGER_H_
 #define MINDSPORE_CCSRC_MINDDATA_DATASET_CORE_CONFIG_MANAGER_H_
 
+#include <atomic>
 #include 
 #include 
 #include 
@@ -178,6 +179,22 @@ class ConfigManager {
   // @return The interval of monitor sampling
   int32_t monitor_sampling_interval() const { return monitor_sampling_interval_; }
 
+  // setter function
+  // @param stop_profiler - The setting to apply to the config
+  void stop_dataset_profiler(bool stop_profiler);
+
+  // getter function
+  // @return Whether the profiler has been requested to stop
+  bool stop_profiler_status() const { return stop_profiler_; }
+
+  // setter function
+  // @param file_ready - The setting to apply to the config
+  void set_profiler_file_status(bool file_ready);
+
+  // getter function
+  // @return Whether the profiler file has been generated
+  bool get_profiler_file_status() const { return file_ready_; }
+
   // setter function
   // @param auto_num_workers - whether assign threads to each op automatically
   void set_auto_num_workers(bool auto_num_workers) { auto_num_workers_ = auto_num_workers; }
@@ -223,6 +240,8 @@ class ConfigManager {
   int32_t rank_id_;
   uint32_t seed_;
   uint32_t monitor_sampling_interval_;
+  std::atomic_bool stop_profiler_;
+  std::atomic_bool file_ready_;
   uint32_t callback_timout_;
   std::string cache_host_;
   int32_t cache_port_;
diff --git a/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampling.cc b/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampling.cc
index de73df5e45..4758fe8c27 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampling.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampling.cc
@@ -113,7 +113,7 @@ Status DeviceCpu::ParseRunningProcess(const std::string &str) {
 Status DeviceCpu::Collect(ExecutionTree *tree) {
   std::ifstream file("/proc/stat");
   if (!file.is_open()) {
-    MS_LOG(WARNING) << "Open CPU file failed when collect CPU information";
+    MS_LOG(INFO) << "Open CPU file failed when collect CPU information";
     return Status::OK();
   }
   bool first_line = true;
@@ -214,7 +214,7 @@ Status OperatorCpu::ParseCpuInfo(int32_t op_id, int64_t thread_id,
 
   std::ifstream file(stat_path);
   if (!file.is_open()) {
-    MS_LOG(WARNING) << "Open CPU file failed when collect CPU information";
+    MS_LOG(INFO) << "Open CPU file failed when collect CPU information";
     return Status::OK();
   }
   std::string str;
@@ -236,7 +236,7 @@ Status OperatorCpu::ParseCpuInfo(int32_t op_id, int64_t thread_id,
 Status OperatorCpu::GetTotalCpuTime(uint64_t *total_stat) {
   std::ifstream file("/proc/stat");
   if (!file.is_open()) {
-    MS_LOG(WARNING) << "Open CPU file failed when collect CPU information";
+    MS_LOG(INFO) << "Open CPU file failed when collect CPU information";
     return Status::OK();
   }
   std::string str;
@@ -443,7 +443,7 @@ Status ProcessCpu::ParseCpuInfo() {
 
     std::ifstream file(stat_path);
     if (!file.is_open()) {
-      MS_LOG(WARNING) << "Open CPU file failed when collect CPU information";
+      MS_LOG(INFO) << "Open CPU file failed when collect CPU information";
       continue;
     }
     std::string str;
@@ -479,7 +479,7 @@ Status ProcessCpu::ParseCpuInfo() {
 Status ProcessCpu::GetTotalCpuTime(uint64_t *total_stat) {
   std::ifstream file("/proc/stat");
   if (!file.is_open()) {
-    MS_LOG(WARNING) << "Open CPU file failed when collect CPU information";
+    MS_LOG(INFO) << "Open CPU file failed when collect CPU information";
     return Status::OK();
   }
   std::string str;
diff --git a/mindspore/ccsrc/minddata/dataset/engine/perf/monitor.cc b/mindspore/ccsrc/minddata/dataset/engine/perf/monitor.cc
index 16fdc6b593..323fafd2cf 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/monitor.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/monitor.cc
@@ -29,11 +29,13 @@ Monitor::Monitor(ExecutionTree *tree) : tree_(tree) {
 Status Monitor::operator()() {
   // Register this thread with TaskManager to receive proper interrupt signal.
   TaskManager::FindMe()->Post();
+  std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager();
+  cfg->set_profiler_file_status(false);
 
   // Keep sampling if
   // 1) Monitor Task is not interrupted by TaskManager AND
   // 2) Iterator has not received EOF
-  while (!this_thread::is_interrupted() && !(tree_->isFinished())) {
+  while (!this_thread::is_interrupted() && !(tree_->isFinished()) && !(cfg->stop_profiler_status())) {
     if (tree_->IsEpochEnd()) {
       RETURN_IF_NOT_OK(tree_->GetProfilingManager()->SaveProfilingData());
       tree_->SetExecuting();
@@ -48,6 +50,8 @@ Status Monitor::operator()() {
   RETURN_IF_NOT_OK(tree_->GetProfilingManager()->Analyze());
   RETURN_IF_NOT_OK(tree_->GetProfilingManager()->SaveProfilingData());
   RETURN_IF_NOT_OK(tree_->GetProfilingManager()->ChangeFileMode());
+
+  cfg->set_profiler_file_status(true);
   return Status::OK();
 }
 
diff --git a/mindspore/dataset/core/config.py b/mindspore/dataset/core/config.py
index 1a7eb3f56d..24436c8ef4 100644
--- a/mindspore/dataset/core/config.py
+++ b/mindspore/dataset/core/config.py
@@ -18,8 +18,10 @@ configuration parameters, and read a configuration file.
 """
 import os
 import random
+import time
 import numpy
 import mindspore._c_dataengine as cde
+from mindspore import log as logger
 
 __all__ = ['set_seed', 'get_seed', 'set_prefetch_size', 'get_prefetch_size', 'set_num_parallel_workers',
            'get_num_parallel_workers', 'set_monitor_sampling_interval', 'get_monitor_sampling_interval', 'load',
@@ -357,3 +359,17 @@
         >>> ds.config.load(config_file)
     """
     _config.load(file)
+
+
+def _stop_dataset_profiler():
+    """
+    Stop the dataset profiler.
+
+    Request the dataset profiler to stop and block until
+    the dataset profiling file has been generated.
+    """
+
+    while not _config.get_profiler_file_status():
+        _config.stop_dataset_profiler(True)
+        logger.warning("Profiling: waiting for dataset profiling to stop.")
+        time.sleep(1)
diff --git a/mindspore/profiler/profiling.py b/mindspore/profiler/profiling.py
index a14c6ae0b0..d4b0501a14 100644
--- a/mindspore/profiler/profiling.py
+++ b/mindspore/profiler/profiling.py
@@ -23,6 +23,7 @@ from enum import Enum
 from mindspore import log as logger, context
 from mindspore.communication.management import GlobalComm, release, get_rank
 import mindspore._c_expression as c_expression
+from mindspore.dataset.core.config import _stop_dataset_profiler
 from mindspore.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException, \
     ProfilerIOException, ProfilerException, ProfilerRawFileException
 from mindspore.profiler.common.util import get_file_names, fwrite_format
@@ -189,6 +190,7 @@ class Profiler:
         Collect and analyse performance data, called after training or during training. The example shows above.
         """
         self._cpu_profiler.stop()
+        _stop_dataset_profiler()
         if self._device_target and self._device_target == "GPU":
             self._gpu_analyse()
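
For reference, the handshake this patch implements is: the Python side repeatedly sets the stop flag and polls the file-ready flag, while the C++ Monitor thread leaves its sampling loop once it observes the stop flag, saves the profiling output, and only then marks the file as ready. Below is a minimal, self-contained Python sketch of that two-flag handshake. It is not MindSpore code: _FakeConfig, monitor_loop, and stop_dataset_profiler are illustrative stand-ins for ConfigManager, Monitor::operator()(), and _stop_dataset_profiler().

import threading
import time


class _FakeConfig:
    """Stand-in for ConfigManager: only the two flags this patch introduces."""

    def __init__(self):
        self.stop_profiler = False  # mirrors ConfigManager::stop_profiler_
        self.file_ready = False     # mirrors ConfigManager::file_ready_


def monitor_loop(cfg):
    """Stand-in for Monitor::operator()(): sample until asked to stop, then
    'save' the profiling output and mark the file as ready."""
    cfg.file_ready = False
    while not cfg.stop_profiler:
        time.sleep(0.1)  # pretend to sample performance data
    # pretend to Analyze() / SaveProfilingData() / ChangeFileMode()
    cfg.file_ready = True


def stop_dataset_profiler(cfg):
    """Stand-in for _stop_dataset_profiler(): request a stop and block until
    the monitor reports that the profiling file has been generated."""
    while not cfg.file_ready:
        cfg.stop_profiler = True
        time.sleep(0.1)


if __name__ == "__main__":
    cfg = _FakeConfig()
    monitor = threading.Thread(target=monitor_loop, args=(cfg,))
    monitor.start()
    time.sleep(0.5)              # let the "monitor" sample for a while
    stop_dataset_profiler(cfg)   # returns only after file_ready is set
    monitor.join()
    print("profiling file ready:", cfg.file_ready)

Note the ordering in the actual patch: file_ready_ is initialized to true in the ConfigManager constructor, so the Python-side wait returns immediately if the dataset Monitor never ran; the Monitor clears the flag when it starts and sets it again only after the profiling files have been written and their mode changed.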