|
|
|
@ -108,6 +108,10 @@ class SummaryCollector(Callback):
|
|
|
|
|
custom_lineage_data (Union[dict, None]): Allows you to customize the data and present it on the MingInsight
|
|
|
|
|
lineage page. In the custom data, the key type support str, and the value type support str/int/float.
|
|
|
|
|
Default: None, it means there is no custom data.
|
|
|
|
|
collect_tensor_freq (Optional[int]): Same as the `collect_freq`, but controls TensorSummary specifically.
|
|
|
|
|
Default: None, which means the frequency is auto-calculated just to collect at most 50 steps TensorSummary.
|
|
|
|
|
max_file_size (Optional[int]): The maximum size in bytes each file can be written to the disk.
|
|
|
|
|
Default: None, which means no limit.
|
|
|
|
|
|
|
|
|
|
Raises:
|
|
|
|
|
ValueError: If the parameter value is not expected.
|
|
|
|
@ -145,16 +149,28 @@ class SummaryCollector(Callback):
|
|
|
|
|
'histogram_regular': None
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
def __init__(self, summary_dir, collect_freq=10, collect_specified_data=None,
|
|
|
|
|
keep_default_action=True, custom_lineage_data=None):
|
|
|
|
|
def __init__(self,
|
|
|
|
|
summary_dir,
|
|
|
|
|
collect_freq=10,
|
|
|
|
|
collect_specified_data=None,
|
|
|
|
|
keep_default_action=True,
|
|
|
|
|
custom_lineage_data=None,
|
|
|
|
|
collect_tensor_freq=None,
|
|
|
|
|
max_file_size=None):
|
|
|
|
|
super(SummaryCollector, self).__init__()
|
|
|
|
|
|
|
|
|
|
self._summary_dir = self._process_summary_dir(summary_dir)
|
|
|
|
|
self._record = None
|
|
|
|
|
|
|
|
|
|
self._check_collect_freq(collect_freq)
|
|
|
|
|
self._check_positive('collect_freq', collect_freq)
|
|
|
|
|
self._collect_freq = collect_freq
|
|
|
|
|
|
|
|
|
|
self._check_positive('collect_tensor_freq', collect_tensor_freq, allow_none=True)
|
|
|
|
|
self._collect_tensor_freq = collect_tensor_freq
|
|
|
|
|
|
|
|
|
|
self._check_positive('max_file_size', max_file_size, allow_none=True)
|
|
|
|
|
self._max_file_size = max_file_size
|
|
|
|
|
|
|
|
|
|
self._check_action(keep_default_action)
|
|
|
|
|
|
|
|
|
|
self._collect_specified_data = self._process_specified_data(collect_specified_data, keep_default_action)
|
|
|
|
@ -165,16 +181,14 @@ class SummaryCollector(Callback):
|
|
|
|
|
self._custom_lineage_data = custom_lineage_data
|
|
|
|
|
|
|
|
|
|
self._temp_optimizer = None
|
|
|
|
|
self._has_saved_train_network = False
|
|
|
|
|
self._has_saved_custom_data = False
|
|
|
|
|
self._is_parse_loss_success = True
|
|
|
|
|
self._first_step = True
|
|
|
|
|
self._dataset_sink_mode = True
|
|
|
|
|
|
|
|
|
|
def __enter__(self):
|
|
|
|
|
self._first_step = True
|
|
|
|
|
self._dataset_sink_mode = True
|
|
|
|
|
self._record = SummaryRecord(log_dir=self._summary_dir)
|
|
|
|
|
self._record = SummaryRecord(log_dir=self._summary_dir, max_file_size=self._max_file_size)
|
|
|
|
|
self._first_step, self._dataset_sink_mode = True, True
|
|
|
|
|
return self
|
|
|
|
|
|
|
|
|
|
def __exit__(self, *err):
|
|
|
|
@ -198,11 +212,13 @@ class SummaryCollector(Callback):
|
|
|
|
|
return summary_dir
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _check_collect_freq(freq):
|
|
|
|
|
"""Check collect freq type and value."""
|
|
|
|
|
check_value_type('collect_freq', freq, int)
|
|
|
|
|
if freq <= 0:
|
|
|
|
|
raise ValueError(f'For `collect_freq` the value should be greater than 0, but got `{freq}`.')
|
|
|
|
|
def _check_positive(name, value, allow_none=False):
|
|
|
|
|
"""Check if the value to be int type and positive."""
|
|
|
|
|
if allow_none:
|
|
|
|
|
return
|
|
|
|
|
check_value_type(name, value, int)
|
|
|
|
|
if value <= 0:
|
|
|
|
|
raise ValueError(f'For `{name}` the value should be greater than 0, but got `{value}`.')
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _check_custom_lineage_data(custom_lineage_data):
|
|
|
|
@ -276,6 +292,9 @@ class SummaryCollector(Callback):
|
|
|
|
|
self._collect_graphs(cb_params)
|
|
|
|
|
|
|
|
|
|
self._collect_dataset_graph(cb_params)
|
|
|
|
|
if self._collect_tensor_freq is None:
|
|
|
|
|
total_step = cb_params.epoch_num * cb_params.batch_num
|
|
|
|
|
self._collect_tensor_freq = max(self._collect_freq, total_step // 50)
|
|
|
|
|
|
|
|
|
|
if self._custom_lineage_data and not self._has_saved_custom_data:
|
|
|
|
|
packaged_custom_data = self._package_custom_lineage_data(self._custom_lineage_data)
|
|
|
|
@ -287,24 +306,29 @@ class SummaryCollector(Callback):
|
|
|
|
|
|
|
|
|
|
def step_end(self, run_context):
|
|
|
|
|
cb_params = run_context.original_args()
|
|
|
|
|
if cb_params.mode != ModeEnum.TRAIN.value:
|
|
|
|
|
return
|
|
|
|
|
if self._first_step:
|
|
|
|
|
# Notice: This way of determining whether dataset sink mode is True does not work in the eval scenario
|
|
|
|
|
self._dataset_sink_mode = bool(cb_params.cur_step_num == cb_params.batch_num)
|
|
|
|
|
|
|
|
|
|
if cb_params.mode == ModeEnum.TRAIN.value:
|
|
|
|
|
|
|
|
|
|
if not self._is_collect_this_step(cb_params):
|
|
|
|
|
return
|
|
|
|
|
self._dataset_sink_mode = cb_params.cur_step_num == cb_params.batch_num
|
|
|
|
|
self._collect_at_step_end(cb_params, plugin_filter=None)
|
|
|
|
|
self._first_step = False
|
|
|
|
|
else:
|
|
|
|
|
current = cb_params.cur_epoch_num if self._dataset_sink_mode else cb_params.cur_step_num
|
|
|
|
|
if current % self._collect_freq == 0 and current % self._collect_tensor_freq == 0:
|
|
|
|
|
self._collect_at_step_end(cb_params, plugin_filter=None)
|
|
|
|
|
elif current % self._collect_tensor_freq == 0:
|
|
|
|
|
self._collect_at_step_end(cb_params, lambda plugin: plugin == PluginEnum.TENSOR.value)
|
|
|
|
|
elif current % self._collect_freq == 0:
|
|
|
|
|
self._collect_at_step_end(cb_params, lambda plugin: plugin != PluginEnum.TENSOR.value)
|
|
|
|
|
|
|
|
|
|
if not self._has_saved_train_network:
|
|
|
|
|
self._collect_graphs(cb_params)
|
|
|
|
|
|
|
|
|
|
self._collect_input_data(cb_params)
|
|
|
|
|
self._collect_metric(cb_params)
|
|
|
|
|
self._collect_histogram(cb_params)
|
|
|
|
|
def _collect_at_step_end(self, cb_params, plugin_filter):
|
|
|
|
|
self._collect_input_data(cb_params)
|
|
|
|
|
self._collect_metric(cb_params)
|
|
|
|
|
self._collect_histogram(cb_params)
|
|
|
|
|
self._record.record(cb_params.cur_step_num, plugin_filter=plugin_filter)
|
|
|
|
|
|
|
|
|
|
self._first_step = False
|
|
|
|
|
self._record.record(cb_params.cur_step_num)
|
|
|
|
|
|
|
|
|
|
def end(self, run_context):
|
|
|
|
|
cb_params = run_context.original_args()
|
|
|
|
@ -331,18 +355,6 @@ class SummaryCollector(Callback):
|
|
|
|
|
raise ValueError(f"There are more than one {self.__class__.__name__} instance in callback list,"
|
|
|
|
|
f"but expected only one {self.__class__.__name__} instance.")
|
|
|
|
|
|
|
|
|
|
def _is_collect_this_step(self, cb_params):
|
|
|
|
|
"""Decide whether to collect data for the current step."""
|
|
|
|
|
# Make sure the first step data is recorded
|
|
|
|
|
if not self._first_step:
|
|
|
|
|
if self._dataset_sink_mode:
|
|
|
|
|
if cb_params.cur_epoch_num % self._collect_freq:
|
|
|
|
|
return False
|
|
|
|
|
else:
|
|
|
|
|
if cb_params.cur_step_num % self._collect_freq:
|
|
|
|
|
return False
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _package_custom_lineage_data(custom_lineage_data):
|
|
|
|
|
"""
|
|
|
|
@ -411,7 +423,6 @@ class SummaryCollector(Callback):
|
|
|
|
|
if graph_proto is None:
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
self._has_saved_train_network = True
|
|
|
|
|
self._record.add_value(PluginEnum.GRAPH.value, 'train_network/auto', graph_proto)
|
|
|
|
|
|
|
|
|
|
def _collect_metric(self, cb_params):
|
|
|
|
|