Do not support auto saving of integrated checkpoint files in manual parallel mode

pull/715/head
WeibiaoYu 5 years ago
parent 5519bce8ae
commit aacc85caec

@@ -150,8 +150,8 @@ class CheckpointConfig:
        keep_checkpoint_max (int): Maximum step to save checkpoint. Default: 5.
        keep_checkpoint_per_n_minutes (int): Keep one checkpoint every n minutes. Default: 0.
            Can't be used with keep_checkpoint_max at the same time.
-       integrated_save (bool): Whether to intergrated save in automatic model parall scene. Default: True.
-           Integrated save function is only supported in automatic parall scene, not supported in manual parallel.
+       integrated_save (bool): Whether to intergrated save in automatic model parallel scene. Default: True.
+           Integrated save function is only supported in automatic parallel scene, not supported in manual parallel.

    Raises:
        ValueError: If the input_param is None or 0.
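For context, a short usage sketch of the option this hunk documents. `CheckpointConfig` and `ModelCheckpoint` are the real MindSpore callback classes; the argument values and prefix here are illustrative only:

```python
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig

# Illustrative values. In a manual-parallel script, integrated_save must be
# set to False, since merging parameter slices is only done in auto parallel.
config = CheckpointConfig(save_checkpoint_steps=100,
                          keep_checkpoint_max=5,
                          integrated_save=False)
ckpt_cb = ModelCheckpoint(prefix="lenet", directory="./checkpoints", config=config)
```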

@@ -225,15 +225,6 @@ def load_param_into_net(net, parameter_dict):
        raise TypeError(msg)
    logger.info("Execute load parameter into net process.")
-    for name in parameter_dict:
-        for _, param in net.parameters_and_names():
-            if name == param.name and param.layerwise_parallel:
-                # layerwise parallel parameter data loaded from checkpoint file,
-                # was a complete(merged) data, need to be splited
-                new_param = parameter_dict[param.name]
-                _load_tensor_for_layerwise(new_param, param)
-                break
    param_not_load = []
    for _, param in net.parameters_and_names():
        if param.name in parameter_dict:
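After this change the only path left is the plain name-based match above: every parameter is loaded by name, with no per-parameter re-slicing inside `load_param_into_net`. A minimal usage sketch (the checkpoint file name and stand-in network are hypothetical; `load_checkpoint` and `load_param_into_net` are the real serialization APIs):

```python
from mindspore import nn
from mindspore.train.serialization import load_checkpoint, load_param_into_net

net = nn.Dense(16, 10)                      # stand-in network; any Cell works
param_dict = load_checkpoint("dense.ckpt")  # hypothetical checkpoint file
# Parameters are matched purely by name; the removed loop that re-sliced
# layerwise-parallel parameters no longer runs here.
load_param_into_net(net, param_dict)
```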
@@ -363,34 +354,6 @@ def _get_merged_param_data(net, param_name, param_data):
    return param_data
-def _load_tensor_for_layerwise(new_param, old_param):
-    """
-    Replaces parameters with sliced tensors by layerwise parallel strategies.
-
-    Args:
-        new_param (Parameter): The new layerwise parallel parameter, will be loaded into net.
-        old_param (Parameter): The current parameter in the net.
-    """
-    if not isinstance(new_param.data, Tensor) or not isinstance(old_param.data, Tensor):
-        logger.error("Failed to combine the net and the parameters.")
-        msg = ("layerwise parallel parameter should be a Tensor, but got {}.".format(type(new_param.data)))
-        raise TypeError(msg)
-
-    if old_param.data.shape() == new_param.data.shape():
-        return
-
-    from mindspore.parallel._tensor import _load_tensor
-    from mindspore.communication.management import get_group_size
-    dev_mat = [get_group_size()]
-    tensor_map = [0]
-    shape = new_param.data.shape()
-    for x in range(len(shape)):  # dim 0 set 0, others set -1
-        if x:
-            tensor_map.append(-1)
-    new_tensor = _load_tensor(new_param.data, dev_mat, tensor_map)
-    new_param.set_parameter_data(new_tensor)
def _fill_param_into_net(net, parameter_list):
    """
    Fills parameter_list into net.
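For reference, the removed helper described its layout as `dev_mat = [group_size]` with `tensor_map = [0, -1, ..., -1]`, i.e. the merged parameter was sharded along dim 0 across the whole device group. A minimal NumPy sketch of that slicing semantics (`split_layerwise` is an illustrative name, not MindSpore's internal `_load_tensor`):

```python
import numpy as np

def split_layerwise(merged, group_size, rank_id):
    """Illustrative equivalent of dev_mat=[group_size], tensor_map=[0, -1, ...]:
    shard the merged tensor along dim 0 and keep this rank's slice."""
    slices = np.split(merged, group_size, axis=0)
    return slices[rank_id]

merged = np.arange(32).reshape(8, 4)   # the complete (merged) parameter
local = split_layerwise(merged, group_size=4, rank_id=1)
print(local.shape)                     # (2, 4): the shard held by rank 1
```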
