|
|
|
@ -165,6 +165,7 @@ def save_vars(executor,
|
|
|
|
|
|
|
|
|
|
save_vars(
|
|
|
|
|
executor,
|
|
|
|
|
main_program=main_program,
|
|
|
|
|
dirname=dirname,
|
|
|
|
|
vars=list(filter(predicate, main_program.list_vars())),
|
|
|
|
|
filename=filename)
|
|
|
|
@ -172,11 +173,18 @@ def save_vars(executor,
|
|
|
|
|
save_program = Program()
|
|
|
|
|
save_block = save_program.global_block()
|
|
|
|
|
|
|
|
|
|
if main_program is None:
|
|
|
|
|
main_program = default_main_program()
|
|
|
|
|
if not isinstance(main_program, Program):
|
|
|
|
|
raise TypeError("program should be as Program type or None")
|
|
|
|
|
|
|
|
|
|
save_var_map = {}
|
|
|
|
|
for each_var in vars:
|
|
|
|
|
# NOTE: do not save variables whose type is RAW
|
|
|
|
|
if each_var.type == core.VarDesc.VarType.RAW:
|
|
|
|
|
continue
|
|
|
|
|
if each_var.name == main_program._distributed_lookup_table:
|
|
|
|
|
continue
|
|
|
|
|
new_var = _clone_var_in_block_(save_block, each_var)
|
|
|
|
|
if filename is None:
|
|
|
|
|
save_block.append_op(
|
|
|
|
@ -198,6 +206,16 @@ def save_vars(executor,
|
|
|
|
|
outputs={},
|
|
|
|
|
attrs={'file_path': os.path.join(dirname, filename)})
|
|
|
|
|
|
|
|
|
|
# If there is a lookup table, trainer 0 will notify all pservers to save it.
|
|
|
|
|
if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
|
|
|
|
|
lookup_table_filename = os.path.join(dirname, "__lookup_table__")
|
|
|
|
|
attrs = {}
|
|
|
|
|
attrs['epmap'] = main_program._endpoints
|
|
|
|
|
attrs['dir'] = lookup_table_filename
|
|
|
|
|
attrs['lookup_table'] = main_program._distributed_lookup_table
|
|
|
|
|
save_block.append_op(
|
|
|
|
|
type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
|
|
|
|
|
|
|
|
|
|
executor.run(save_program)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -379,11 +397,22 @@ def load_vars(executor,
|
|
|
|
|
load_prog = Program()
|
|
|
|
|
load_block = load_prog.global_block()
|
|
|
|
|
|
|
|
|
|
if main_program is None:
|
|
|
|
|
main_program = default_main_program()
|
|
|
|
|
if not isinstance(main_program, Program):
|
|
|
|
|
raise TypeError("program should be as Program type or None")
|
|
|
|
|
|
|
|
|
|
load_slice_vars = []
|
|
|
|
|
for each_var in main_program._slice_vars_and_attrs:
|
|
|
|
|
load_slice_vars.append(each_var[2].name)
|
|
|
|
|
|
|
|
|
|
load_var_map = {}
|
|
|
|
|
for each_var in vars:
|
|
|
|
|
assert isinstance(each_var, Variable)
|
|
|
|
|
if each_var.type == core.VarDesc.VarType.RAW:
|
|
|
|
|
continue
|
|
|
|
|
if each_var.name in load_slice_vars:
|
|
|
|
|
continue
|
|
|
|
|
new_var = _clone_var_in_block_(load_block, each_var)
|
|
|
|
|
if filename is None:
|
|
|
|
|
load_block.append_op(
|
|
|
|
@ -406,9 +435,6 @@ def load_vars(executor,
|
|
|
|
|
attrs={'file_path': os.path.join(dirname, filename)})
|
|
|
|
|
executor.run(load_prog)
|
|
|
|
|
|
|
|
|
|
if main_program is None:
|
|
|
|
|
main_program = default_main_program()
|
|
|
|
|
|
|
|
|
|
# Load the slice vars on the pserver, if there are any.
|
|
|
|
|
_load_slice_up_vars(executor, dirname,
|
|
|
|
|
main_program._slice_vars_and_attrs)
|
|
|
|
@ -618,13 +644,6 @@ def save_inference_model(dirname,
|
|
|
|
|
if main_program is None:
|
|
|
|
|
main_program = default_main_program()
|
|
|
|
|
|
|
|
|
|
# If there is a lookup table, trainer 0 will notify all pservers to save it.
|
|
|
|
|
if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
|
|
|
|
|
lookup_table_filename = os.path.join(dirname, "__lookup_table__")
|
|
|
|
|
_save_lookup_tables_by_notify(executor, lookup_table_filename,
|
|
|
|
|
main_program._distributed_lookup_table,
|
|
|
|
|
main_program._endpoints)
|
|
|
|
|
|
|
|
|
|
# When a pserver and a trainer are running on the same machine, mkdir may conflict.
|
|
|
|
|
try:
|
|
|
|
|
os.makedirs(dirname)
|
|
|
|
@ -642,6 +661,9 @@ def save_inference_model(dirname,
|
|
|
|
|
# it can only be loaded for inference directly. If it's false, the whole
|
|
|
|
|
# original program and related meta are saved so that future usage can be
|
|
|
|
|
# more flexible.
|
|
|
|
|
|
|
|
|
|
origin_program = main_program.clone()
|
|
|
|
|
|
|
|
|
|
if export_for_deployment:
|
|
|
|
|
main_program = main_program.clone()
|
|
|
|
|
global_block = main_program.global_block()
|
|
|
|
@ -666,8 +688,11 @@ def save_inference_model(dirname,
|
|
|
|
|
with open(model_basename + ".main_program", "wb") as f:
|
|
|
|
|
f.write(main_program.desc.serialize_to_string())
|
|
|
|
|
|
|
|
|
|
main_program._copy_dist_param_info_from(origin_program)
|
|
|
|
|
|
|
|
|
|
if params_filename is not None:
|
|
|
|
|
params_filename = os.path.basename(params_filename)
|
|
|
|
|
|
|
|
|
|
save_persistables(executor, dirname, main_program, params_filename)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -897,6 +922,9 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
|
|
|
|
|
slice_var = var_tuple[2]
|
|
|
|
|
end = start + slice_var.shape[0]
|
|
|
|
|
|
|
|
|
|
orig_var_name = orig_var.name
|
|
|
|
|
orig_var.name = "{}.origin".format(orig_var_name)
|
|
|
|
|
|
|
|
|
|
clone_orig_var = load_block.create_var(
|
|
|
|
|
name=orig_var.name,
|
|
|
|
|
type=orig_var.type,
|
|
|
|
@ -915,7 +943,7 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
|
|
|
|
|
type='load',
|
|
|
|
|
inputs={},
|
|
|
|
|
outputs={'Out': [clone_orig_var]},
|
|
|
|
|
attrs={'file_path': os.path.join(dirname, clone_orig_var.name)})
|
|
|
|
|
attrs={'file_path': os.path.join(dirname, orig_var_name)})
|
|
|
|
|
load_block.append_op(
|
|
|
|
|
type="slice",
|
|
|
|
|
inputs={'Input': clone_orig_var},
|
|
|
|
@ -924,6 +952,7 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
|
|
|
|
|
'starts': [start],
|
|
|
|
|
'ends': [end]})
|
|
|
|
|
need_delete_vars.append(clone_orig_var)
|
|
|
|
|
|
|
|
|
|
load_block.append_op(
|
|
|
|
|
type='delete_var',
|
|
|
|
|
inputs={'X': need_delete_vars}, )
|
|
|
|
|