@@ -182,12 +182,16 @@ class DistributedAdam(DistributedOptimizerImplBase):
         prog_id_to_param_grads = OrderedDict()
         # sparse_grads of each program
         prog_id_to_sparse_grads = OrderedDict()
+        # unique program set
+        program_id_set = set()
 
         sparse_table_to_index = OrderedDict()
         sparse_table_index = 0
         for loss in losses:
-            sparse_table = self._find_multi_distributed_lookup_table([loss])
             prog_id = str(id(loss.block.program))
+            if prog_id not in program_id_set:
+                program_id_set.add(prog_id)
+                sparse_table = self._find_multi_distributed_lookup_table([loss])
                 prog_id_to_sparse_table[prog_id] = sparse_table
 
                 # get sparse_table_to_index
@@ -207,16 +211,20 @@ class DistributedAdam(DistributedOptimizerImplBase):
 
                 prog_id_to_worker[prog_id] = DownpourWorker(self._window)
 
+                grads_dict = self._find_distributed_lookup_table_grads(
+                    loss.block.program, sparse_table)
+                prog_id_to_sparse_grads[prog_id] = grads_dict
 
             # param_grads of program
             params_grads = sorted(
                 fluid.backward.append_backward(loss, parameter_list,
                                                no_grad_set),
                 key=lambda x: x[0].name)
-            prog_id_to_param_grads[prog_id] = params_grads
+            if prog_id not in prog_id_to_param_grads:
+                prog_id_to_param_grads[prog_id] = []
+            prog_id_to_param_grads[prog_id].append(params_grads)
 
-            grads_dict = self._find_distributed_lookup_table_grads(
-                loss.block.program, sparse_table)
-            prog_id_to_sparse_grads[prog_id] = grads_dict
         #if strategy.get("parallel_compute")
 
         # if user specify a fleet_desc.prototxt file, then load the file
         # instead of creating default fleet_desc.prototxt.
@@ -251,8 +259,11 @@ class DistributedAdam(DistributedOptimizerImplBase):
                 server.add_sparse_table(sparse_table_index, None)
 
         # each DownpourTrainerParameter add its own sparse tables
+        program_id_set.clear()
         for loss in losses:
             prog_id = str(id(loss.block.program))
+            if prog_id not in program_id_set:
+                program_id_set.add(prog_id)
                 worker = prog_id_to_worker[prog_id]
                 inputs_dict = prog_id_to_inputs_dict[prog_id]
                 outputs_dict = prog_id_to_outputs_dict[prog_id]
@@ -267,8 +278,11 @@ class DistributedAdam(DistributedOptimizerImplBase):
         program_configs = {}
         # ServerParameter add all dense tables
         # each DownpourTrainerParameter add its own dense tables
+        program_id_set.clear()
         for loss_index in range(len(losses)):
             program_id = str(id(losses[loss_index].block.program))
+            if program_id not in program_id_set:
+                program_id_set.add(program_id)
                 worker = prog_id_to_worker[program_id]
                 sparse_table_names = prog_id_to_sparse_table[program_id]
                 sparse_table_index = \
@@ -280,11 +294,12 @@ class DistributedAdam(DistributedOptimizerImplBase):
                 }
 
                 params_grads = prog_id_to_param_grads[program_id]
+                for pg in params_grads:
                     params = []
                     grads = []
                     data_norm_params = []
                     data_norm_grads = []
-            for i in params_grads:
+                    for i in pg:
                         is_data_norm_data = False
                         for data_norm_name in self.data_norm_name:
                             if i[0].name.endswith(data_norm_name):
@@ -293,7 +308,7 @@ class DistributedAdam(DistributedOptimizerImplBase):
                         if not is_data_norm_data:
                             params.append(i[0])
 
-            for i in params_grads:
+                    for i in pg:
                         is_data_norm_data = False
                         for data_norm_grad in self.data_norm_name:
                             if i[0].name.endswith(data_norm_grad):
@@ -307,13 +322,24 @@ class DistributedAdam(DistributedOptimizerImplBase):
                                                strategy['dense_table'],
                                                sparse_table_names)
                     else:
-                server.add_dense_table(dense_table_index, params, grads, None,
-                                       sparse_table_names)
-            worker.add_dense_table(dense_table_index, self._learning_rate,
-                                   params, grads, dense_start_table_id,
-                                   sparse_table_names)
-            program_configs[program_id]["pull_dense"] = [dense_table_index]
-            program_configs[program_id]["push_dense"] = [dense_table_index]
+                        server.add_dense_table(dense_table_index, params, grads,
+                                               None, sparse_table_names)
+                    worker.add_dense_table(
+                        dense_table_index, self._learning_rate, params, grads,
+                        dense_start_table_id, sparse_table_names)
+                    if "pull_dense" in program_configs[
+                            program_id] and "push_dense" in program_configs[
+                                program_id] and len(program_configs[program_id][
+                                    "pull_dense"]) > 0:
+                        program_configs[program_id]["pull_dense"].extend(
+                            [dense_table_index])
+                        program_configs[program_id]["push_dense"].extend(
+                            [dense_table_index])
+                    else:
+                        program_configs[program_id][
+                            "pull_dense"] = [dense_table_index]
+                        program_configs[program_id][
+                            "push_dense"] = [dense_table_index]
                     if len(data_norm_params) != 0 and len(data_norm_grads) != 0:
                         dense_table_index += 1
                         if strategy.get('datanorm_table') is not None:
@@ -327,7 +353,8 @@ class DistributedAdam(DistributedOptimizerImplBase):
                                                    data_norm_params, data_norm_grads, None,
                                                    sparse_table_names)
 
-                worker.add_dense_table(dense_table_index, self._learning_rate,
+                        worker.add_dense_table(
+                            dense_table_index, self._learning_rate,
                             data_norm_params, data_norm_grads,
                             dense_start_table_id, sparse_table_names)
                         program_configs[program_id]["pull_dense"].extend(
@@ -370,13 +397,16 @@ class DistributedAdam(DistributedOptimizerImplBase):
         opt_info["program_id_to_worker"] = prog_id_to_worker
         opt_info["program_configs"] = program_configs
         opt_info["trainer"] = "DistMultiTrainer"
-        opt_info["device_worker"] = "DownpourSGD"
+        opt_info["device_worker"] = strategy.get("device_worker", "DownpourSGD")
         opt_info["optimizer"] = "DownpourSGD"
         opt_info["fleet_desc"] = ps_param
         opt_info["worker_skipped_ops"] = worker_skipped_ops
         opt_info["use_cvm"] = strategy.get("use_cvm", False)
         opt_info["no_cvm"] = strategy.get("no_cvm", False)
         opt_info["stat_var_names"] = strategy.get("stat_var_names", [])
+        opt_info["local_tables"] = strategy.get("local_tables", [])
+        opt_info["async_tables"] = strategy.get("async_tables", [])
+        opt_info["async_tables"] = strategy.get("async_tables", [])
         opt_info["scale_datanorm"] = strategy.get("scale_datanorm", -1)
         opt_info["check_nan_var_names"] = strategy.get("check_nan_var_names",
                                                        [])
@@ -391,6 +421,7 @@ class DistributedAdam(DistributedOptimizerImplBase):
         opt_info["dump_slot"] = True
         opt_info["adjust_ins_weight"] = strategy.get("adjust_ins_weight", {})
         opt_info["copy_table"] = strategy.get("copy_table", {})
+        opt_info["loss_names"] = strategy.get("loss_names", [])
 
         for loss in losses:
             loss.block.program._fleet_opt = opt_info
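Usage sketch (not part of the patch): the hunks above read several new options from the `strategy` dict passed into `_minimize`, e.g. `device_worker` and `loss_names`, and they allow passing multiple losses that may share one Program. A minimal, assumed example of how such a strategy could be supplied through the pslib fleet API is shown below; the fleet import path and optimizer settings are illustrative, only the strategy key names are taken from the diff.

# Illustrative sketch: key names match the strategy.get(...) calls in the diff;
# the import path and surrounding setup are assumptions, not part of the patch.
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet

strategy = {
    "device_worker": "DownpourSGD",  # read via strategy.get("device_worker", "DownpourSGD")
    "use_cvm": True,                 # read via strategy.get("use_cvm", False)
    "loss_names": ["loss"],          # read via strategy.get("loss_names", [])
}

adam = fluid.optimizer.Adam(learning_rate=0.000005)
adam = fleet.distributed_optimizer(adam, strategy=strategy)
# Losses may share the same Program; _minimize dedupes per-program setup
# via program_id_set and accumulates params_grads per program.
adam.minimize([loss])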