Reformat fleet API (#17135)

* fix some logic in distributed transpiler, test=develop
* reformat fleet API, test=develop
revert-17304-fix_default_paddle_version
tangwei12 6 years ago committed by GitHub
parent a88a1faa48
commit 565d309501
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
import sys
import logging import logging
import paddle.fluid as fluid import paddle.fluid as fluid
@ -26,37 +25,21 @@ from ..base.fleet_base import DistributedOptimizer
class Collective(Fleet): class Collective(Fleet):
def __init__(self): def __init__(self):
super(Collective, self).__init__(Mode.COLLECTIVE) super(Collective, self).__init__(Mode.COLLECTIVE)
self.local_ip_ = 0 self._local_ip = 0
def init(self, role_maker=None): def init_worker(self):
"""
should be called only once in user's python scripts,
init() will initialize RoleMaker which is used for identifying
current node's role, e.g. worker, server, etc.
Args:
role_maker(RoleMakerBase): subclass of RoleMakerBase.
Returns:
None
"""
super(Collective, self).init(role_maker)
self._role_maker._generate_role()
def init_worker(self, executor):
logging.warn( logging.warn(
"You should not call 'init_worker' method for collective mode.") "You should not call 'init_worker' method for collective mode.")
def run_worker(self, executor, main_program=None): def run_worker(self, main_programs=None, scopes=None):
logging.warn( logging.warn(
"You should not call 'run_worker' method for collective mode.") "You should not call 'run_worker' method for collective mode.")
def init_server(self, executor, model_dir=None): def init_server(self, model_dir=None):
logging.warn( logging.warn(
"You should not call 'init_server' method for collective mode.") "You should not call 'init_server' method for collective mode.")
def run_server(self, executor): def run_server(self):
logging.warn( logging.warn(
"You should not call 'run_server' method for collective mode.") "You should not call 'run_server' method for collective mode.")
@ -64,29 +47,28 @@ class Collective(Fleet):
logging.warn( logging.warn(
"You should not call 'stop_worker' method for collective mode.") "You should not call 'stop_worker' method for collective mode.")
def stop(self, executor): def stop(self):
""" """
stop(): will be called after a user finishes his/her training task. stop(): will be called after a user finishes his/her training task.
""" """
logging.warn("You should not call 'stop' method for collective mode.") logging.warn("You should not call 'stop' method for collective mode.")
def distributed_optimizer(self, optimizer, strategy=None): def distributed_optimizer(self, optimizer, strategy=None):
self.optimizer = CollectiveOptimizer(optimizer, strategy) self._optimizer = CollectiveOptimizer(optimizer, strategy)
return self.optimizer return self._optimizer
def save_inference_model(self, def save_inference_model(self,
executor,
dirname, dirname,
feeded_var_names=None, feeded_var_names=None,
target_vars=None, target_vars=None,
main_program=None, main_program=None,
export_for_deployment=True): export_for_deployment=True):
io.save_inference_model(dirname, feeded_var_names, target_vars, io.save_inference_model(dirname, feeded_var_names, target_vars,
executor, main_program, None, None, self._executor, main_program, None, None,
export_for_deployment) export_for_deployment)
def save_persistables(self, executor, dirname, main_program=None): def save_persistables(self, dirname, main_program=None):
io.save_persistables(executor, dirname, main_program, None) io.save_persistables(self._executor, dirname, main_program, None)
fleet = Collective() fleet = Collective()
@ -143,9 +125,9 @@ class CollectiveOptimizer(DistributedOptimizer):
optimize_ops, param_grads = self._optimizer.minimize( optimize_ops, param_grads = self._optimizer.minimize(
loss, startup_program, parameter_list, no_grad_set) loss, startup_program, parameter_list, no_grad_set)
worker_endpoints = fleet.worker_endpoints worker_endpoints = fleet.worker_endpoints()
trainer_id = fleet.current_id trainer_id = fleet.worker_index()
current_endpoint = fleet.current_endpoint current_endpoint = fleet.worker_endpoints()[trainer_id]
startup_program = startup_program if startup_program else \ startup_program = startup_program if startup_program else \
fluid.framework.default_startup_program fluid.framework.default_startup_program

@ -94,7 +94,7 @@ class DownpourServer(Server):
Returns: Returns:
return None return None
""" """
table = self.server_.downpour_server_param.downpour_table_param.add() table = self._server.downpour_server_param.downpour_table_param.add()
table.table_id = table_id table.table_id = table_id
table.table_class = "DownpourDenseTable" table.table_class = "DownpourDenseTable"
table.type = pslib.PS_DENSE_TABLE table.type = pslib.PS_DENSE_TABLE
@ -169,7 +169,7 @@ class DownpourWorker(Worker):
Returns: Returns:
return None return None
""" """
table = self.worker_.sparse_table.add() table = self._worker.sparse_table.add()
table.table_id = table_id table.table_id = table_id
table.slot_key.extend([var.name for var in slot_key_vars]) table.slot_key.extend([var.name for var in slot_key_vars])
table.slot_value.extend([var.name for var in slot_value_vars]) table.slot_value.extend([var.name for var in slot_value_vars])

@ -66,8 +66,6 @@ class DistributedAdam(DistributedOptimizerImplBase):
Returns: Returns:
[optimize_ops, grads_and_weights] [optimize_ops, grads_and_weights]
""" """
if not isinstance(losses, list):
losses = [losses]
table_name = find_distributed_lookup_table(losses[0].block.program) table_name = find_distributed_lookup_table(losses[0].block.program)
prefetch_slots = find_distributed_lookup_table_inputs( prefetch_slots = find_distributed_lookup_table_inputs(
@ -77,7 +75,7 @@ class DistributedAdam(DistributedOptimizerImplBase):
ps_param = pslib.PSParameter() ps_param = pslib.PSParameter()
server = DownpourServer() server = DownpourServer()
worker = DownpourWorker(self.window_) worker = DownpourWorker(self._window)
sparse_table_index = 0 sparse_table_index = 0
server.add_sparse_table(sparse_table_index, self._learning_rate, server.add_sparse_table(sparse_table_index, self._learning_rate,
prefetch_slots, prefetch_slots_emb) prefetch_slots, prefetch_slots_emb)
@ -88,17 +86,12 @@ class DistributedAdam(DistributedOptimizerImplBase):
param_grads_list = [] param_grads_list = []
for loss_index in range(len(losses)): for loss_index in range(len(losses)):
#program_config = ps_param.trainer_param.program_config.add()
#program_config.program_id = str(
# id(losses[loss_index].block.program))
program_id = str(id(losses[loss_index].block.program)) program_id = str(id(losses[loss_index].block.program))
program_configs[program_id] = { program_configs[program_id] = {
"pull_sparse": [sparse_table_index], "pull_sparse": [sparse_table_index],
"push_sparse": [sparse_table_index] "push_sparse": [sparse_table_index]
} }
#program_config.pull_sparse_table_id.extend([sparse_table_index])
#program_config.push_sparse_table_id.extend([sparse_table_index])
params_grads = sorted( params_grads = sorted(
fluid.backward.append_backward(losses[loss_index], fluid.backward.append_backward(losses[loss_index],
parameter_list, no_grad_set), parameter_list, no_grad_set),
@ -130,8 +123,6 @@ class DistributedAdam(DistributedOptimizerImplBase):
params, grads) params, grads)
program_configs[program_id]["pull_dense"] = [dense_table_index] program_configs[program_id]["pull_dense"] = [dense_table_index]
program_configs[program_id]["push_dense"] = [dense_table_index] program_configs[program_id]["push_dense"] = [dense_table_index]
#program_config.pull_dense_table_id.extend([dense_table_index])
#program_config.push_dense_table_id.extend([dense_table_index])
if len(data_norm_params) != 0 and len(data_norm_grads) != 0: if len(data_norm_params) != 0 and len(data_norm_grads) != 0:
dense_table_index += 1 dense_table_index += 1
server.add_data_norm_table(dense_table_index, server.add_data_norm_table(dense_table_index,
@ -139,18 +130,13 @@ class DistributedAdam(DistributedOptimizerImplBase):
data_norm_params, data_norm_grads) data_norm_params, data_norm_grads)
worker.add_dense_table(dense_table_index, self._learning_rate, worker.add_dense_table(dense_table_index, self._learning_rate,
data_norm_params, data_norm_grads) data_norm_params, data_norm_grads)
#program_config.pull_dense_table_id.extend([dense_table_index])
#program_config.push_dense_table_id.extend([dense_table_index])
program_configs[program_id]["pull_dense"].extend( program_configs[program_id]["pull_dense"].extend(
[dense_table_index]) [dense_table_index])
program_configs[program_id]["push_dense"].extend( program_configs[program_id]["push_dense"].extend(
[dense_table_index]) [dense_table_index])
dense_table_index += 1 dense_table_index += 1
#program_configs.append(program_config)
ps_param.server_param.CopyFrom(server.get_desc()) ps_param.server_param.CopyFrom(server.get_desc())
ps_param.trainer_param.CopyFrom(worker.get_desc()) ps_param.trainer_param.CopyFrom(worker.get_desc())
#for program_config in program_configs:
# ps_param.trainer_param.program_config.extend([program_config])
# Todo(guru4elephant): figure out how to support more sparse parameters # Todo(guru4elephant): figure out how to support more sparse parameters
# currently only support lookup_table # currently only support lookup_table
worker_skipped_ops = ["lookup_table", "lookup_table_grad"] worker_skipped_ops = ["lookup_table", "lookup_table_grad"]

Loading…
Cancel
Save