Reformat fleet API (#17135)

* fix some logic in distributed transpiler, test=develop
* reformat fleet API, test=develop
tangwei12 committed 6 years ago (via GitHub)
parent a88a1faa48
commit 565d309501

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import logging
 import paddle.fluid as fluid
@@ -26,37 +25,21 @@ from ..base.fleet_base import DistributedOptimizer
 class Collective(Fleet):
     def __init__(self):
         super(Collective, self).__init__(Mode.COLLECTIVE)
-        self.local_ip_ = 0
+        self._local_ip = 0

     def init(self, role_maker=None):
-        """
-        should be called only once in user's python scripts,
-        init() will initialize RoleMaker which is used for identifying
-        current node's role, e.g. worker, server, etc.
-
-        Args:
-            role_maker(RoleMakerBase): subclass of RoleMakerBase.
-
-        Returns:
-            None
-        """
         super(Collective, self).init(role_maker)
         self._role_maker._generate_role()

-    def init_worker(self, executor):
+    def init_worker(self):
         logging.warn(
             "You should not call 'init_worker' method for collective mode.")

-    def run_worker(self, executor, main_program=None):
+    def run_worker(self, main_programs=None, scopes=None):
         logging.warn(
             "You should not call 'run_worker' method for collective mode.")

-    def init_server(self, executor, model_dir=None):
+    def init_server(self, model_dir=None):
         logging.warn(
             "You should not call 'init_server' method for collective mode.")

-    def run_server(self, executor):
+    def run_server(self):
         logging.warn(
             "You should not call 'run_server' method for collective mode.")
@@ -64,29 +47,28 @@ class Collective(Fleet):
         logging.warn(
             "You should not call 'stop_worker' method for collective mode.")

-    def stop(self, executor):
+    def stop(self):
         """
         stop(): will be called after a user finishes his/her training task.
         """
         logging.warn("You should not call 'stop' method for collective mode.")

     def distributed_optimizer(self, optimizer, strategy=None):
-        self.optimizer = CollectiveOptimizer(optimizer, strategy)
-        return self.optimizer
+        self._optimizer = CollectiveOptimizer(optimizer, strategy)
+        return self._optimizer

     def save_inference_model(self,
                              executor,
                              dirname,
                              feeded_var_names=None,
                              target_vars=None,
                              main_program=None,
                              export_for_deployment=True):
         io.save_inference_model(dirname, feeded_var_names, target_vars,
-                                executor, main_program, None, None,
+                                self._executor, main_program, None, None,
                                 export_for_deployment)

-    def save_persistables(self, executor, dirname, main_program=None):
-        io.save_persistables(executor, dirname, main_program, None)
+    def save_persistables(self, dirname, main_program=None):
+        io.save_persistables(self._executor, dirname, main_program, None)


 fleet = Collective()
@@ -143,9 +125,9 @@ class CollectiveOptimizer(DistributedOptimizer):
         optimize_ops, param_grads = self._optimizer.minimize(
             loss, startup_program, parameter_list, no_grad_set)

-        worker_endpoints = fleet.worker_endpoints
-        trainer_id = fleet.current_id
-        current_endpoint = fleet.current_endpoint
+        worker_endpoints = fleet.worker_endpoints()
+        trainer_id = fleet.worker_index()
+        current_endpoint = fleet.worker_endpoints()[trainer_id]

         startup_program = startup_program if startup_program else \
             fluid.framework.default_startup_program
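
The collective hunks above remove the executor argument from the fleet entry points (init, init_worker, run_worker, init_server, run_server, stop, save_persistables) and route saving through the wrapper's internal self._executor. A minimal usage sketch of the reshaped surface follows; the import path, the optimizer choice, and the training loop are assumptions, and only the fleet method names and signatures are taken from the diff.

```python
import paddle.fluid as fluid
# Assumed module path for the `fleet = Collective()` singleton defined above.
from paddle.fluid.incubate.fleet.collective import fleet


def train(loss, dirname):
    # init() now takes only an optional role_maker, no executor.
    fleet.init()

    # distributed_optimizer() wraps the local optimizer in a CollectiveOptimizer.
    optimizer = fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer)
    optimizer.minimize(loss)

    exe = fluid.Executor(fluid.CUDAPlace(0))
    exe.run(fluid.default_startup_program())
    # ... run training iterations with exe.run(...) here ...

    # save_persistables() also lost its executor parameter; per the hunk above
    # it now relies on the fleet instance's own self._executor.
    fleet.save_persistables(dirname)
```

Whether self._executor is already populated at that point is not visible in these hunks, so treat the final save call as illustrative.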

@@ -94,7 +94,7 @@ class DownpourServer(Server):
         Returns:
             return None
         """
-        table = self.server_.downpour_server_param.downpour_table_param.add()
+        table = self._server.downpour_server_param.downpour_table_param.add()
         table.table_id = table_id
         table.table_class = "DownpourDenseTable"
         table.type = pslib.PS_DENSE_TABLE
@@ -169,7 +169,7 @@ class DownpourWorker(Worker):
         Returns:
             return None
         """
-        table = self.worker_.sparse_table.add()
+        table = self._worker.sparse_table.add()
         table.table_id = table_id
         table.slot_key.extend([var.name for var in slot_key_vars])
         table.slot_value.extend([var.name for var in slot_value_vars])
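
These node.py hunks are pure renames: trailing-underscore attributes (self.server_, self.worker_) become leading-underscore ones (self._server, self._worker), matching self._local_ip, self._optimizer, and self._window in the other files of this PR. A tiny illustrative sketch of the convention; the class and field initializers below are placeholders, not the pslib code.

```python
class WorkerNodeSketch(object):
    """Illustration of the attribute-naming convention applied in this PR."""

    def __init__(self, window):
        # before this PR: self.window_ = window, self.worker_ = ...
        self._window = window   # leading underscore marks internal state
        self._worker = None     # would hold the pslib worker descriptor
```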

@@ -66,8 +66,6 @@ class DistributedAdam(DistributedOptimizerImplBase):
         Returns:
             [optimize_ops, grads_and_weights]
         """
-        if not isinstance(losses, list):
-            losses = [losses]

         table_name = find_distributed_lookup_table(losses[0].block.program)
         prefetch_slots = find_distributed_lookup_table_inputs(
@@ -77,7 +75,7 @@ class DistributedAdam(DistributedOptimizerImplBase):
         ps_param = pslib.PSParameter()
         server = DownpourServer()
-        worker = DownpourWorker(self.window_)
+        worker = DownpourWorker(self._window)
         sparse_table_index = 0
         server.add_sparse_table(sparse_table_index, self._learning_rate,
                                 prefetch_slots, prefetch_slots_emb)
@@ -88,17 +86,12 @@ class DistributedAdam(DistributedOptimizerImplBase):
         param_grads_list = []
         for loss_index in range(len(losses)):
-            #program_config = ps_param.trainer_param.program_config.add()
-            #program_config.program_id = str(
-            #    id(losses[loss_index].block.program))
             program_id = str(id(losses[loss_index].block.program))
             program_configs[program_id] = {
                 "pull_sparse": [sparse_table_index],
                 "push_sparse": [sparse_table_index]
             }
-            #program_config.pull_sparse_table_id.extend([sparse_table_index])
-            #program_config.push_sparse_table_id.extend([sparse_table_index])
             params_grads = sorted(
                 fluid.backward.append_backward(losses[loss_index],
                                                parameter_list, no_grad_set),
@@ -130,8 +123,6 @@ class DistributedAdam(DistributedOptimizerImplBase):
                                    params, grads)
             program_configs[program_id]["pull_dense"] = [dense_table_index]
             program_configs[program_id]["push_dense"] = [dense_table_index]
-            #program_config.pull_dense_table_id.extend([dense_table_index])
-            #program_config.push_dense_table_id.extend([dense_table_index])
             if len(data_norm_params) != 0 and len(data_norm_grads) != 0:
                 dense_table_index += 1
                 server.add_data_norm_table(dense_table_index,
@@ -139,18 +130,13 @@ class DistributedAdam(DistributedOptimizerImplBase):
                                            data_norm_params, data_norm_grads)
                 worker.add_dense_table(dense_table_index, self._learning_rate,
                                        data_norm_params, data_norm_grads)
-                #program_config.pull_dense_table_id.extend([dense_table_index])
-                #program_config.push_dense_table_id.extend([dense_table_index])
                 program_configs[program_id]["pull_dense"].extend(
                     [dense_table_index])
                 program_configs[program_id]["push_dense"].extend(
                     [dense_table_index])
             dense_table_index += 1
-            #program_configs.append(program_config)

         ps_param.server_param.CopyFrom(server.get_desc())
         ps_param.trainer_param.CopyFrom(worker.get_desc())
-        #for program_config in program_configs:
-        #    ps_param.trainer_param.program_config.extend([program_config])

         # Todo(guru4elephant): figure out how to support more sparse parameters
         # currently only support lookup_table
         worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
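
With the commented-out protobuf program_config path deleted, the per-program table routing lives entirely in the plain program_configs dict. The hypothetical helper below mirrors the shape DistributedAdam builds above; the function name and the default table indices are illustrative, not part of the API.

```python
def build_program_configs(losses, sparse_table_index=0, dense_table_index=1):
    """Mirror the routing dict built above: one entry per loss program, keyed
    by the program's id(), listing the sparse/dense table ids it pulls/pushes."""
    program_configs = {}
    for loss in losses:
        program_id = str(id(loss.block.program))
        program_configs[program_id] = {
            "pull_sparse": [sparse_table_index],
            "push_sparse": [sparse_table_index],
            "pull_dense": [dense_table_index],
            "push_dense": [dense_table_index],
        }
    return program_configs
```

In the actual hunks the dense entries are filled in later and extended with a data-norm table index when one is added, so this helper only sketches the final layout, not the exact construction order.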
