Set fleet_send_batch_size to a default value according to trainer num

(1) Set fleet_send_batch_size to a default value according to the trainer num. The previous value was fixed at 80000; if the trainer num is much smaller or larger than 100, global shuffle may hit a timeout error.

(2) Fix a load_one_table bug: add barriers so that only the first worker loads the table while the other workers wait for it to finish.
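
For intuition on the new default: 800 records per trainer reproduces the old fixed total exactly at 100 trainers, and scales with the cluster everywhere else. A minimal sketch of the arithmetic (the 800 constant comes from this commit; the function name is illustrative):

    # Old behavior: one fixed total, regardless of cluster size.
    OLD_TOTAL = 80000

    # New behavior: scale the send batch with the number of trainers.
    def default_fleet_send_batch_size(trainer_num):
        return 800 * trainer_num

    # At 100 trainers the new default equals the old fixed value.
    assert default_fleet_send_batch_size(100) == OLD_TOTAL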

@@ -235,7 +235,7 @@ class InMemoryDataset(DatasetBase):
         """ Init. """
         super(InMemoryDataset, self).__init__()
         self.proto_desc.name = "MultiSlotInMemoryDataFeed"
-        self.fleet_send_batch_size = 80000
+        self.fleet_send_batch_size = None
         self.queue_num = None
         self.merge_by_lineid = False
@@ -413,6 +413,8 @@ class InMemoryDataset(DatasetBase):
         if fleet is not None:
             fleet._role_maker._barrier_worker()
             trainer_num = fleet.worker_num()
+        if self.fleet_send_batch_size is None:
+            self.fleet_send_batch_size = 800 * trainer_num
         self.dataset.register_client2client_msg_handler()
         self.dataset.set_trainer_num(trainer_num)
         self.dataset.set_fleet_send_batch_size(self.fleet_send_batch_size)
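
With this change, a user who never calls set_fleet_send_batch_size gets the scaled default at shuffle time, while an explicit call still takes precedence because the None check leaves a user-set value untouched. A hedged usage sketch (dataset setup and fleet initialization elided; method names as in this diff):

    import paddle.fluid as fluid

    dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
    # ... set_use_var / set_pipe_command / load_into_memory elided ...

    # Option 1: rely on the new default, 800 * trainer_num,
    # chosen inside global_shuffle.
    dataset.global_shuffle(fleet)

    # Option 2: pick the size explicitly; the None check in
    # global_shuffle then leaves it alone.
    # dataset.set_fleet_send_batch_size(100000)
    # dataset.global_shuffle(fleet)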

@@ -320,11 +320,13 @@ class PSLib(Fleet):
         scope = kwargs.get("scope", None)
         model_proto_file = kwargs.get("model_proto_file", None)
         load_combine = kwargs.get("load_combine", False)
+        self._role_maker._barrier_worker()
         if scope is not None and model_proto_file is not None:
             self._load_one_table_from_paddle_model(
                 scope, table_id, model_path, model_proto_file, load_combine)
-        else:
+        elif self._role_maker.is_first_worker():
             self._fleet_ptr.load_model_one_table(table_id, model_path, mode)
+        self._role_maker._barrier_worker()

     def _load_one_table_from_paddle_model(self,
                                           scope,
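
The two barriers implement a standard first-worker pattern: every worker reaches the load point, exactly one performs the load, and nobody moves on before it finishes. The leading barrier keeps a fast worker from starting the load while others are still in an earlier phase; the trailing one keeps workers from using the table before it is fully loaded. A minimal standalone sketch of the same pattern with Python threads standing in for trainers (illustrative only, not the PSLib API):

    import threading

    def load_on_first_worker(barrier, worker_id, do_load):
        barrier.wait()   # all workers reach the load point first
        if worker_id == 0:
            do_load()    # only the first worker touches the shared table
        barrier.wait()   # the rest wait until the load has finished

    barrier = threading.Barrier(4)
    threads = [threading.Thread(target=load_on_first_worker,
                                args=(barrier, i, lambda: print("table loaded")))
               for i in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()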
