[paddle.fleet] support multi-node CPU training for fleetrun (#26011)

* support multi-ps training mode for fleetrun; test=develop
revert-24895-update_cub
danleifeng 5 years ago committed by GitHub
parent 0067a2e4ec
commit d5a66fd7a0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

File diff suppressed because it is too large (use "Load Diff" to view it).

@ -142,12 +142,16 @@ class Pod(object):
self.addr = None self.addr = None
self.port = None self.port = None
self.trainers = [] self.trainers = []
self.servers = []
self.workers = []
self.gpus = [] self.gpus = []
def __str__(self): def __str__(self):
return "rank:{} id:{} addr:{} port:{} visible_gpu:{} trainers:{}".format( return "rank:{} id:{} addr:{} port:{} visible_gpu:{} trainers:{} servers:{} \
self.rank, self.id, self.addr, self.port, self.gpus, workers:{}".format(self.rank, self.id, self.addr, self.port,
[str(t) for t in self.trainers]) self.gpus, [str(t) for t in self.trainers],
[str(s) for s in self.servers],
[str(w) for w in self.workers])
def __eq__(self, pod): def __eq__(self, pod):
if self.rank != pod.rank or \ if self.rank != pod.rank or \
@ -168,6 +172,26 @@ class Pod(object):
pod.trainers[i])) pod.trainers[i]))
return False return False
if len(self.servers) != len(pod.servers):
logger.debug("servers {} != {}".format(self.servers, pod.servers))
return False
for i in range(len(self.servers)):
if self.servers[i] != pod.servers[i]:
logger.debug("servers {} != {}".format(self.servers[i],
pod.servers[i]))
return False
if len(self.workers) != len(pod.workers):
logger.debug("workers {} != {}".format(self.workers, pod.workers))
return False
for i in range(len(self.workers)):
if self.workers[i] != pod.workers[i]:
logger.debug("workers {} != {}".format(self.workers[i],
pod.workers[i]))
return False
return True return True
def __ne__(self, pod): def __ne__(self, pod):
@ -303,6 +327,17 @@ def find_free_ports(num):
return None return None
def get_ports(num, offset):
    """Return `num` ports for the launcher to use.

    If the FLAGS_START_PORT environment variable is set, return the
    contiguous range [start+offset, start+offset+num). Otherwise probe
    the OS for free ports via find_free_ports; may return None if no
    free ports could be found.
    """
    if os.environ.get('FLAGS_START_PORT') is None:
        ports = find_free_ports(num)
        if ports is not None:
            ports = list(ports)
    else:
        # Environment variable values are strings; convert before doing
        # arithmetic (the original `start_port + offset` raised TypeError).
        start_port = int(os.environ.get('FLAGS_START_PORT'))
        ports = range(start_port + offset, start_port + offset + num, 1)
    return ports
class TrainerProc(object): class TrainerProc(object):
def __init__(self): def __init__(self):
self.proc = None self.proc = None

@ -10,6 +10,14 @@ function test_launch_ps(){
echo "test pserver launch failed" echo "test pserver launch failed"
exit -1 exit -1
fi fi
fleetrun --servers="120.0.0.1:6780,120.0.0.1:6781" --workers="120.0.0.1:6782,120.0.0.1:6783" fleet_ps_training.py 2> ut.elog
if grep -q "server are killed" ut.elog; then
echo "test pserver launch succeed"
else
echo "test pserver launch failed"
exit -1
fi
} }
if [[ ${WITH_GPU} == "OFF" ]]; then if [[ ${WITH_GPU} == "OFF" ]]; then

Loading…
Cancel
Save