|
|
@ -142,12 +142,16 @@ class Pod(object):
|
|
|
|
self.addr = None
|
|
|
|
self.addr = None
|
|
|
|
self.port = None
|
|
|
|
self.port = None
|
|
|
|
self.trainers = []
|
|
|
|
self.trainers = []
|
|
|
|
|
|
|
|
self.servers = []
|
|
|
|
|
|
|
|
self.workers = []
|
|
|
|
self.gpus = []
|
|
|
|
self.gpus = []
|
|
|
|
|
|
|
|
|
|
|
|
def __str__(self):
|
|
|
|
def __str__(self):
|
|
|
|
return "rank:{} id:{} addr:{} port:{} visible_gpu:{} trainers:{}".format(
|
|
|
|
return "rank:{} id:{} addr:{} port:{} visible_gpu:{} trainers:{} servers:{} \
|
|
|
|
self.rank, self.id, self.addr, self.port, self.gpus,
|
|
|
|
workers:{}".format(self.rank, self.id, self.addr, self.port,
|
|
|
|
[str(t) for t in self.trainers])
|
|
|
|
self.gpus, [str(t) for t in self.trainers],
|
|
|
|
|
|
|
|
[str(s) for s in self.servers],
|
|
|
|
|
|
|
|
[str(w) for w in self.workers])
|
|
|
|
|
|
|
|
|
|
|
|
def __eq__(self, pod):
|
|
|
|
def __eq__(self, pod):
|
|
|
|
if self.rank != pod.rank or \
|
|
|
|
if self.rank != pod.rank or \
|
|
|
@ -168,6 +172,26 @@ class Pod(object):
|
|
|
|
pod.trainers[i]))
|
|
|
|
pod.trainers[i]))
|
|
|
|
return False
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if len(self.servers) != len(pod.servers):
|
|
|
|
|
|
|
|
logger.debug("servers {} != {}".format(self.servers, pod.servers))
|
|
|
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for i in range(len(self.servers)):
|
|
|
|
|
|
|
|
if self.servers[i] != pod.servers[i]:
|
|
|
|
|
|
|
|
logger.debug("servers {} != {}".format(self.servers[i],
|
|
|
|
|
|
|
|
pod.servers[i]))
|
|
|
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if len(self.workers) != len(pod.workers):
|
|
|
|
|
|
|
|
logger.debug("workers {} != {}".format(self.workers, pod.workers))
|
|
|
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for i in range(len(self.workers)):
|
|
|
|
|
|
|
|
if self.workers[i] != pod.workers[i]:
|
|
|
|
|
|
|
|
logger.debug("workers {} != {}".format(self.workers[i],
|
|
|
|
|
|
|
|
pod.workers[i]))
|
|
|
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
return True
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
|
|
def __ne__(self, pod):
|
|
|
|
def __ne__(self, pod):
|
|
|
@ -303,6 +327,17 @@ def find_free_ports(num):
|
|
|
|
return None
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_ports(num, offset):
    """Return ``num`` ports, either discovered or derived from the environment.

    When the ``FLAGS_START_PORT`` environment variable is unset, the ports
    come from ``find_free_ports(num)``. When it is set, the ports are the
    ``num`` consecutive integers starting at ``FLAGS_START_PORT + offset``.

    Args:
        num: Number of ports requested.
        offset: Added to ``FLAGS_START_PORT`` to obtain the first port; it
            is not used when the variable is unset.

    Returns:
        A list of ports, or ``None`` when ``find_free_ports`` returns
        ``None``.

    Raises:
        ValueError: If ``FLAGS_START_PORT`` is set but is not an integer.
    """
    start_port = os.environ.get('FLAGS_START_PORT')
    if start_port is None:
        ports = find_free_ports(num)
        return None if ports is None else list(ports)

    # Environment values are always strings; convert before doing
    # arithmetic, otherwise `str + int` raises TypeError.
    first_port = int(start_port) + offset
    # Materialize the range so both branches hand back the same type.
    return list(range(first_port, first_port + num))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TrainerProc(object):
|
|
|
|
class TrainerProc(object):
|
|
|
|
def __init__(self):
|
|
|
|
def __init__(self):
|
|
|
|
self.proc = None
|
|
|
|
self.proc = None
|
|
|
|