@@ -30,7 +30,7 @@ class TestDistRunnerBase(object):
             "get_model should be implemented by child classes.")
 
     def get_transpiler(self, trainer_id, main_program, pserver_endpoints,
-                       trainers):
+                       trainers, sync_mode):
         # NOTE: import fluid until runtime, or else forking processes will cause error.
         import paddle
         import paddle.fluid as fluid
@@ -39,17 +39,22 @@ class TestDistRunnerBase(object):
             trainer_id=trainer_id,
             program=main_program,
             pservers=pserver_endpoints,
-            trainers=trainers)
+            trainers=trainers,
+            sync_mode=sync_mode)
         return t
 
-    def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
-                    trainer_id):
+    def run_pserver(self,
+                    pserver_endpoints,
+                    trainers,
+                    current_endpoint,
+                    trainer_id,
+                    sync_mode=True):
         import paddle
         import paddle.fluid as fluid
         self.get_model(batch_size=2)
         t = self.get_transpiler(trainer_id,
                                 fluid.default_main_program(), pserver_endpoints,
-                                trainers)
+                                trainers, sync_mode)
         pserver_prog = t.get_pserver_program(current_endpoint)
         startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
         place = fluid.CPUPlace()
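For context, `sync_mode` maps straight onto `DistributeTranspiler.transpile`'s parameter of the same name: `True` keeps synchronous SGD (pservers aggregate gradients from all trainers before each update), `False` lets each trainer's gradients be applied as they arrive. A minimal standalone sketch of the call above — the endpoints are placeholders, not values from the test:

```python
import paddle.fluid as fluid

# Placeholder endpoints; the real test derives these from self._ps_endpoints.
t = fluid.DistributeTranspiler()
t.transpile(
    trainer_id=0,
    program=fluid.default_main_program(),
    pservers="127.0.0.1:9123,127.0.0.1:9124",
    trainers=2,
    sync_mode=False)  # asynchronous parameter updates
pserver_prog = t.get_pserver_program("127.0.0.1:9123")
```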
@@ -57,7 +62,13 @@ class TestDistRunnerBase(object):
         exe.run(startup_prog)
         exe.run(pserver_prog)
 
-    def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
+    def run_trainer(self,
+                    place,
+                    endpoints,
+                    trainer_id,
+                    trainers,
+                    is_dist=True,
+                    sync_mode=True):
         import paddle
         import paddle.fluid as fluid
         test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
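Both new keyword arguments default to the previous behavior (`is_dist=True`, `sync_mode=True`), so call sites that predate this patch still resolve correctly. A self-contained sketch of how the defaults fall out (the body is a stub, not the real trainer loop):

```python
def run_trainer(place, endpoints, trainer_id, trainers,
                is_dist=True, sync_mode=True):
    # Stub standing in for the real method body; returns the flags
    # so the default resolution is easy to see.
    return is_dist, sync_mode

# Pre-patch style call: both flags keep their defaults.
assert run_trainer(None, "127.0.0.1:1234", 0, 1) == (True, True)
# Post-patch call as made from runtime_main below.
assert run_trainer(None, "127.0.0.1:1234", 0, 1, True, False) == (True, False)
```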
@@ -65,7 +76,7 @@ class TestDistRunnerBase(object):
         if is_dist:
             t = self.get_transpiler(trainer_id,
                                     fluid.default_main_program(), endpoints,
-                                    trainers)
+                                    trainers, sync_mode)
             trainer_prog = t.get_trainer_program()
         else:
             trainer_prog = fluid.default_main_program()
@@ -106,9 +117,9 @@ def runtime_main(test_class):
     import paddle.fluid as fluid
     import paddle.fluid.core as core
 
-    if len(sys.argv) != 7:
+    if len(sys.argv) != 8:
         print(
-            "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
+            "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]"
         )
     role = sys.argv[1]
     endpoints = sys.argv[2]
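The launcher commands therefore grow from seven to eight `argv` tokens. A sketch of the new parsing with illustrative values (the endpoint addresses are placeholders):

```python
import sys

# [script, role, endpoints, trainer_id, current_endpoint, trainers, is_dist, sync_mode]
sys.argv = ["dist_se_resnext.py", "pserver",
            "127.0.0.1:9123,127.0.0.1:9124", "0",
            "127.0.0.1:9123", "2", "TRUE", "FALSE"]

sync_mode = True if sys.argv[7] == "TRUE" else False
assert sync_mode is False  # this process would transpile in async mode
```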
@@ -116,34 +127,43 @@ def runtime_main(test_class):
     current_endpoint = sys.argv[4]
     trainers = int(sys.argv[5])
     is_dist = True if sys.argv[6] == "TRUE" else False
+    sync_mode = True if sys.argv[7] == "TRUE" else False
 
     model = test_class()
     if role == "pserver":
-        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
+        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id,
+                          sync_mode)
     else:
         p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
         ) else fluid.CPUPlace()
-        model.run_trainer(p, endpoints, trainer_id, trainers, is_dist)
+        model.run_trainer(p, endpoints, trainer_id, trainers, is_dist,
+                          sync_mode)
 
 
 import paddle.compat as cpt
 
 
 class TestDistBase(unittest.TestCase):
+    def _setup_config(self):
+        raise NotImplementedError("tests should have _setup_config implemented")
+
     def setUp(self):
         self._trainers = 2
         self._pservers = 2
         self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
         self._python_interp = "python"
+        self._sync_mode = True
+        self._setup_config()
 
     def start_pserver(self, model_file, check_error_log):
+        sync_mode_str = "TRUE" if self._sync_mode else "FALSE"
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
+        ps0_cmd = "%s %s pserver %s 0 %s %d TRUE %s" % \
            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
-            self._trainers)
-        ps1_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
+            self._trainers, sync_mode_str)
+        ps1_cmd = "%s %s pserver %s 0 %s %d TRUE %s" % \
            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
-            self._trainers)
+            self._trainers, sync_mode_str)
 
         ps0_pipe = subprocess.PIPE
         ps1_pipe = subprocess.PIPE
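`_setup_config` is the new extension point: `setUp` assigns the defaults and then invokes the hook, so a concrete test only overrides the fields it cares about. A minimal sketch of an async variant — the subclass name and `delta` value are illustrative, and it assumes the harness's existing `check_with_place(model_file, delta=...)` entry point:

```python
class TestDistModelAsync(TestDistBase):  # hypothetical subclass name
    def _setup_config(self):
        # Only override what differs from the defaults set in setUp().
        self._sync_mode = False

    def test_dist_train(self):
        # Async runs drift from the local baseline, so a loose tolerance
        # is typically passed here; 100 is an illustrative value.
        self.check_with_place("dist_se_resnext.py", delta=100)
```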
@@ -195,9 +215,10 @@ class TestDistBase(unittest.TestCase):
         # Run local to get a base line
         env_local = {"CUDA_VISIBLE_DEVICES": "0"}
         env_local.update(required_envs)
-        local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \
+        sync_mode_str = "TRUE" if self._sync_mode else "FALSE"
+        local_cmd = "%s %s trainer %s 0 %s %d FLASE %s" % \
                     (self._python_interp, model_file,
-                     "127.0.0.1:1234", "127.0.0.1:1234", 1)
+                     "127.0.0.1:1234", "127.0.0.1:1234", 1, sync_mode_str)
         if not check_error_log:
             local_proc = subprocess.Popen(
                 local_cmd.split(" "),
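Note the local baseline still passes `FLASE` (a typo carried over from the original command template) in the `is_dist` slot. It is harmless: `runtime_main` compares strictly against `"TRUE"`, so any other token yields `False` and the local, non-distributed path is taken, as this small check shows:

```python
# How runtime_main interprets the is_dist slot: strict comparison with "TRUE".
for token in ("TRUE", "FALSE", "FLASE"):
    is_dist = True if token == "TRUE" else False
    print(token, "->", is_dist)  # TRUE -> True; FALSE and FLASE -> False
```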
@@ -226,12 +247,12 @@ class TestDistBase(unittest.TestCase):
         self._wait_ps_ready(ps1.pid)
 
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        tr0_cmd = "%s %s trainer %s 0 %s %d TRUE" % \
+        tr0_cmd = "%s %s trainer %s 0 %s %d TRUE %s" % \
            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
-            self._trainers)
-        tr1_cmd = "%s %s trainer %s 1 %s %d TRUE" % \
+            self._trainers, sync_mode_str)
+        tr1_cmd = "%s %s trainer %s 1 %s %d TRUE %s" % \
            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
-            self._trainers)
+            self._trainers, sync_mode_str)
 
         env0 = {"CUDA_VISIBLE_DEVICES": "0"}
         env1 = {"CUDA_VISIBLE_DEVICES": "1"}
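Rendering one of the trainer templates with illustrative values shows the final eight-token command line the test spawns (the model file name and endpoints are placeholders):

```python
tr0_cmd = "%s %s trainer %s 0 %s %d TRUE %s" % \
          ("python", "dist_se_resnext.py",
           "127.0.0.1:9123,127.0.0.1:9124", "127.0.0.1:9123",
           2, "FALSE")
print(tr0_cmd)
# python dist_se_resnext.py trainer 127.0.0.1:9123,127.0.0.1:9124 0 127.0.0.1:9123 2 TRUE FALSE
```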