@@ -164,6 +164,17 @@ class TestDistBase(unittest.TestCase):
     def _setup_config(self):
         raise NotImplementedError("tests should have _setup_config implemented")
 
+    def _after_setup_config(self):
+        if self._enforce_place == "CPU":
+            self.__use_cuda = False
+        elif self._enforce_place == "GPU":
+            self.__use_cuda = True
+        else:
+            if fluid.core.is_compiled_with_cuda():
+                self.__use_cuda = True
+            else:
+                self.__use_cuda = False
+
     def setUp(self):
         self._trainers = 2
         self._pservers = 2
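
Note on this hunk: `_after_setup_config` turns the declarative `_enforce_place` into a private `__use_cuda` flag, defaulting to whatever the build supports. A concrete test can now pin itself to one device type from `_setup_config`; a minimal sketch, with a hypothetical subclass name:

    class TestDistMnistCpuOnly(TestDistBase):  # hypothetical example
        def _setup_config(self):
            self._sync_mode = True
            self._enforce_place = "CPU"  # _after_setup_config() then clears __use_cuda

Because of name mangling, `self.__use_cuda` is stored as `_TestDistBase__use_cuda`, so subclasses cannot clobber it by accident.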

@@ -171,11 +182,12 @@
             self._find_free_port(), self._find_free_port())
         self._python_interp = "python"
         self._sync_mode = True
-        self._use_cuda = True
+        self._enforce_place = None
         self._mem_opt = False
         self._use_reduce = False
         self._use_reader_alloc = True
         self._setup_config()
+        self._after_setup_config()
 
     def _find_free_port(self):
         with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
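
The hunk cuts off inside `_find_free_port`; for reference, the bind-to-port-0 idiom it opens with is normally completed like this (a sketch, not the file's verbatim body; `closing` keeps it portable to Python 2, where sockets are not context managers):

    import socket
    from contextlib import closing

    def find_free_port():
        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
            s.bind(('', 0))            # port 0 lets the kernel pick a free port
            return s.getsockname()[1]  # report the port it actually chose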

@@ -199,13 +211,10 @@
             ps0_cmd += " --mem_opt"
             ps1_cmd += " --mem_opt"
 
-        ps0_pipe = subprocess.PIPE
-        ps1_pipe = subprocess.PIPE
-        if check_error_log:
-            print(ps0_cmd)
-            print(ps1_cmd)
-            ps0_pipe = open("/tmp/ps0_err.log", "wb")
-            ps1_pipe = open("/tmp/ps1_err.log", "wb")
+        print(ps0_cmd)
+        print(ps1_cmd)
+        ps0_pipe = open("/tmp/ps0_err.log", "wb")
+        ps1_pipe = open("/tmp/ps1_err.log", "wb")
 
         ps0_proc = subprocess.Popen(
             ps0_cmd.strip().split(" "),
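
Sending the pserver stderr streams to log files unconditionally, rather than leaving `subprocess.PIPE` in the default path, sidesteps a classic hang: a child that fills the OS pipe buffer blocks on its next write until the parent drains the pipe, and this harness never reads the pserver pipes. A sketch of the two behaviors (the command is illustrative):

    import subprocess

    cmd = ["python", "dist_model.py", "--role", "pserver"]  # hypothetical

    # Risky: nothing drains stderr, so a chatty child eventually blocks
    # once the OS pipe buffer fills.
    risky = subprocess.Popen(cmd, stderr=subprocess.PIPE)

    # Safe: a file absorbs any amount of output with no help from the parent.
    err_log = open("/tmp/ps0_err.log", "wb")
    safe = subprocess.Popen(cmd, stderr=err_log)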

@@ -218,10 +227,7 @@
             stderr=ps1_pipe,
             env=required_envs)
 
-        if not check_error_log:
-            return ps0_proc, ps1_proc, None, None
-        else:
-            return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
+        return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
 
     def _wait_ps_ready(self, pid):
         retry_times = 50

@@ -242,7 +248,7 @@
         cmd = "%s %s --role trainer" % (self._python_interp, model)
 
-        if self._use_cuda:
+        if self.__use_cuda:
             cmd += " --use_cuda"
             env_local = {"CUDA_VISIBLE_DEVICES": "0"}
         else:

@@ -250,7 +256,7 @@
         envs.update(env_local)
 
-        if not check_error_log:
+        if check_error_log:
             err_log = open("/tmp/trainer.err.log", "wb")
             local_proc = subprocess.Popen(
                 cmd.split(" "),
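
The flipped guard fixes an inverted condition: the trainer error log was previously captured only when `check_error_log` was False, i.e. exactly when nobody wanted it. The intended shape, sketched with hypothetical variable names:

    if check_error_log:
        err_log = open("/tmp/trainer.err.log", "wb")
        stderr_target = err_log          # keep stderr for post-mortem inspection
    else:
        stderr_target = subprocess.PIPE  # default: discard into a pipe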

@@ -264,7 +270,6 @@
                 stderr=subprocess.PIPE,
                 env=envs)
 
-        local_proc.wait()
         local_out, local_err = local_proc.communicate()
         local_ret = cpt.to_text(local_out)
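
Deleting `local_proc.wait()` ahead of `communicate()` is a deadlock fix, not a cleanup. With a stream on `subprocess.PIPE`, `wait()` can block forever once the child fills the pipe buffer, while `communicate()` drains the pipes to EOF and then reaps the child; the Python docs warn about exactly this. A self-contained sketch of the failure mode:

    import subprocess

    proc = subprocess.Popen(
        ["python", "-c", "print('x' * 10000000)"],  # far more than a pipe buffers
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # proc.wait() here can hang: the child blocks writing, we block waiting.
    out, err = proc.communicate()  # drains both pipes, then waits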

@@ -305,7 +310,7 @@
         if self._use_reader_alloc:
             tr0_cmd += " --use_reader_alloc"
             tr1_cmd += " --use_reader_alloc"
-        if self._use_cuda:
+        if self.__use_cuda:
             tr0_cmd += " --use_cuda"
             tr1_cmd += " --use_cuda"
             env0 = {"CUDA_VISIBLE_DEVICES": "0"}
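
Each trainer receives its own `CUDA_VISIBLE_DEVICES`, so both processes address "device 0" while actually running on different physical GPUs. The general launch pattern, sketched with a hypothetical script name:

    import os
    import subprocess

    procs = []
    for gpu in ("0", "1"):
        env = dict(os.environ, CUDA_VISIBLE_DEVICES=gpu)  # remap one GPU to slot 0
        procs.append(subprocess.Popen(
            ["python", "dist_model.py", "--role", "trainer", "--use_cuda"],
            env=env))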

@@ -317,15 +322,10 @@
         env0.update(envs)
         env1.update(envs)
 
-        FNULL = open(os.devnull, 'w')
-
-        tr0_pipe = subprocess.PIPE
-        tr1_pipe = subprocess.PIPE
-        if check_error_log:
-            print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0))
-            print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1))
-            tr0_pipe = open("/tmp/tr0_err.log", "wb")
-            tr1_pipe = open("/tmp/tr1_err.log", "wb")
+        print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0))
+        print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1))
+        tr0_pipe = open("/tmp/tr0_err.log", "wb")
+        tr1_pipe = open("/tmp/tr1_err.log", "wb")
 
         tr0_proc = subprocess.Popen(
             tr0_cmd.strip().split(" "),

@@ -338,29 +338,22 @@
             stderr=tr1_pipe,
             env=env1)
 
-        tr0_proc.wait()
-        tr1_proc.wait()
         tr0_out, tr0_err = tr0_proc.communicate()
         tr0_loss_text = cpt.to_text(tr0_out)
         tr1_out, tr1_err = tr1_proc.communicate()
         tr1_loss_text = cpt.to_text(tr1_out)
 
         # close trainer file
-        if check_error_log:
-            tr0_pipe.close()
-            tr1_pipe.close()
+        tr0_pipe.close()
+        tr1_pipe.close()
 
-            ps0_pipe.close()
-            ps1_pipe.close()
+        ps0_pipe.close()
+        ps1_pipe.close()
 
-        # FIXME: use terminate() instead of sigkill.
-        os.kill(ps0.pid, signal.SIGKILL)
-        os.kill(ps1.pid, signal.SIGKILL)
         ps0.terminate()
         ps1.terminate()
         ps0.wait()
         ps1.wait()
-        FNULL.close()
 
         # print log
         sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text)
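
With the `os.kill(..., SIGKILL)` lines gone, shutdown flows through the pre-existing `terminate()`/`wait()` pair: the pservers get a SIGTERM they can handle, and `wait()` reaps them so zombies do not accumulate across test cases. If a child might ignore SIGTERM, a common hardening (an assumption here, not something this test does) is to escalate after a timeout:

    import subprocess

    def stop(proc, timeout=10):
        proc.terminate()                # polite SIGTERM first
        try:
            proc.wait(timeout=timeout)  # Python 3.3+: wait with a deadline
        except subprocess.TimeoutExpired:
            proc.kill()                 # escalate only if SIGTERM was ignored
            proc.wait()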

@@ -385,6 +378,7 @@
             "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
             "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
             "FLAGS_cudnn_deterministic": "1",
+            "http_proxy": ""
         }
 
         required_envs.update(need_envs)
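
Finally, pinning `"http_proxy": ""` in the required environment keeps any proxy configured on the CI machine from intercepting the trainers' RPC connections to 127.0.0.1; an empty value effectively unsets the variable for the child processes:

    env = dict(os.environ)
    env["http_proxy"] = ""  # loopback traffic must not be routed via a proxy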