@@ -38,7 +38,14 @@ RUN_STEP = 5
 DEFAULT_BATCH_SIZE = 2
 
 
-def my_print(class_name, log_str):
+def print_to_out(out_losses):
+    if six.PY2:
+        print(pickle.dumps(out_losses))
+    else:
+        sys.stdout.buffer.write(pickle.dumps(out_losses))
+
+
+def print_to_err(class_name, log_str):
     localtime = time.asctime(time.localtime(time.time()))
     print_str = localtime + "\t" + class_name + "\t" + log_str
     if six.PY2:
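
Note (not part of the patch): the two helpers added above split the trainer's output into a machine-readable channel and a human-readable one. print_to_out pickles the collected losses to stdout, which the parent test process captures through a pipe, while print_to_err timestamps progress messages for the stderr log files opened later in this diff. Below is a minimal, self-contained sketch of that round trip; the child snippet and loss values are made up for illustration.

    import pickle
    import subprocess
    import sys

    # Child writes pickled data to stdout, like print_to_out's Python 3 branch.
    child_code = ("import pickle, sys; "
                  "sys.stdout.buffer.write(pickle.dumps([0.5, 0.25]))")
    proc = subprocess.Popen(
        [sys.executable, "-c", child_code],
        stdout=subprocess.PIPE,   # pickled payload comes back on this pipe
        stderr=subprocess.PIPE)   # diagnostics (print_to_err) would land here
    out, _ = proc.communicate()
    print(pickle.loads(out))      # -> [0.5, 0.25]
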
@@ -99,9 +106,9 @@ class TestDistRunnerBase(object):
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(startup_prog)
-        my_print(type(self).__name__, "run pserver startup program done.")
+        print_to_err(type(self).__name__, "run pserver startup program done.")
         exe.run(pserver_prog)
-        my_print(type(self).__name__, "run pserver main program done.")
+        print_to_err(type(self).__name__, "run pserver main program done.")
 
     def run_gpu_fleet_api_trainer(self, args):
         assert args.update_method == "nccl2"
@@ -118,7 +125,7 @@ class TestDistRunnerBase(object):
 
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        my_print("gpu_fleet", "fleet.node_num:")
+        print_to_err("gpu_fleet", "fleet.node_num:")
         #"fleet.node_id:", fleet.node_id(),
         #"fleet.trainer_num:", fleet.worker_num())
 
@@ -154,15 +161,15 @@ class TestDistRunnerBase(object):
             else:
                 return origin_batch
 
-        my_print(type(self).__name__, "begin to train on trainer")
+        print_to_err(type(self).__name__, "begin to train on trainer")
         out_losses = []
         for i in six.moves.xrange(RUN_STEP):
             loss, = exe.run(dist_prog,
                             fetch_list=[avg_cost.name],
                             feed=feeder.feed(get_data()))
             out_losses.append(loss[0])
-            my_print(type(self).__name__, "run step %d finished" % i)
-        my_print(type(self).__name__, "trainer run finished")
+            print_to_err(type(self).__name__, "run step %d finished" % i)
+        print_to_err(type(self).__name__, "trainer run finished")
 
         if six.PY2:
             print(pickle.dumps(out_losses))
@@ -182,7 +189,7 @@ class TestDistRunnerBase(object):
             self.get_model(batch_size=args.batch_size)
 
         if args.update_method == "pserver":
-            my_print(
+            print_to_err(
                 type(self).__name__,
                 "begin to run transpile on trainer with pserver mode")
             t = self.get_transpiler(args.trainer_id,
@@ -190,7 +197,7 @@ class TestDistRunnerBase(object):
                                     args.endpoints, args.trainers,
                                     args.sync_mode, args.dc_asgd)
             trainer_prog = t.get_trainer_program()
-            my_print(
+            print_to_err(
                 type(self).__name__,
                 "get trainer program done with pserver mode.")
         elif args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
@@ -201,7 +208,7 @@ class TestDistRunnerBase(object):
             if args.use_hallreduce:
                 config.use_hierarchical_allreduce = True
                 config.hierarchical_allreduce_inter_nranks = args.hallreduce_inter_nranks
-            my_print(
+            print_to_err(
                 type(self).__name__,
                 "begin to run transpile on trainer with nccl2 mode")
             nccl2_t = fluid.DistributeTranspiler(config=config)
@@ -211,16 +218,16 @@ class TestDistRunnerBase(object):
                 startup_program=fluid.default_startup_program(),
                 trainers=args.endpoints,
                 current_endpoint=args.current_endpoint)
-            my_print(
+            print_to_err(
                 type(self).__name__,
                 "get trainer program done. with nccl2 mode")
             trainer_prog = fluid.default_main_program()
         else:
-            my_print(
+            print_to_err(
                 type(self).__name__,
                 "do nothing about main program, just use it")
             trainer_prog = fluid.default_main_program()
-            my_print(type(self).__name__, "use main program done.")
+            print_to_err(type(self).__name__, "use main program done.")
 
         if args.use_cuda:
             device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
@@ -230,7 +237,7 @@ class TestDistRunnerBase(object):
 
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
-        my_print(type(self).__name__, "run worker startup program done.")
+        print_to_err(type(self).__name__, "run worker startup program done.")
 
         exec_strategy = fluid.ExecutionStrategy()
         exec_strategy.num_threads = 1
@@ -262,12 +269,12 @@ class TestDistRunnerBase(object):
             build_stra.num_trainers = 1
             build_stra.trainer_id = 0
 
-        my_print(type(self).__name__, "begin to compile with data parallel")
+        print_to_err(type(self).__name__, "begin to compile with data parallel")
         binary = compiler.CompiledProgram(trainer_prog).with_data_parallel(
             loss_name=avg_cost.name,
             build_strategy=build_stra,
             exec_strategy=exec_strategy)
-        my_print(type(self).__name__, "program compiled with data parallel")
+        print_to_err(type(self).__name__, "program compiled with data parallel")
 
         feed_var_list = [
             var for var in trainer_prog.global_block().vars.values()
@@ -288,20 +295,17 @@ class TestDistRunnerBase(object):
             else:
                 return origin_batch
 
-        my_print(type(self).__name__, "begin to train on trainer")
+        print_to_err(type(self).__name__, "begin to train on trainer")
         out_losses = []
         for i in six.moves.xrange(RUN_STEP):
             loss, = exe.run(binary,
                             fetch_list=[avg_cost.name],
                             feed=feeder.feed(get_data()))
             out_losses.append(loss[0])
-            my_print(type(self).__name__, "run step %d finished" % i)
-        my_print(type(self).__name__, "trainer run finished")
+            print_to_err(type(self).__name__, "run step %d finished" % i)
+        print_to_err(type(self).__name__, "trainer run finished")
 
-        if six.PY2:
-            print(pickle.dumps(out_losses))
-        else:
-            sys.stdout.buffer.write(pickle.dumps(out_losses))
+        print_to_out(out_losses)
 
 
 class TestParallelDyGraphRunnerBase(object):
@@ -344,23 +348,23 @@ class TestParallelDyGraphRunnerBase(object):
             strategy.local_rank = args.trainer_id
             strategy.trainer_endpoints = args.endpoints.split(",")
             strategy.current_endpoint = args.current_endpoint
-            my_print(
+            print_to_err(
                 type(self).__name__,
                 "begin to prepare context in dygraph with nccl2")
             dygraph.parallel.prepare_context(strategy)
             model = dygraph.parallel.DataParallel(model, strategy)
-            my_print(type(self).__name__, "model built in dygraph")
+            print_to_err(type(self).__name__, "model built in dygraph")
         out_losses = []
-        my_print(type(self).__name__, "begin to run dygraph training")
+        print_to_err(type(self).__name__, "begin to run dygraph training")
         for step_id, data in enumerate(train_reader()):
             data = _get_data(data)
             if step_id == RUN_STEP:
                 break
             loss = self.run_one_loop(model, opt, data)
             if step_id % 10 == 0:
-                my_print(
+                print_to_err(
                     type(self).__name__,
-                    "loss at step %d: %f" % (step_id, loss))
+                    "loss at step %d: %f" % (step_id, loss.numpy()))
             out_losses.append(loss.numpy())
 
             # FIXME(Yancey1989): scale the loss inplace
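
Note (not part of the patch): besides the rename, the hunk above formats loss.numpy() instead of the raw dygraph variable, since "%f" wants a plain numeric value rather than a fluid Variable. A hedged, self-contained sketch of the working form; the fluid dygraph calls mirror those in this file and the tensor value is made up for illustration.

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        loss = fluid.dygraph.to_variable(np.array([0.125], dtype="float32"))
        # loss is a dygraph Variable; loss.numpy() is a size-1 ndarray that "%f" can format.
        print("loss at step %d: %f" % (0, loss.numpy()))
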
@@ -373,7 +377,7 @@ class TestParallelDyGraphRunnerBase(object):
 
             opt.minimize(loss)
             model.clear_gradients()
-        my_print(type(self).__name__, pickle.dumps(out_losses))
+        print_to_out(out_losses)
 
 
 def runtime_main(test_class):
@@ -483,7 +487,7 @@ class TestDistBase(unittest.TestCase):
             with closing(socket.socket(socket.AF_INET,
                                        socket.SOCK_STREAM)) as s:
                 s.bind(('', 0))
-                my_print(
+                print_to_err(
                     type(self).__name__, "socket name: %s" % s.getsockname()[1])
                 return s.getsockname()[1]
 
@@ -519,13 +523,13 @@ class TestDistBase(unittest.TestCase):
         ps0_pipe = open("/tmp/ps0_err.log", "wb")
         ps1_pipe = open("/tmp/ps1_err.log", "wb")
 
-        my_print(type(self).__name__, "going to start pserver process 0")
+        print_to_err(type(self).__name__, "going to start pserver process 0")
         ps0_proc = subprocess.Popen(
             ps0_cmd.strip().split(" "),
             stdout=subprocess.PIPE,
             stderr=ps0_pipe,
             env=required_envs)
-        my_print(type(self).__name__, "going to start pserver process 1")
+        print_to_err(type(self).__name__, "going to start pserver process 1")
         ps1_proc = subprocess.Popen(
             ps1_cmd.strip().split(" "),
             stdout=subprocess.PIPE,
@@ -641,13 +645,13 @@ class TestDistBase(unittest.TestCase):
         tr0_pipe = open("/tmp/tr0_err.log", "wb")
         tr1_pipe = open("/tmp/tr1_err.log", "wb")
 
-        my_print(type(self).__name__, "going to start trainer process 0")
+        print_to_err(type(self).__name__, "going to start trainer process 0")
        tr0_proc = subprocess.Popen(
            tr0_cmd.strip().split(" "),
            stdout=subprocess.PIPE,
            stderr=tr0_pipe,
            env=env0)
-        my_print(type(self).__name__, "going to start trainer process 1")
+        print_to_err(type(self).__name__, "going to start trainer process 1")
        tr1_proc = subprocess.Popen(
            tr1_cmd.strip().split(" "),
            stdout=subprocess.PIPE,
@@ -761,7 +765,7 @@ class TestDistBase(unittest.TestCase):
 
             tr_pipe = open("/tmp/tr{}_err.log".format(i), "wb")
 
-            my_print(
+            print_to_err(
                 type(self).__name__,
                 "going to start process {} with nccl2".format(i))
             tr_proc = subprocess.Popen(