# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
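"""Base classes and helpers for PaddlePaddle distributed-training unit tests.

The runner classes defined here are executed in subprocesses spawned by
TestDistBase: they train a small model for a few steps and pickle the
per-step losses to stdout so the parent test can compare local and
distributed results.
"""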
from __future__ import print_function
import time

import ast
import unittest
import os
import sys
import signal
import subprocess
import six
import argparse
import pickle
import random
import numpy as np

import paddle
import paddle.fluid as fluid
from paddle.fluid import compiler
import paddle.fluid.dygraph as dygraph
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph.parallel import DataParallel

from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
import paddle.fluid.incubate.fleet.base.role_maker as role_maker

RUN_STEP = 5
DEFAULT_BATCH_SIZE = 2
DIST_UT_PORT = 0


def print_to_out(out_losses):
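    """Pickle the recorded losses to stdout so the parent test process can read them."""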
    if six.PY2:
        print(pickle.dumps(out_losses))
    else:
        sys.stdout.buffer.write(pickle.dumps(out_losses))


def print_to_err(class_name, log_str):
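    """Write a timestamped, pickled progress message to stderr for debugging."""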
    localtime = time.asctime(time.localtime(time.time()))
    print_str = localtime + "\t" + class_name + "\t" + log_str
    if six.PY2:
        sys.stderr.write(pickle.dumps(print_str))
    else:
        sys.stderr.buffer.write(pickle.dumps(print_str))


def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)


class TestDistRunnerBase(object):
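    """Static-graph (fluid) runner executed inside trainer/pserver subprocesses.

    Subclasses implement get_model(); runtime_main() dispatches to one of the
    run_* methods below according to the parsed command-line arguments.
    """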
    def get_model(self,
                  batch_size=DEFAULT_BATCH_SIZE,
                  lr=0.1,
                  single_device=False,
                  use_dgc=False):
        raise NotImplementedError(
            "get_model should be implemented by child classes.")

    @staticmethod
    def get_transpiler(trainer_id,
                       main_program,
                       pserver_endpoints,
                       trainers,
                       sync_mode,
                       dc_asgd=False,
                       current_endpoint=None,
                       nccl_comm_num=1,
                       hogwild_mode=False):
        # NOTE: import fluid until runtime, or else forking processes will cause error.
        config = fluid.DistributeTranspilerConfig()
        config.enable_dc_asgd = dc_asgd
        config.sync_mode = sync_mode
        config.runtime_split_send_recv = hogwild_mode

        if nccl_comm_num > 1:
            config.nccl_comm_num = nccl_comm_num
        # config.runtime_split_send_recv = True
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id=trainer_id,
            program=main_program,
            pservers=pserver_endpoints,
            trainers=trainers,
            sync_mode=sync_mode,
            current_endpoint=current_endpoint)
        return t

    def run_pserver(self, args):
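        """Transpile for the pserver role and run the parameter-server program."""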
        self.lr = args.lr
        self.get_model(batch_size=args.batch_size)
        # NOTE: pserver should not call memory optimize

        t = self.get_transpiler(
            trainer_id=args.trainer_id,
            main_program=fluid.default_main_program(),
            pserver_endpoints=args.endpoints,
            trainers=args.trainers,
            sync_mode=args.sync_mode,
            dc_asgd=args.dc_asgd,
            hogwild_mode=args.hogwild)
        pserver_prog = t.get_pserver_program(args.current_endpoint)
        startup_prog = t.get_startup_program(args.current_endpoint,
                                             pserver_prog)

        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_prog)
        print_to_err(type(self).__name__, "run pserver startup program done.")
        exe.run(pserver_prog)
        print_to_err(type(self).__name__, "run pserver main program done.")

    def run_pipeline_trainer(self, args):
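        """Run the trainer used by pipeline tests: feed data through a DataLoader
        and record the loss for RUN_STEP steps, optionally saving the model."""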
        self.lr = args.lr

        dist_strategy = DistributedStrategy()
        test_program, avg_cost, train_reader, test_reader, batch_acc, predict, data_loader = \
            self.get_model(batch_size=args.batch_size, dist_strategy=dist_strategy)

        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        eprint(type(self).__name__, "device_id: %d." % device_id)
        place = fluid.CUDAPlace(device_id)

        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        eprint(type(self).__name__, "run worker startup program done.")

        data_loader.set_sample_list_generator(train_reader, place)
        data_loader.start()
        print_to_err(type(self).__name__, "begin to train on trainer")
        out_losses = []
        for i in six.moves.xrange(RUN_STEP):
            loss = exe.run(fluid.default_main_program(), fetch_list=[avg_cost])
            loss = loss[0] if loss else None
            out_losses.append(loss)
            print_to_err(type(self).__name__, "run step %d finished" % i)
        print_to_err(type(self).__name__, "trainer run finished")

        if six.PY2:
            print(pickle.dumps(out_losses))
        else:
            sys.stdout.buffer.write(pickle.dumps(out_losses))

        if args.save_model:
            model_save_dir = "/tmp"
            if fleet.worker_index() == 0:
                model_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_persistables")
                model_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_persistables")
                infer_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_infer")
                infer_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_infer")
            else:
                model_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_persistables_2")
                model_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_persistables_2")
                infer_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_infer_2")
                infer_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_infer_2")
            fluid.io.save_persistables(exe, model_save_dir_fluid,
                                       fleet._origin_program)
            fleet.save_persistables(executor=exe, dirname=model_save_dir_fleet)
            # NOTE: feed_var_list is not built earlier in this method, so
            # collect the data variables from the origin program here
            # (mirrors run_use_fleet_api_trainer).
            feed_var_list = [
                var for var in fleet._origin_program.global_block().vars.values()
                if var.is_data
            ]
            feeded_var_names = [var.name for var in feed_var_list]
            fluid.io.save_inference_model(infer_save_dir_fluid,
                                          feeded_var_names, [avg_cost], exe,
                                          fleet._origin_program)
            fleet.save_inference_model(exe, infer_save_dir_fleet,
                                       feeded_var_names, [avg_cost])

    def run_use_fleet_api_trainer(self, args):
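        """Run the trainer through the static-graph fleet collective API (nccl2/bkcl)."""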
        assert args.update_method in ["nccl2", "bkcl"]

        self.lr = args.lr

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = 1

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.fuse_memory_size = 1  # MB
        dist_strategy.fuse_laryer_size = 1
        if args.use_local_sgd:
            dist_strategy.use_local_sgd = True
        if args.ut4grad_allreduce:
            dist_strategy._ut4grad_allreduce = True
        if args.sync_batch_norm:
            dist_strategy.sync_batch_norm = True

        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        print_to_err("use_fleet", "fleet.node_num:")
        # "fleet.node_id:", fleet.node_id(),
        # "fleet.trainer_num:", fleet.worker_num())

        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=args.batch_size, dist_strategy=dist_strategy)

        trainer_prog = fleet._origin_program
        dist_prog = fleet.main_program

        if fluid.core.is_compiled_with_cuda():
            device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
            place = fluid.CUDAPlace(device_id)
        elif fluid.core.is_compiled_with_xpu():
            device_id = int(os.getenv("FLAGS_selected_xpus", "0"))
            place = fluid.XPUPlace(device_id)
        else:
            raise ValueError(
                "fleet dygraph api must be run with paddlepaddle-xpu or paddlepaddle-gpu."
            )

        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        eprint(type(self).__name__, "run worker startup program done.")

        feed_var_list = [
            var for var in trainer_prog.global_block().vars.values()
            if var.is_data
        ]

        eprint("feed_var_list:", feed_var_list)

        # tmp add this code to pass python35 gcc8 CI
        # Fixme(gongweibao, wangxi), need fix fleet api program order
        if feed_var_list[0].name == 'label':
            feed_var_list = feed_var_list[::-1]

        feeder = fluid.DataFeeder(feed_var_list, place)
        reader_generator = train_reader()

        def get_data():
            origin_batch = next(reader_generator)
            if args.update_method != "local" and args.use_reader_alloc:
                new_batch = []
                for offset, item in enumerate(origin_batch):
                    if offset % 2 == args.trainer_id:
                        new_batch.append(item)
                return new_batch
            else:
                return origin_batch

        print_to_err(type(self).__name__, "begin to train on trainer")
        out_losses = []
        for i in six.moves.xrange(RUN_STEP):
            loss, = exe.run(dist_prog,
                            fetch_list=[avg_cost.name],
                            feed=feeder.feed(get_data()))
            out_losses.append(loss[0])
            print_to_err(type(self).__name__, "run step %d finished" % i)
        print_to_err(type(self).__name__, "trainer run finished")

        if six.PY2:
            print(pickle.dumps(out_losses))
        else:
            sys.stdout.buffer.write(pickle.dumps(out_losses))

        if args.save_model:
            model_save_dir = "/tmp"
            if fleet.worker_index() == 0:
                model_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_persistables")
                model_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_persistables")
                infer_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_infer")
                infer_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_infer")
            else:
                model_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_persistables_2")
                model_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_persistables_2")
                infer_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_infer_2")
                infer_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_infer_2")
            fluid.io.save_persistables(exe, model_save_dir_fluid,
                                       fleet._origin_program)
            fleet.save_persistables(executor=exe, dirname=model_save_dir_fleet)
            feeded_var_names = [var.name for var in feed_var_list]
            fluid.io.save_inference_model(infer_save_dir_fluid,
                                          feeded_var_names, [avg_cost], exe,
                                          fleet._origin_program)
            fleet.save_inference_model(exe, infer_save_dir_fleet,
                                       feeded_var_names, [avg_cost])

    def run_trainer(self, args):
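        """Run the trainer in local, pserver-transpiler, or nccl2 mode and report
        RUN_STEP losses via print_to_out."""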
        self.lr = args.lr
        if args.nccl2_reduce_layer_local_run:
            test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
                self.get_model(batch_size=args.batch_size, single_device=True)
        elif args.use_dgc:
            test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
                self.get_model(batch_size=args.batch_size, use_dgc=args.use_dgc)
        else:
            test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
                self.get_model(batch_size=args.batch_size)

        if args.update_method == "pserver":
            print_to_err(
                type(self).__name__,
                "begin to run transpile on trainer with pserver mode")
            t = self.get_transpiler(
                trainer_id=args.trainer_id,
                main_program=fluid.default_main_program(),
                pserver_endpoints=args.endpoints,
                trainers=args.trainers,
                sync_mode=args.sync_mode,
                dc_asgd=args.dc_asgd,
                hogwild_mode=args.hogwild)

            trainer_prog = t.get_trainer_program()
            print_to_err(
                type(self).__name__,
                "get trainer program done with pserver mode.")
        elif args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
            # transpile for nccl2
            config = fluid.DistributeTranspilerConfig()
            config.mode = "nccl2"
            config.nccl_comm_num = args.nccl_comm_num
            if args.use_hallreduce:
                config.use_hierarchical_allreduce = True
                config.hierarchical_allreduce_inter_nranks = args.hallreduce_inter_nranks
            print_to_err(
                type(self).__name__,
                "begin to run transpile on trainer with nccl2 mode")
            nccl2_t = fluid.DistributeTranspiler(config=config)
            nccl2_t.transpile(
                args.trainer_id,
                program=fluid.default_main_program(),
                startup_program=fluid.default_startup_program(),
                trainers=args.endpoints,
                current_endpoint=args.current_endpoint)
            print_to_err(
                type(self).__name__,
                "get trainer program done with nccl2 mode.")
            trainer_prog = fluid.default_main_program()
        else:
            print_to_err(
                type(self).__name__,
                "do nothing about main program, just use it")
            trainer_prog = fluid.default_main_program()
            print_to_err(type(self).__name__, "use main program done.")

        # FIXME(gongwb): wait for pserver initialization.
        time.sleep(1)

        if args.use_cuda:
            device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
            place = fluid.CUDAPlace(device_id)
        else:
            place = fluid.CPUPlace()

        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        print_to_err(type(self).__name__, "run worker startup program done.")

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = 1

        build_stra = fluid.BuildStrategy()
        # FIXME force disable enable_inplace and memory_optimize
        build_stra.enable_inplace = False
        build_stra.memory_optimize = False

        if args.fuse_all_reduce is not None:
            sys.stderr.write('fuse_all_reduce={}'.format(args.fuse_all_reduce))
            build_stra.fuse_all_reduce_ops = args.fuse_all_reduce

        if args.hogwild:
            build_stra.async_mode = True

        if args.enable_backward_deps:
            build_stra.enable_backward_optimizer_op_deps = True

        if args.use_reduce:
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
        else:
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

        pass_builder = None
        if args.batch_merge_repeat > 1:
            pass_builder = build_stra._finalize_strategy_and_create_passes()
            mypass = pass_builder.insert_pass(0, "multi_batch_merge_pass")
            mypass.set("num_repeats", args.batch_merge_repeat)

        if args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
            build_stra.num_trainers = len(args.endpoints.split(","))
            build_stra.trainer_id = args.trainer_id
        else:
            # case args.update_method == "nccl2_reduce_layer":
            build_stra.num_trainers = 1
            build_stra.trainer_id = 0

        print_to_err(type(self).__name__, "begin to compile with data parallel")
        binary = compiler.CompiledProgram(trainer_prog).with_data_parallel(
            loss_name=avg_cost.name,
            build_strategy=build_stra,
            exec_strategy=exec_strategy)
        print_to_err(type(self).__name__, "program compiled with data parallel")

        feed_var_list = [
            var for var in trainer_prog.global_block().vars.values()
            if var.is_data
        ]

        feeder = fluid.DataFeeder(feed_var_list, place)
        reader_generator = train_reader()

        def get_data():
            origin_batch = next(reader_generator)
            if args.update_method != "local" and args.use_reader_alloc:
                new_batch = []
                for offset, item in enumerate(origin_batch):
                    if offset % 2 == args.trainer_id:
                        new_batch.append(item)
                return new_batch
            else:
                return origin_batch

        print_to_err(type(self).__name__, "begin to train on trainer")
        out_losses = []
        for i in six.moves.xrange(RUN_STEP):
            loss, = exe.run(binary,
                            fetch_list=[avg_cost.name],
                            feed=feeder.feed(get_data()))
            out_losses.append(loss[0])
            print_to_err(type(self).__name__, "run step %d finished" % i)
        print_to_err(type(self).__name__, "trainer run finished")

        print_to_out(out_losses)


class TestParallelDyGraphRunnerBase(object):
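    """Dygraph (imperative) counterpart of TestDistRunnerBase.

    Subclasses implement get_model() and run_one_loop(); the run_* methods
    below drive single-process, data-parallel, and fleet-API training.
    """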
    def get_model(self):
        raise NotImplementedError(
            "get_model should be implemented by child classes.")

    def run_one_loop(self, model, opt, data):
        raise NotImplementedError(
            "run_one_loop should be implemented by the child classes.")

    def _get_data(self, batch, args):
        if args.update_method != "local":
            new_batch = []
            for offset, item in enumerate(batch):
                if offset % 2 == args.trainer_id:
                    new_batch.append(item)
            return new_batch
        else:
            return batch

    def run_trainer(self, args):
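        """Train in dygraph mode, wrapping the model with DataParallel when the
        update method is nccl2 or bkcl."""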
        seed = 90
        if fluid.core.is_compiled_with_cuda():
            device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
            place = fluid.CUDAPlace(device_id)
        elif fluid.core.is_compiled_with_xpu():
            device_id = int(os.getenv("FLAGS_selected_xpus", "0"))
            place = fluid.XPUPlace(device_id)
        else:
            raise ValueError("Only support CUDAPlace or XPUPlace for now.")

        with fluid.dygraph.guard(place):
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            np.random.seed(seed)
            import random
            random.seed(seed)
            model, train_reader, opt = self.get_model()
            nranks = len(args.endpoints.split(",")) if args.endpoints else 1

            #if args.update_method == "nccl2":
            if args.update_method == "nccl2" or args.update_method == "bkcl":
                strategy = dygraph.parallel.ParallelStrategy()
                strategy.nranks = nranks
                strategy.local_rank = args.trainer_id
                strategy.trainer_endpoints = args.endpoints.split(",")
                strategy.current_endpoint = args.current_endpoint
                print_to_err(
                    type(self).__name__,
                    "begin to prepare context in dygraph with nccl2")
                dygraph.parallel.prepare_context(strategy)
                model = dygraph.parallel.DataParallel(model, strategy)
                print_to_err(type(self).__name__, "model built in dygraph")
            out_losses = []
            print_to_err(type(self).__name__, "begin to run dygraph training")
            for step_id, data in enumerate(train_reader()):
                data = self._get_data(data, args)
                if step_id == RUN_STEP:
                    break
                loss = self.run_one_loop(model, opt, data)
                if step_id % 10 == 0:
                    print_to_err(
                        type(self).__name__,
                        "loss at step %d: %f" % (step_id, loss.numpy()))
                out_losses.append(loss.numpy())

                loss.backward()

                opt.minimize(loss)
                if not args.accumulate_gradient:
                    model.clear_gradients()
            print_to_out(out_losses)

    def run_trainer_with_spawn(self, args):
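        """Trainer entry used by paddle.distributed.spawn tests; returns the
        losses instead of printing them."""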
        # 1. enable dygraph
        paddle.disable_static()

        # 2. init seed
        seed = 90
        paddle.static.default_startup_program().random_seed = seed
        paddle.static.default_main_program().random_seed = seed
        np.random.seed(seed)
        random.seed(seed)
        # get trainer id
        args.trainer_id = paddle.distributed.get_rank()

        # 3. init parallel env
        if args.update_method == "nccl2":
            paddle.distributed.init_parallel_env()

        # 4. train model
        model, train_reader, opt = self.get_model()
        if args.update_method == "nccl2":
            model = paddle.DataParallel(model)

        out_losses = []
        for step_id, data in enumerate(train_reader()):
            data = self._get_data(data, args)
            if step_id == RUN_STEP:
                break
            loss = self.run_one_loop(model, opt, data)
            out_losses.append(loss.numpy())

            loss.backward()

            opt.minimize(loss)
            model.clear_gradients()
        return out_losses

    def run_use_fleet_api_trainer(self, args):
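        """Train in dygraph mode through paddle.distributed.fleet
        (distributed_model / distributed_optimizer)."""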
        import paddle.distributed.fleet as fleet
        import paddle.distributed.fleet.base.role_maker as role_maker
        # 1. enable dygraph
        paddle.disable_static()

        # 2. init seed
        seed = 90
        paddle.static.default_startup_program().random_seed = seed
        paddle.static.default_main_program().random_seed = seed
        np.random.seed(seed)
        random.seed(seed)
        # get trainer id
        args.trainer_id = paddle.distributed.get_rank()

        # 3. init parallel env
        if args.update_method in ["nccl2", "bkcl"]:
            fleet.init(is_collective=True)

        # 4. train model
        model, train_reader, opt = self.get_model()
        if args.update_method in ["nccl2", "bkcl"]:
            opt = fleet.distributed_optimizer(opt)
            model = fleet.distributed_model(model)

        out_losses = []
        for step_id, data in enumerate(train_reader()):
            data = self._get_data(data, args)
            if step_id == RUN_STEP:
                break
            loss = self.run_one_loop(model, opt, data)
            out_losses.append(loss.numpy())

            loss.backward()

            opt.step()
            if not args.accumulate_gradient:
                opt.clear_grad()
        print_to_out(out_losses)


def runtime_main(test_class):
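    """Parse the command-line arguments passed by TestDistBase and dispatch to
    the matching run_* method of test_class; this is the entry point executed
    in each spawned subprocess."""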
    parser = argparse.ArgumentParser(description='Run dist test.')
    parser.add_argument(
        '--role', type=str, required=True, choices=['pserver', 'trainer'])
    parser.add_argument('--endpoints', type=str, required=False, default="")
    parser.add_argument(
        '--update_method',
        type=str,
        default="local",
        choices=["pserver", "nccl2", "bkcl", "local", "nccl2_reduce_layer"])
    parser.add_argument('--trainer_id', type=int, required=False, default=0)
    parser.add_argument('--trainers', type=int, required=False, default=1)
    parser.add_argument('--nccl_comm_num', type=int, required=False, default=1)
    parser.add_argument('--enable_backward_deps', action='store_true')
    parser.add_argument('--use_hallreduce', action='store_true')
    parser.add_argument('--use_pipeline', action='store_true')
    parser.add_argument('--use_fleet_api', action='store_true')
    parser.add_argument('--use_local_sgd', action='store_true')
    parser.add_argument('--ut4grad_allreduce', action='store_true')
    parser.add_argument(
        '--hallreduce_inter_nranks', type=int, required=False, default=2)
    parser.add_argument(
        '--current_endpoint', type=str, required=False, default="")
    parser.add_argument('--sync_mode', action='store_true')
    parser.add_argument('--use_cuda', action='store_true')
    parser.add_argument('--use_xpu', action='store_true')
    parser.add_argument('--use_dgc', action='store_true')
    parser.add_argument('--accumulate_gradient', action='store_true')
    parser.add_argument('--use_reduce', action='store_true')
    parser.add_argument('--dc_asgd', action='store_true')
    parser.add_argument('--hogwild', action='store_true')
    parser.add_argument('--save_model', action='store_true')
    parser.add_argument(
        '--use_reader_alloc', action='store_true', required=False)
    parser.add_argument('--batch_size', required=False, type=int, default=2)
    parser.add_argument('--lr', required=False, type=float, default=0.001)
    parser.add_argument(
        '--batch_merge_repeat', required=False, type=int, default=1)
    parser.add_argument(
        '--nccl2_reduce_layer_local_run',
        required=False,
        type=bool,
        default=False)
    parser.add_argument('--sync_batch_norm', action='store_true')
    parser.add_argument(
        '--fuse_all_reduce',
        required=False,
        type=ast.literal_eval,
        default=None)

    args = parser.parse_args()

    model = test_class()
    if args.role == "pserver" and args.update_method == "pserver":
        model.run_pserver(args)
    elif args.use_fleet_api:
        model.run_use_fleet_api_trainer(args)
    elif args.use_pipeline:
        model.run_pipeline_trainer(args)
    else:
        model.run_trainer(args)


import paddle.compat as cpt
import socket
from contextlib import closing


class TestDistBase(unittest.TestCase):
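    """Test driver that launches local and distributed training subprocesses
    and asserts their per-step losses agree within a tolerance."""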
    def _setup_config(self):
        raise NotImplementedError("tests should have _setup_config implemented")

    def _after_setup_config(self):
        if self._enforce_place == "CPU":
            self.__use_cuda = False
            self.__use_xpu = False
            self._use_dgc = False
        elif self._enforce_place == "GPU":
            self.__use_cuda = True
            self.__use_xpu = False
        elif self._enforce_place == "XPU":
            self.__use_cuda = False
            self.__use_xpu = True
            self._use_dgc = False
        else:
            if fluid.core.is_compiled_with_cuda():
                self.__use_cuda = True
            else:
                self.__use_cuda = False
                self._use_dgc = False

        if self._use_reduce:
            assert not self._use_dgc

    def setUp(self):
        self._trainers = 2
        self._pservers = 2
        self._port_set = set()
        self._python_interp = sys.executable
        self._sync_mode = True
        self._hogwild_mode = False
        self._enforce_place = None
        self._use_reduce = False
        self._dc_asgd = False  # must use with async mode
        self._use_reader_alloc = True
        self._nccl2_mode = False
        self._bkcl_mode = False
        self._pipeline_mode = False
        self._mp_mode = False
        # FIXME(typhoonzero): I added this stupid argument to enable
        # testing allreduce layers, which users can call layers.allreduce
        # to accumulate tensors anywhere. Find a better way to do this
        # test, and reduce the checks of this argument everywhere.
        self._nccl2_reduce_layer = False
        self._lr = 0.001
        self._use_dgc = False
        self._dygraph = False
        self._nccl_comm_num = 1
        self._enable_backward_deps = False
        self._use_fleet_api = False
        self._use_local_sgd = False
        self._ut4grad_allreduce = False
        self._use_hallreduce = False
        self._save_model = False
        self._fuse_all_reduce = None
        self._accumulate_gradient = False
        self._setup_config()

        global DIST_UT_PORT
        if DIST_UT_PORT == 0 and os.getenv("PADDLE_DIST_UT_PORT"):
            DIST_UT_PORT = int(os.getenv("PADDLE_DIST_UT_PORT"))

        if DIST_UT_PORT == 0:
            self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
                self._find_free_port(), self._find_free_port())
        else:
            print("set begin_port:", DIST_UT_PORT)
            self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
                DIST_UT_PORT, DIST_UT_PORT + 1)
            DIST_UT_PORT += 2

        self._after_setup_config()

    def _find_free_port(self):
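        """Return a free TCP port that has not yet been handed out by this test."""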
        def __free_port():
            with closing(socket.socket(socket.AF_INET,
                                       socket.SOCK_STREAM)) as s:
                s.bind(('', 0))
                print_to_err(
                    type(self).__name__, "socket name: %s" % s.getsockname()[1])
                return s.getsockname()[1]

        while True:
            port = __free_port()
            if port not in self._port_set:
                self._port_set.add(port)
                return port

    def start_pserver(self,
                      model_file,
                      check_error_log,
                      required_envs,
                      log_name=""):
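        """Start the two parameter-server subprocesses and return their process
        handles and stderr log files."""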
        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
        ps_cmd = "%s"

        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
            required_envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
            ps_cmd += " -m coverage run --branch -p"

        ps_cmd += " %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --update_method pserver"

        ps0_cmd = ps_cmd % \
                  (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
                   self._trainers)
        ps1_cmd = ps_cmd % \
                  (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
                   self._trainers)

        if self._sync_mode:
            ps0_cmd += " --sync_mode"
            ps1_cmd += " --sync_mode"

        print(ps0_cmd)
        print(ps1_cmd)
        ps0_pipe = open(log_name + "_ps0_err.log", "wb")
        ps1_pipe = open(log_name + "_ps1_err.log", "wb")

        print_to_err(type(self).__name__, "going to start pserver process 0")
        ps0_proc = subprocess.Popen(
            ps0_cmd.strip().split(" "),
            stdout=subprocess.PIPE,
            stderr=ps0_pipe,
            env=required_envs)
        print_to_err(type(self).__name__, "going to start pserver process 1")
        ps1_proc = subprocess.Popen(
            ps1_cmd.strip().split(" "),
            stdout=subprocess.PIPE,
            stderr=ps1_pipe,
            env=required_envs)

        return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe

    def _run_local(self,
                   model,
                   envs,
                   check_error_log=False,
                   batch_size=DEFAULT_BATCH_SIZE,
                   batch_merge_repeat=1,
                   log_name="",
                   devices="0"):
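        """Run the model in a single local process and return its pickled
        per-step losses."""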
        cmd = self._python_interp

        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
            envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
            cmd += " -m coverage run --branch -p"

        cmd += " %s --role trainer --update_method local --lr %f" % (model,
                                                                     self._lr)

        if batch_size != DEFAULT_BATCH_SIZE:
            cmd += " --batch_size %d" % batch_size
        if batch_merge_repeat > 1:
            cmd += " --batch_merge_repeat %d" % batch_merge_repeat
        if self._nccl2_reduce_layer:
            cmd += " --nccl2_reduce_layer_local_run 1"

        if self.__use_cuda:
            cmd += " --use_cuda"
            env_local = {
                "CUDA_VISIBLE_DEVICES": devices,
                "PADDLE_TRAINERS_NUM": "1",
                "PADDLE_TRAINER_ID": "0"
            }
        elif self.__use_xpu:
            cmd += " --use_xpu"
            env_local = {
                "FLAGS_selected_xpus": devices,
                "PADDLE_TRAINERS_NUM": "1",
                "PADDLE_TRAINER_ID": "0"
            }
        else:
            env_local = {'CPU_NUM': '1'}

        # do not use dgc in single-card runs
        if len(devices) > 1 and self._use_dgc:
            cmd += " --use_dgc"

        if self._accumulate_gradient:
            cmd += " --accumulate_gradient"

        env_local.update(envs)
        print("local_cmd: {}, env: {}".format(cmd, env_local))

        if check_error_log:
            err_log = open(log_name + "_local.log", "wb")
            local_proc = subprocess.Popen(
                cmd.split(" "),
                stdout=subprocess.PIPE,
                stderr=err_log,
                env=env_local)
        else:
            local_proc = subprocess.Popen(
                cmd.split(" "),
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                env=env_local)

        local_out, local_err = local_proc.communicate()

        if check_error_log:
            err_log.close()

        sys.stderr.write('local_stderr: %s\n' % local_err)
        sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out))

        return pickle.loads(local_out)

    def _run_cluster(self, model, envs, check_error_log, log_name):
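        """Run two pservers plus two trainers in pserver mode and return both
        trainers' losses."""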
        # Run dist train to compare with local results
        ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(
            model, check_error_log, envs, log_name=log_name)

        ps0_ep, ps1_ep = self._ps_endpoints.split(",")

        tr_cmd = "%s"

        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
            envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
            tr_cmd += " -m coverage run --branch -p"

        tr_cmd += " %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --update_method pserver --lr %f"

        tr0_cmd = tr_cmd % \
                  (self._python_interp, model, self._ps_endpoints,
                   0, ps0_ep, self._trainers, self._lr)
        tr1_cmd = tr_cmd % \
                  (self._python_interp, model, self._ps_endpoints,
                   1, ps1_ep, self._trainers, self._lr)

        if self._sync_mode:
            tr0_cmd += " --sync_mode"
            tr1_cmd += " --sync_mode"
        if self._hogwild_mode:
            tr0_cmd += " --hogwild"
            tr1_cmd += " --hogwild"
        if self._use_reduce:
            tr0_cmd += " --use_reduce"
            tr1_cmd += " --use_reduce"
        if self._use_reader_alloc:
            tr0_cmd += " --use_reader_alloc"
            tr1_cmd += " --use_reader_alloc"
        if self.__use_cuda:
            tr0_cmd += " --use_cuda"
            tr1_cmd += " --use_cuda"
            env0 = {"CUDA_VISIBLE_DEVICES": "0"}
            env1 = {"CUDA_VISIBLE_DEVICES": "1"}
        else:
            env0 = {'CPU_NUM': '1'}
            env1 = {'CPU_NUM': '1'}

        env0.update(envs)
        env1.update(envs)

        print("tr0_cmd: {}, env: {}".format(tr0_cmd, env0))
        print("tr1_cmd: {}, env: {}".format(tr1_cmd, env1))
        tr0_pipe = open(log_name + "_tr0_err.log", "wb")
        tr1_pipe = open(log_name + "_tr1_err.log", "wb")

        print_to_err(type(self).__name__, "going to start trainer process 0")
        tr0_proc = subprocess.Popen(
            tr0_cmd.strip().split(" "),
            stdout=subprocess.PIPE,
            stderr=tr0_pipe,
            env=env0)
        print_to_err(type(self).__name__, "going to start trainer process 1")
        tr1_proc = subprocess.Popen(
            tr1_cmd.strip().split(" "),
            stdout=subprocess.PIPE,
            stderr=tr1_pipe,
            env=env1)

        # Wait until the trainer processes terminate
        while True:
            stat0 = tr0_proc.poll()
            time.sleep(0.1)
            if stat0 is not None:
                break
        while True:
            stat1 = tr1_proc.poll()
            time.sleep(0.1)
            if stat1 is not None:
                break

        tr0_out, tr0_err = tr0_proc.communicate()
        tr1_out, tr1_err = tr1_proc.communicate()

        # close trainer and pserver log files
        tr0_pipe.close()
        tr1_pipe.close()
        ps0_pipe.close()
        ps1_pipe.close()

        ps0.terminate()
        ps1.terminate()

        return pickle.loads(tr0_out), pickle.loads(tr1_out)

    def _get_nccl2_trainer_cmd(self, model, ep, update_method, trainer_id,
                               trainer_num):
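        """Build the command line and environment for one collective (nccl2/bkcl)
        trainer process."""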
        env = {}
        tr_cmd = "%s -u"

        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
            tr_cmd += " -m coverage run --branch -p"

        tr_cmd += " %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method %s --lr %f"

        tr_cmd = tr_cmd % \
                 (self._python_interp, model, self._ps_endpoints,
                  trainer_id, ep, update_method, self._lr)

        if self._use_reduce:
            tr_cmd += " --use_reduce"
        if self._use_reader_alloc:
            tr_cmd += " --use_reader_alloc"
        if self._save_model:
            tr_cmd += " --save_model"
        if self.__use_cuda:
            tr_cmd += " --use_cuda"
            env.update({
                "FLAGS_selected_gpus": "{}".format(0),
                "CUDA_VISIBLE_DEVICES": "{}".format(trainer_id),
                "PADDLE_TRAINERS_NUM": "{}".format(trainer_num),
                "PADDLE_TRAINER_ID": "{}".format(trainer_id),
                "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
                "PADDLE_CURRENT_ENDPOINT": ep,
            })
        # TODO(liuyuhui): XPU_VISIBLE_DEVICES is not working right now,
        # will update it after Baidu Kunlun partners' support.
        elif self.__use_xpu:
            tr_cmd += " --use_xpu"
            env.update({
                "FLAGS_selected_xpus": "{}".format(trainer_id),
                #"XPU_VISIBLE_DEVICES": "{}".format(trainer_id + 1),
                "PADDLE_TRAINERS_NUM": "{}".format(trainer_num),
                "PADDLE_TRAINER_ID": "{}".format(trainer_id),
                "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
                "PADDLE_CURRENT_ENDPOINT": ep,
                "GLOG_v": "2",
            })
        else:
            env.update({'CPU_NUM': '1'})

        if self._use_dgc:
            tr_cmd += " --use_dgc"

        if self._accumulate_gradient:
            tr_cmd += " --accumulate_gradient"

        if self._pipeline_mode:
            tr_cmd += " --use_pipeline"
        if self._mp_mode:
            env = {"FLAGS_selected_gpus": "{}".format(trainer_id)}

        if self._nccl_comm_num > 1:
            tr_cmd += " --nccl_comm_num {}".format(self._nccl_comm_num)

        if self._use_hallreduce:
            tr_cmd += " --use_hallreduce --hallreduce_inter_nranks 2"

        if self._enable_backward_deps:
            tr_cmd += " --enable_backward_deps"

        if self._fuse_all_reduce is not None:
            tr_cmd += " --fuse_all_reduce {}".format(self._fuse_all_reduce)

        if self._use_fleet_api:
            tr_cmd += " --use_fleet_api"
            if self._use_local_sgd:
                tr_cmd += " --use_local_sgd"
            if self._ut4grad_allreduce:
                tr_cmd += " --ut4grad_allreduce"
            if hasattr(self, '_sync_batch_norm') and self._sync_batch_norm:
                tr_cmd += " --sync_batch_norm"

        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
            env['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')

        return tr_cmd, env

    def _run_cluster_nccl2(self, model, envs, update_method, check_error_log,
                           log_name):
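        """Launch the collective trainers (two by default, four when hierarchical
        allreduce is enabled) and return the losses of the first two."""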
        if self._use_hallreduce:
            self._ps_endpoints = ""

            global DIST_UT_PORT
            if DIST_UT_PORT == 0:
                # NOTE(wangxi): hallreduce tests must use 4 cards after nccl>=2.7
                for i in range(0, 4):
                    self._ps_endpoints += "127.0.0.1:%s," % (
                        self._find_free_port())
            else:
                for i in range(0, 4):
                    self._ps_endpoints += "127.0.0.1:%s," % (DIST_UT_PORT + i)
                DIST_UT_PORT += 4
            self._ps_endpoints = self._ps_endpoints[:-1]

        # NOTE: we reuse ps_endpoints as nccl2 worker endpoints
        worker_endpoints = self._ps_endpoints.split(",")

        trainer_num = len(worker_endpoints)

        procs = []
        pipes = []
        for i in range(0, trainer_num):
            tr_cmd, tr_env = self._get_nccl2_trainer_cmd(
                model, worker_endpoints[i], update_method, i, trainer_num)
            tr_env.update(envs)
            print("use_hallreduce:{} tr_cmd:{}, env: {}".format(
                self._use_hallreduce, tr_cmd, tr_env))

            tr_pipe = open(log_name + "_tr{}_err.log".format(i), "wb")

            print_to_err(
                type(self).__name__,
                "going to start process {} with nccl2".format(i))
            tr_proc = subprocess.Popen(
                tr_cmd.strip().split(" "),
                stdout=subprocess.PIPE,
                stderr=tr_pipe,
                env=tr_env)

            procs.append(tr_proc)
            pipes.append(tr_pipe)

        outs = []
        for i in range(0, trainer_num):
            tr_out, tr_err = procs[i].communicate()
            outs.append(tr_out)
            pipes[i].close()
            sys.stderr.write('trainer {} stderr: {}\n'.format(i, tr_err))

        if check_error_log:
            print("outs[0]:", outs[0])
            print("outs[1]:", outs[1])
        return pickle.loads(outs[0]), pickle.loads(outs[1])

    def _run_pipeline(self, model, envs, check_error_log, log_name):
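        """Launch the pipeline trainers over nccl2 and return the losses of the
        first two processes."""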
        # NOTE: we reuse ps_endpoints as nccl2 worker endpoints
        worker_endpoints = self._ps_endpoints.split(",")
        update_method = "nccl2"

        trainer_num = len(worker_endpoints)

        procs = []
        pipes = []
        for i in range(0, trainer_num):
            tr_cmd, tr_env = self._get_nccl2_trainer_cmd(
                model, worker_endpoints[i], update_method, i, trainer_num)
            tr_env.update(envs)
            tr_env['CUDA_VISIBLE_DEVICES'] = "0,1"
            tr_env['NCCL_SHM_DISABLE'] = '1'
            tr_env['FLAGS_selected_gpus'] = str(i)
            tr_env['FLAGS_cudnn_deterministic'] = '0'
            print("tr_cmd:{}, env: {}".format(tr_cmd, tr_env))

            tr_pipe = open("/tmp/" + "tr{}_err.log".format(i), "wb")

            print_to_err(
                type(self).__name__,
                "going to start process {} with nccl2".format(i))
            tr_proc = subprocess.Popen(
                tr_cmd.strip().split(" "),
                stdout=subprocess.PIPE,
                stderr=tr_pipe,
                env=tr_env)

            procs.append(tr_proc)
            pipes.append(tr_pipe)

        outs = []
        for i in range(0, trainer_num):
            tr_out, tr_err = procs[i].communicate()
            outs.append(tr_out)
            pipes[i].close()
            sys.stderr.write('trainer {} stderr: {}\n'.format(i, tr_err))

        if check_error_log:
            print("outs[0]:", outs[0])
            print("outs[1]:", outs[1])
        return pickle.loads(outs[0]), pickle.loads(outs[1])

    def _get_required_envs(self, check_error_log=False, need_envs={}):
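        """Compose the environment variables shared by all spawned test processes."""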
        # TODO(typhoonzero): should auto adapt GPU count on the machine.
        required_envs = {
            "PATH": os.getenv("PATH", ""),
            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
            "FLAGS_rpc_deadline": "30000",  # 30s to fail fast
            "FLAGS_rpc_retry_bind_port": "50",
            "FLAGS_cudnn_deterministic": "1",
            "FLAGS_rpc_disable_reuse_port": "1",
            "http_proxy": "",
            "NCCL_P2P_DISABLE": "1",
            "NCCL_SHM_DISABLE": "1"
        }

        if check_error_log:
            required_envs["GLOG_vmodule"] = \
                "fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10," \
                "alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10,executor=10,operator=10," \
                "sparse_all_reduce_op_handle=10,gen_nccl_id_op=10,gen_nccl_id_op_help=10,nccl_helper=10,grpc_client=10," \
                "grpc_server=10,request_handler_impl=10"
            required_envs["GLOG_logtostderr"] = "1"

        required_envs.update(need_envs)
        return required_envs

    def check_with_place(self,
                         model_file,
                         delta=1e-3,
                         check_error_log=False,
                         need_envs={},
                         log_name=""):
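        """Main test entry: run the model locally and in the configured
        distributed mode, then assert the per-step losses match within delta."""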
        required_envs = self._get_required_envs(check_error_log, need_envs)

        local_losses \
            = self._run_local(model_file, required_envs,
                              check_error_log, log_name=log_name)

        if self._nccl2_mode:
            if self._nccl2_reduce_layer:
                tr0_losses, tr1_losses = self._run_cluster_nccl2(
                    model_file,
                    required_envs,
                    update_method="nccl2_reduce_layer",
                    check_error_log=check_error_log,
                    log_name=log_name)
            else:
                tr0_losses, tr1_losses = self._run_cluster_nccl2(
                    model_file,
                    required_envs,
                    update_method='nccl2',
                    check_error_log=check_error_log,
                    log_name=log_name)
        elif self._bkcl_mode:
            tr0_losses, tr1_losses = self._run_cluster_nccl2(
                model_file,
                required_envs,
                update_method='bkcl',
                check_error_log=check_error_log,
                log_name=log_name)

        elif self._pipeline_mode:
            tr0_losses, tr1_losses = self._run_pipeline(
                model_file, required_envs, check_error_log, log_name=log_name)
        else:
            tr0_losses, tr1_losses = self._run_cluster(
                model_file, required_envs, check_error_log, log_name=log_name)

        for step_id in range(RUN_STEP):
            local_loss = local_losses[step_id]
            tr0_loss = tr0_losses[step_id]
            tr1_loss = tr1_losses[step_id]
            if self._pipeline_mode:
                dist_loss = np.array([tr1_loss])
            else:
                dist_loss = (np.array([tr0_loss]) + np.array([tr1_loss])) / 2
            print("=======", local_loss, ":", dist_loss[0], "=======")
            self.assertAlmostEqual(local_loss, dist_loss[0], delta=delta)

    def check_with_place_multi_cards(self,
                                     model_file,
                                     delta=1e-3,
                                     check_error_log=False,
                                     need_envs={},
                                     log_name=""):
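        """Compare two-card local runs with and without DGC; losses must match
        within delta."""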
        # p2p or shm must be enabled, otherwise multi-card mode will hang
        need_envs.update({"NCCL_P2P_DISABLE": "0", "NCCL_SHM_DISABLE": "0"})

        required_envs = self._get_required_envs(check_error_log, need_envs)

        if self._use_dgc:
            multi_cards_losses = self._run_local(
                model_file,
                required_envs,
                check_error_log,
                log_name=log_name + "_dgc_2cards",
                devices="0,1")

            self._use_dgc = False
            base_losses = self._run_local(
                model_file,
                required_envs,
                check_error_log,
                log_name=log_name + "_base_2cards",
                devices="0,1")

            self._use_dgc = True

            for step_id in range(RUN_STEP):
                base_loss = base_losses[step_id]
                multi_cards_loss = multi_cards_losses[step_id]
                print("=======", base_loss, ":", multi_cards_loss, "=======")
                self.assertAlmostEqual(base_loss, multi_cards_loss, delta=delta)