@@ -92,35 +92,27 @@ class ParallelExecutor(object):
                 num_trainers=1,
                 trainer_id=0,
                 scope=None):
        # step1: get places, the places are used in run too.
        self._places = []
        self._act_places = []
        if use_cuda:
            gpus = []
            gpus_env = os.getenv("FLAGS_selected_gpus")
            if gpus_env:
                gpus = [int(s) for s in gpus_env.split(",")]
            else:
                for i in six.moves.range(core.get_cuda_device_count()):
                    gpus.append(i)
            for i in gpus:
                p = core.Place()
                self._act_places.append(core.CUDAPlace(i))
                p.set_place(self._act_places[-1])
                self._places.append(p)
                gpus = [
                    i for i in six.moves.range(core.get_cuda_device_count())
                ]
            self._places = [core.CUDAPlace(i) for i in gpus]
        else:
            cpu_num = int(
                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
            for i in six.moves.range(cpu_num):
                p = core.Place()
                self._act_places.append(core.CPUPlace())
                p.set_place(self._act_places[-1])
                self._places.append(p)
            self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
        assert self._places, "no place for execution"

        # step2: init exec_strategy
        if exec_strategy is None:
            exec_strategy = ExecutionStrategy()
            exec_strategy.use_cuda = use_cuda

        if exec_strategy.num_threads == 0:
            if use_cuda:
                # Experiments on se-resnext shows that too many threads hurt
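Worth noting for readers of this hunk: the device list built in step1 is driven entirely by `use_cuda` plus two environment variables, and step2 derives its CPU thread default from the same `CPU_NUM` value. A minimal sketch of how a caller would pin this down before constructing the executor (the concrete values here are only illustrative):

    import os
    import multiprocessing

    # Both variables are read inside ParallelExecutor.__init__, so they must be
    # set before the executor is constructed.
    os.environ["FLAGS_selected_gpus"] = "0,2"   # CUDA path: use GPU 0 and GPU 2 only
    os.environ["CPU_NUM"] = "4"                 # CPU path: four CPUPlace instances

    # With exec_strategy.num_threads left at 0, the CPU branch falls back to
    # cpu_num * 2 worker threads, mirroring the code above.
    cpu_num = int(os.environ.get("CPU_NUM", multiprocessing.cpu_count()))
    print("default CPU thread count: %d" % (cpu_num * 2))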
@@ -131,49 +123,54 @@ class ParallelExecutor(object):
                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
                exec_strategy.num_threads = cpu_num * 2

        # step3: init build_strategy
        if build_strategy is None:
            build_strategy = BuildStrategy()

        build_strategy.num_trainers = num_trainers
        build_strategy.trainer_id = trainer_id

        main = main_program
        main = main if main else framework.default_main_program()
        # step4: get main_program, scope, local_scopes
        main = main_program if main_program \
            else framework.default_main_program()
        scope = scope if scope is not None else executor.global_scope()

        if share_vars_from and not isinstance(share_vars_from,
                                              ParallelExecutor):
            raise TypeError("share_vars_from must be ParallelExecutor.")

        local_scopes = share_vars_from.executor.local_scopes()\
            if share_vars_from else []

        # step5: check trainers_endpoints, it is used for distribution.
        trainers_endpoints = main._trainers_endpoints
        if num_trainers > 1 and trainers_endpoints:
            assert num_trainers == len(
                trainers_endpoints), "num_trainers == len(end_points)"
            build_strategy.trainers_endpoints = trainers_endpoints
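One consequence of the check just above: when `num_trainers > 1`, the endpoint list carried on the program (`main._trainers_endpoints`) has to contain exactly one endpoint per trainer, or the assert trips. A hedged sketch of the caller side, where the environment variable names are only placeholders for whatever the launcher actually provides:

    import os

    # Hypothetical launcher-provided values; one endpoint per trainer process.
    endpoints = os.getenv("TRAINER_ENDPOINTS",
                          "192.168.1.1:6170,192.168.1.2:6170").split(",")
    trainer_id = int(os.getenv("TRAINER_ID", "0"))
    num_trainers = len(endpoints)   # must match len(main._trainers_endpoints)
    assert 0 <= trainer_id < num_trainers

These two values are then passed straight through as the `num_trainers` and `trainer_id` constructor arguments; the endpoints themselves are expected to have been attached to the program by whatever distributed setup populated `_trainers_endpoints`.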
        if scope == None:
            scope = executor.global_scope()

        if share_vars_from and not isinstance(share_vars_from,
                                              ParallelExecutor):
            raise TypeError("share_vars_from must be ParallelExecutor.")

        local_scopes = share_vars_from.executor.local_scopes(
        ) if share_vars_from else []
        self.persistable_vars = [
            v.name for v in [
        # step6: get persistable_vars, parameter_vars, places. persistable_vars
        # need to be broadcast to other local_scope.
        persistable_vars = set([
            cpt.to_text(v.name) for v in [
                var for var in main.list_vars()
                if var.persistable and var.type != core.VarDesc.VarType.RAW
            ]
        ]
        ])

        def place_obj(place):
            p = core.Place()
            p.set_place(place)
            return p

        places = list(map(place_obj, self._places))

        # step7: init ParallelExecutor
        self.executor = core.ParallelExecutor(
            self._places,
            set([
                cpt.to_text(p.name)
                for p in main.global_block().iter_parameters()
                if not p.stop_gradient
            ]),
            set(cpt.to_text(var) for var in self.persistable_vars), main.desc,
            places, persistable_vars, main.desc,
            cpt.to_text(loss_name)
            if loss_name else six.u(''), scope, local_scopes, exec_strategy,
            build_strategy, num_trainers, trainer_id)

        self.scope = scope
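That is the end of the constructor. Putting the pieces together, a typical single-machine call site would look roughly like the sketch below; the toy network, sizes and names are arbitrary, and only the keyword arguments mirror the signature this diff touches. The second executor exercises the `share_vars_from` path guarded by the TypeError check in step4.

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[8], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))

    # Clone an inference-only copy before adding the backward pass.
    test_program = fluid.default_main_program().clone(for_test=True)
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

    use_cuda = fluid.core.is_compiled_with_cuda()
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    # Parameters live in the global scope; initialize them once with the plain
    # Executor before building ParallelExecutor instances.
    fluid.Executor(place).run(fluid.default_startup_program())

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 0            # 0 -> let step2 pick the default
    build_strategy = fluid.BuildStrategy()

    train_exe = fluid.ParallelExecutor(
        use_cuda=use_cuda,
        loss_name=loss.name,
        exec_strategy=exec_strategy,
        build_strategy=build_strategy)

    # A second executor (e.g. for evaluation) reuses the trained variables
    # through share_vars_from instead of keeping private copies.
    test_exe = fluid.ParallelExecutor(
        use_cuda=use_cuda,
        main_program=test_program,
        share_vars_from=train_exe)

Feeding and fetching with these executors is sketched after the next two hunks.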
    def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
@@ -261,7 +258,7 @@ class ParallelExecutor(object):
            self.executor.feed_and_split_tensor_into_local_scopes(
                feed_tensor_dict)
        elif isinstance(feed, list) or isinstance(feed, tuple):
            if len(feed) != len(self._act_places):
            if len(feed) != len(self._places):
                raise ValueError(
                    "Feed a list of tensor, the list should be the same size as places"
                )
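For the dict branch above: a single feed dict is split along the batch dimension across all places by `feed_and_split_tensor_into_local_scopes`, so the caller only provides one batch. A small sketch, assuming the `train_exe` and `loss` from the constructor example:

    import numpy

    batch_size = 32   # split across all places by the executor
    feed = {
        'x': numpy.random.random((batch_size, 8)).astype('float32'),
        'y': numpy.random.random((batch_size, 1)).astype('float32'),
    }
    loss_value, = train_exe.run(fetch_list=[loss.name], feed=feed)

The list/tuple branch, by contrast, expects the caller to have done that split already, which is why its length is compared against the number of places.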
@@ -277,7 +274,7 @@ class ParallelExecutor(object):
                    tensor = each[feed_name]
                    if not isinstance(tensor, core.LoDTensor):
                        tmp = core.LoDTensor()
                        tmp.set(tensor, self._act_places[i])
                        tmp.set(tensor, self._places[i])
                        tensor = tmp
                    res_dict[feed_name] = tensor
                res.append(res_dict)
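The loop above only converts raw arrays: each per-place dict may hold either numpy arrays or ready-made `LoDTensor`s, and only the former get wrapped onto the corresponding place. A sketch of the explicit per-device feed, again assuming `train_exe` and `loss` from the earlier example:

    import numpy
    import paddle.fluid as fluid

    per_device_batch = 16
    feed_list = []
    for _ in range(train_exe.device_count):
        feed_list.append({
            'x': numpy.random.random((per_device_batch, 8)).astype('float32'),
            'y': numpy.random.random((per_device_batch, 1)).astype('float32'),
        })
    # len(feed_list) must equal the number of places, or run() raises ValueError.
    loss_value, = train_exe.run(fetch_list=[loss.name], feed=feed_list)

    # Equivalent manual wrapping of one array (what the loop in the diff does):
    t = fluid.LoDTensor()
    t.set(feed_list[0]['x'], fluid.CPUPlace())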
@@ -294,4 +291,4 @@ class ParallelExecutor(object):

    @property
    def device_count(self):
        return len(self._act_places)
        return len(self._places)
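After this refactor `device_count` simply counts `self._places`, i.e. exactly the number of devices chosen in step1 (the selected GPUs, or `CPU_NUM` copies of `CPUPlace`). A trivial usage sketch, assuming the `train_exe` from the constructor example:

    per_device_batch = 16
    global_batch = per_device_batch * train_exe.device_count
    print("running on %d place(s), global batch size %d" %
          (train_exe.device_count, global_batch))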