@@ -13,15 +13,11 @@
# limitations under the License.

from __future__ import print_function
import multiprocessing
from . import core
from . import framework
from . import executor
from .. import compat as cpt
import warnings
from . import compiler
import sys
import six
import os

__all__ = ['ParallelExecutor']

@@ -97,99 +93,27 @@ class ParallelExecutor(object):
            'Please use CompiledProgram and Executor. CompiledProgram '
            'is a central place for optimization and Executor is the '
            'unified executor. Example can be found in compiler.py.\n')
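        # A minimal sketch of the recommended replacement (assuming a typical
        # fluid training setup; `train_program` and `loss` are illustrative
        # names, not part of this module):
        #
        #     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        #     exe = fluid.Executor(place)
        #     compiled_prog = fluid.CompiledProgram(
        #         train_program).with_data_parallel(loss_name=loss.name)
        #     exe.run(compiled_prog, feed=feed, fetch_list=[loss.name])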
        # step1: get places, the places are used in run too.
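        # GPU selection can be restricted with the FLAGS_selected_gpus
        # environment variable; CPU parallelism is controlled by CPU_NUM.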
        self._places = []
        if use_cuda:
            gpus_env = os.getenv("FLAGS_selected_gpus")
            if gpus_env:
                gpus = [int(s) for s in gpus_env.split(",")]
            else:
                gpus = [
                    i for i in six.moves.range(core.get_cuda_device_count())
                ]
            self._places = [core.CUDAPlace(i) for i in gpus]
        else:
            cpu_num = int(
                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
            self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
        assert self._places, "no place for execution"

        # step2: init exec_strategy
        if exec_strategy is None:
            exec_strategy = ExecutionStrategy()
        exec_strategy.use_cuda = use_cuda
        if exec_strategy.num_threads == 0:
            if use_cuda:
                # Experiments on se-resnext show that too many threads hurt
                # performance. Worth tuning for other models in the future.
                exec_strategy.num_threads = len(self._places) * 4
            else:
                cpu_num = int(
                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
                exec_strategy.num_threads = cpu_num * 2

        # step3: init build_strategy
        if build_strategy is None:
            build_strategy = BuildStrategy()
        build_strategy.num_trainers = num_trainers
        build_strategy.trainer_id = trainer_id
        # FIXME(zcd): is_distribution_ is a temporary field, because in pserver
        # mode num_trainers is 1, so the current fields of build_strategy don't
        # tell whether it is a distributed model.
        build_strategy.is_distribution = framework.is_pserver_mode(
            main_program) or num_trainers > 1

        # step4: get main_program, scope, local_scopes
        main = main_program if main_program \
            else framework.default_main_program()
        # FIXME(dzhwinter): enable_inplace should be set after memory_optimize;
        # if the Python memory optimization is turned on, turn off the inplace pass.
        if build_strategy.memory_optimize is None:
            build_strategy.memory_optimize = False if main._is_mem_optimized else True
        if build_strategy.enable_inplace is None:
            build_strategy.enable_inplace = False if main._is_mem_optimized else True
        scope = scope if scope is not None else executor.global_scope()

        if share_vars_from and not isinstance(share_vars_from,
                                              ParallelExecutor):
            raise TypeError("share_vars_from must be ParallelExecutor.")

        local_scopes = share_vars_from.executor.local_scopes()\
            if share_vars_from else []

        # step5: check trainers_endpoints, which are used for distributed training.
        trainers_endpoints = main._trainers_endpoints
        if num_trainers > 1 and trainers_endpoints:
            assert num_trainers == len(
                trainers_endpoints), "num_trainers == len(endpoints)"
            build_strategy.trainers_endpoints = trainers_endpoints

        # step6: get persistable_vars and places. persistable_vars
        # need to be broadcast to the other local scopes.
        persistable_vars = set([
            cpt.to_text(v.name) for v in [
                var for var in main.list_vars()
                if var.persistable and var.type != core.VarDesc.VarType.RAW
            ]
        ])
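
        # place_obj wraps a CUDAPlace/CPUPlace into a core.Place object, the
        # type expected by the underlying C++ ParallelExecutor.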
        def place_obj(place):
            p = core.Place()
            p.set_place(place)
            return p

        places = list(map(place_obj, self._places))

        # step7: init ParallelExecutor
        # The ParallelExecutor API will be deprecated; it does not support the
        # parallel graph mode.
        self._graph = core.Graph(main.desc)
        self._places = compiler.get_available_places(use_cuda)
        self._scope = scope if scope is not None else executor.global_scope()

        self.executor = core.ParallelExecutor(
            places, persistable_vars,
            cpt.to_text(loss_name) if loss_name else six.u(''), scope,
            local_scopes, exec_strategy, build_strategy, self._graph)
        main_program = main_program if main_program is not None \
            else framework.default_main_program()
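
        # From here on, ParallelExecutor delegates to a data-parallel
        # CompiledProgram that is run by a regular Executor.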
        self.scope = scope
        self._compiled_program = compiler.CompiledProgram(main_program)
        self._compiled_program.with_data_parallel(
            loss_name=loss_name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy,
            share_vars_from=share_vars_from)
        self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
        self._executor = executor.Executor(self._place)
        self._compiled_program._compile(place=self._place, scope=self._scope)

    def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
        """
@@ -256,56 +180,11 @@ class ParallelExecutor(object):
              loss = pe.run(feed=feeder.feed(cur_batch),
                            fetch_list=[avg_cost.name])
        """
        if feed is None and feed_dict is not None:
            feed = feed_dict
            print(
                "`feed_dict` is deprecated. Please use `feed=`",
                file=sys.stderr)
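
        # A dict feed is split across all places, while a list/tuple feed must
        # supply one dict of tensors per place.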
        if isinstance(feed, dict):
            feed_tensor_dict = dict()
            for feed_name in feed:
                feed_tensor = feed[feed_name]
                if not isinstance(feed_tensor, core.LoDTensor):
                    feed_tensor = core.LoDTensor()
                    # Always set to CPUPlace, since the tensor needs to be
                    # split, and splitting is fast on CPU.
                    feed_tensor.set(feed[feed_name], core.CPUPlace())
                feed_tensor_dict[feed_name] = feed_tensor

            self.executor.feed_and_split_tensor_into_local_scopes(
                feed_tensor_dict)
        elif isinstance(feed, list) or isinstance(feed, tuple):
            if len(feed) != len(self._places):
                raise ValueError(
                    "Feed a list of tensor, the list should be the same size as places"
                )

            res = list()

            for i, each in enumerate(feed):
                if not isinstance(each, dict):
                    raise TypeError(
                        "Each element of feed list should be a dict")
                res_dict = dict()
                for feed_name in each:
                    tensor = each[feed_name]
                    if not isinstance(tensor, core.LoDTensor):
                        tmp = core.LoDTensor()
                        tmp.set(tensor, self._places[i])
                        tensor = tmp
                    res_dict[feed_name] = tensor
                res.append(res_dict)
            self.executor.feed_tensors_into_local_scopes(res)

        fetch_var_name = 'fetch'
        self.executor.run(fetch_list, fetch_var_name)
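        # The executor gathers the fetched results into a LoDTensorArray stored
        # in the variable named by fetch_var_name.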
        arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()

        if return_numpy:
            return executor.as_numpy(arr)

        return [arr[i] for i in range(len(arr))]
        return self._executor.run(program=self._compiled_program,
                                  scope=self._scope,
                                  feed=feed,
                                  fetch_list=fetch_list,
                                  return_numpy=return_numpy)

    @property
    def device_count(self):