@@ -30,7 +30,7 @@ class ParallelExecutor(object):
     """
     :api_attr: Static Graph
 
-    The ParallelExecutor is an upgraded version of :code:`fluid.Executor` that supports multi-node model
+    The ParallelExecutor is an upgraded version of :code:`paddle.static.Executor` that supports multi-node model
     training and testing based on the data-parallel mode. In data-parallel mode,
     ParallelExecutor will broadcast the parameters from Node0 to other nodes during
     construction and copy the input Program to other nodes from Node0 to make sure
@@ -50,12 +50,12 @@ class ParallelExecutor(object):
 
     Args:
         use_cuda (bool): Whether to use CUDA or not.
-        loss_name (str): This parameter is the name of the loss variable of the
+        loss_name (str): This parameter is the name of the loss Tensor of the
             model. **Note: If it is data-parallel model training, you must set loss_name,
             otherwise, the results may be wrong**. The default is None.
         main_program (Program): This parameter represents the Program to be executed.
             If this parameter is not provided, that parameter is None, the program will
-            be set to :code:`fluid.default_main_program()`. The default is None.
+            be set to :code:`paddle.static.default_main_program()`. The default is None.
         share_vars_from(ParallelExecutor): If share_vars_from is set, the current
             ParallelExecutor will share the parameters with the ParallelExecutor
             specified by share_vars_from. This parameter needs to be set when model testing
@@ -66,13 +66,13 @@ class ParallelExecutor(object):
             The default is None.
         exec_strategy(ExecutionStrategy): exec_strategy specifies the options that can
             be changed when running the current model, such as the thread pool size.
-            For more information about exec_strategy, please refer to :code:`fluid.ExecutionStrategy`.
+            For more information about exec_strategy, please refer to :code:`paddle.static.ExecutionStrategy`.
             The default is None.
         build_strategy(BuildStrategy): By configuring build_strategy, we can
             optimize the computational graph, such as operators' fusion in the
             computational graph and memory optimization during the execution
             of the computational graph. For more information about build_strategy,
-            please refer to :code:`fluid.BuildStrategy`. The default is None.
+            please refer to :code:`paddle.static.BuildStrategy`. The default is None.
         num_trainers(int): This parameter needs to be set in GPU distributed training.
             If the parameter value is greater than 1, NCCL will be initialized by multi-level
             nodes. Each node should have the same number of GPUs. The default is 1.
@@ -81,7 +81,7 @@ class ParallelExecutor(object):
             Trainer_id indicates the "rank" of the current node. The trainer_id starts
             counting from 0. The default is 0.
         scope(Scope): Specifies the scope in which the program is executed.
-            The default is fluid.global_scope().
+            The default is paddle.static.global_scope().
 
     Returns:
         ParallelExecutor: The initialized ParallelExecutor object.
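Neither example in this docstring passes exec_strategy or build_strategy to the constructor, so a minimal sketch may help. It assumes the paddle.static aliases used throughout this diff and reuses use_cuda, train_program and loss from the example that follows.

.. code-block:: python

    # Sketch only: exec_strategy / build_strategy are optional knobs of the
    # constructor; use_cuda, train_program and loss are the objects built in
    # the docstring example below.
    exec_strategy = paddle.static.ExecutionStrategy()
    exec_strategy.num_threads = 4                      # thread pool size
    exec_strategy.num_iteration_per_drop_scope = 100   # scope-drop interval

    build_strategy = paddle.static.BuildStrategy()
    build_strategy.fuse_elewise_add_act_ops = True     # enable a graph fusion pass

    train_exe = paddle.static.ParallelExecutor(use_cuda=use_cuda,
                                               main_program=train_program,
                                               loss_name=loss.name,
                                               exec_strategy=exec_strategy,
                                               build_strategy=build_strategy)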
@@ -101,15 +101,16 @@ class ParallelExecutor(object):
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
+            import paddle
             import numpy
             import os
 
             use_cuda = True
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            paddle.enable_static()
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
 
             # NOTE: If you use CPU to run the program, you need
-            # to specify the CPU_NUM, otherwise, fluid will use
+            # to specify the CPU_NUM, otherwise, PaddlePaddle will use
             # all the number of the logic core as the CPU_NUM,
             # in that case, the batch size of the input should be
             # greater than CPU_NUM, if not, the process will be
@@ -117,24 +118,24 @@ class ParallelExecutor(object):
             if not use_cuda:
                 os.environ['CPU_NUM'] = str(2)
 
-            exe = fluid.Executor(place)
+            exe = paddle.static.Executor(place)
 
-            train_program = fluid.Program()
-            startup_program = fluid.Program()
-            with fluid.program_guard(train_program, startup_program):
-                data = fluid.data(name='X', shape=[None, 1], dtype='float32')
-                hidden = fluid.layers.fc(input=data, size=10)
-                loss = fluid.layers.mean(hidden)
-                test_program = fluid.default_main_program().clone(for_test=True)
-                fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
+            train_program = paddle.static.Program()
+            startup_program = paddle.static.Program()
+            with paddle.static.program_guard(train_program, startup_program):
+                data = paddle.static.data(name='X', shape=[None, 1], dtype='float32')
+                hidden = paddle.static.nn.fc(data, 10)
+                loss = paddle.mean(hidden)
+                test_program = paddle.static.default_main_program().clone(for_test=True)
+                paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
 
             startup_program.random_seed=1
             exe.run(startup_program)
 
-            train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
+            train_exe = paddle.static.ParallelExecutor(use_cuda=use_cuda,
                                                main_program=train_program,
                                                loss_name=loss.name)
-            test_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
+            # Note: if share_vars_from is not set here, the test parameters are different from the train ones
+            test_exe = paddle.static.ParallelExecutor(use_cuda=use_cuda,
                                               main_program=test_program,
                                               share_vars_from=train_exe)
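As a usage sketch for the two executors built above (hypothetical random input; train_exe, test_exe and loss come from the example in the previous hunk), both can then be run against the same fetch target:

.. code-block:: python

    # Sketch only: numpy is imported in the example above; the data is made up.
    x = numpy.random.random(size=(10, 1)).astype('float32')
    train_loss, = train_exe.run(feed={"X": x}, fetch_list=[loss.name])
    test_loss, = test_exe.run(feed={"X": x}, fetch_list=[loss.name])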
@@ -205,11 +206,11 @@ class ParallelExecutor(object):
         fetch_list.
 
         Args:
-            fetch_list(list): This parameter represents the variables that need to be returned
+            fetch_list(list): This parameter represents the Tensors that need to be returned
                 after the model runs. The default is None.
-            feed(list|dict): This parameter represents the input variables of the model.
+            feed(list|dict): This parameter represents the input Tensors of the model.
                 If it is single card training, the feed is dict type, and if it is multi-card
-                training, the parameter feed can be dict or list type variable. If the
+                training, the parameter feed can be dict or list of Tensor. If the
                 parameter type is dict, the data in the feed will be split and sent to
                 multiple devices (CPU/GPU), that is to say, the input data will be evenly
                 sent to different devices, so you should make sure the number of samples of
@@ -219,8 +220,8 @@ class ParallelExecutor(object):
                 The default is None.
             feed_dict: Alias for feed parameter, for backward compatibility.
                 This parameter has been deprecated. Default None.
-            return_numpy(bool): This parameter indicates whether convert the fetched variables
-                (the variable specified in the fetch list) to numpy.ndarray. if it is False,
+            return_numpy(bool): This parameter indicates whether to convert the fetched Tensors
+                (the Tensors specified in the fetch list) to numpy.ndarray. If it is False,
                 the type of the return value is a list of :code:`LoDTensor`. The default is True.
 
         Returns:
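A short sketch of the two feed forms described above, under the assumption of two devices (CPU_NUM=2 as in the examples) and reusing train_exe and loss from this docstring:

.. code-block:: python

    # Sketch only: hypothetical input; train_exe and loss come from the
    # docstring examples, and two devices are assumed.
    x = numpy.random.random(size=(10, 1)).astype('float32')

    # dict feed: the batch is split evenly across the available devices
    loss_v, = train_exe.run(feed={"X": x}, fetch_list=[loss.name])

    # list feed: one dict per device, each device receives exactly that slice
    loss_v, = train_exe.run(feed=[{"X": x[0:5]}, {"X": x[5:10]}],
                            fetch_list=[loss.name])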
@@ -241,22 +242,23 @@ class ParallelExecutor(object):
                number of CPU cores or GPU cards, if it is less than, it is recommended that
                the batch be discarded.
             2. If the number of CPU cores or GPU cards available is greater than 1, the fetch
-               results are spliced together in dimension 0 for the same variable values
-               (variables in fetch_list) on different devices.
+               results are spliced together in dimension 0 for the same Tensor values
+               (Tensors in fetch_list) on different devices.
 
 
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
+                import paddle
                 import numpy
                 import os
 
                 use_cuda = True
-                place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+                paddle.enable_static()
+                place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
 
                 # NOTE: If you use CPU to run the program, you need
-                # to specify the CPU_NUM, otherwise, fluid will use
+                # to specify the CPU_NUM, otherwise, PaddlePaddle will use
                 # all the number of the logic core as the CPU_NUM,
                 # in that case, the batch size of the input should be
                 # greater than CPU_NUM, if not, the process will be
@@ -264,19 +266,19 @@ class ParallelExecutor(object):
                 if not use_cuda:
                     os.environ['CPU_NUM'] = str(2)
 
-                exe = fluid.Executor(place)
+                exe = paddle.static.Executor(place)
 
-                train_program = fluid.Program()
-                startup_program = fluid.Program()
-                with fluid.program_guard(train_program, startup_program):
-                    data = fluid.data(name='X', shape=[None, 1], dtype='float32')
-                    hidden = fluid.layers.fc(input=data, size=10)
-                    loss = fluid.layers.mean(hidden)
-                    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
+                train_program = paddle.static.Program()
+                startup_program = paddle.static.Program()
+                with paddle.static.program_guard(train_program, startup_program):
+                    data = paddle.static.data(name='X', shape=[None, 1], dtype='float32')
+                    hidden = paddle.static.nn.fc(data, 10)
+                    loss = paddle.mean(hidden)
+                    paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
 
                 exe.run(startup_program)
 
-                train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                train_exe = paddle.static.ParallelExecutor(use_cuda=use_cuda,
                                                    main_program=train_program,
                                                    loss_name=loss.name)
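To illustrate the splicing behaviour noted above, a sketch of what the fetched result looks like when two devices are in use; it continues the run() example with hypothetical data.

.. code-block:: python

    # Sketch only: with two devices (CPU_NUM=2), the per-device losses are
    # concatenated along dimension 0, so one scalar loss per device is expected.
    x = numpy.random.random(size=(10, 1)).astype('float32')
    loss_data, = train_exe.run(feed={"X": x}, fetch_list=[loss.name])
    print(loss_data.shape)   # expected: (2,) with two devices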
@@ -314,7 +316,7 @@ class ParallelExecutor(object):
         application and release of temporary variables, the strategy adopted by
         ParallelExecutor is to drop the local execution scopes after several iterations.
         ParallelExecutor provides the num_iteration_per_drop_scope option in
-        :code:`fluid.ExecutionStrategy`, which indicates how many iterations are intervened to
+        :code:`paddle.static.ExecutionStrategy`, which indicates how many iterations are intervened to
         drop the local execution scopes. If the num_iteration_per_drop_scope value
         is 100, but you want to drop the local execution scopes after 50 iterations,
         you can call the interface manually.
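As an alternative to calling the interface manually, the drop interval can also be set up front through ExecutionStrategy; a sketch, assuming the paddle.static aliases used in this diff and the use_cuda, train_program and loss objects from the example below:

.. code-block:: python

    # Sketch only: num_iteration_per_drop_scope controls how often the local
    # execution scopes are dropped automatically.
    exec_strategy = paddle.static.ExecutionStrategy()
    exec_strategy.num_iteration_per_drop_scope = 50

    parallel_exe = paddle.static.ParallelExecutor(use_cuda=use_cuda,
                                                  main_program=train_program,
                                                  loss_name=loss.name,
                                                  exec_strategy=exec_strategy)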
@@ -325,13 +327,13 @@ class ParallelExecutor(object):
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
+                import paddle
                 import numpy
                 import os
 
                 use_cuda = True
                 # NOTE: If you use CPU to run the program, you need
-                # to specify the CPU_NUM, otherwise, fluid will use
+                # to specify the CPU_NUM, otherwise, PaddlePaddle will use
                 # all the number of the logic core as the CPU_NUM,
                 # in that case, the batch size of the input should be
                 # greater than CPU_NUM, if not, the process will be
@@ -339,18 +341,19 @@ class ParallelExecutor(object):
                 if not use_cuda:
                     os.environ['CPU_NUM'] = str(2)
 
-                train_program = fluid.Program()
-                startup_program = fluid.Program()
-                with fluid.program_guard(train_program, startup_program):
-                    data = fluid.data(name='X', shape=[None, 1], dtype='float32')
-                    hidden = fluid.layers.fc(input=data, size=10)
-                    loss = fluid.layers.mean(hidden)
+                paddle.enable_static()
+                train_program = paddle.static.Program()
+                startup_program = paddle.static.Program()
+                with paddle.static.program_guard(train_program, startup_program):
+                    data = paddle.static.data(name='X', shape=[None, 1], dtype='float32')
+                    hidden = paddle.static.nn.fc(data, 10)
+                    loss = paddle.mean(hidden)
 
-                place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-                exe = fluid.Executor(place)
+                place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+                exe = paddle.static.Executor(place)
                 exe.run(startup_program)
 
-                parallel_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                parallel_exe = paddle.static.ParallelExecutor(use_cuda=use_cuda,
                                                       main_program=train_program,
                                                       loss_name=loss.name)
@@ -359,6 +362,7 @@ class ParallelExecutor(object):
                                               fetch_list=[loss.name])
 
                 parallel_exe.drop_local_exe_scopes()
 
         """
         check_type(self._compiled_program._executor,
                    "the Executor of compiled program", core.ParallelExecutor,