|
|
|
@@ -667,16 +667,17 @@ All parameter, weight, gradient are variables in Paddle.
|
|
|
|
|
ExecutionStrategy allows the user to more precisely control how to run
the program in ParallelExecutor by setting its properties.
|
|
|
|
|
|
|
|
|
|
The available properties include:
|
|
|
|
|
use_cuda (bool): Whether to use CUDA or not. Default True.
|
|
|
|
|
num_threads (int): The number of threads used to run the
|
|
|
|
|
operators in ParallelExecutor. If it is not set, it will be
|
|
|
|
|
set in ParallelExecutor according to the device count.
|
|
|
|
|
Default 0.
|
|
|
|
|
allow_op_delay (bool): Whether to delay running the communication
operators. Default False.
|
|
|
|
|
num_iteration_per_drop_scope (int): The number of iterations between
two consecutive drops of the local scopes. Default 100.
|
|
|
|
|
Examples:
|
|
|
|
|
.. code-block:: python
|
|
|
|
|
|
|
|
|
|
exec_strategy = fluid.ExecutionStrategy()
|
|
|
|
|
exec_strategy.num_threads = 4
|
|
|
|
|
|
|
|
|
|
train_exe = fluid.ParallelExecutor(use_cuda=True,
|
|
|
|
|
loss_name=loss.name,
|
|
|
|
|
exec_strategy=exec_strategy)
|
|
|
|
|
|
|
|
|
|
train_loss, = train_exe.run([loss.name], feed=feed_dict)
|
|
|
|
|
|
|
|
|
|
)DOC");
|
|
|
|
|
|
|
|
|
@@ -686,19 +687,34 @@ All parameter, weight, gradient are variables in Paddle.
|
|
|
|
|
[](const ExecutionStrategy &self) { return self.num_threads_; },
|
|
|
|
|
[](ExecutionStrategy &self, size_t num_threads) {
|
|
|
|
|
self.num_threads_ = num_threads;
|
|
|
|
|
|
|
|
|
|
},
|
|
|
|
|
R"DOC(The type is INT, num_threads represents the size of thread pool that
|
|
|
|
|
used to run the operators of the current program in ParallelExecutor.
|
|
|
|
|
If :math:`num\_threads=1`, all the operators will execute one by one,
|
|
|
|
|
but the order maybe difference between iterations.
|
|
|
|
|
If it is not set, it will be set in ParallelExecutor according to the
|
|
|
|
|
device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU,
|
|
|
|
|
:math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor.
|
|
|
|
|
if it is not set, ParallelExecutor will get the cpu count by calling
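
For example, a minimal sketch (assuming `loss` comes from an already built
program, as in the class-level example above):

.. code-block:: python

    exec_strategy = fluid.ExecutionStrategy()
    # With a single thread the operators run one by one, which can help
    # when debugging, at the cost of parallelism.
    exec_strategy.num_threads = 1
    train_exe = fluid.ParallelExecutor(use_cuda=True,
                                       loss_name=loss.name,
                                       exec_strategy=exec_strategy)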
|
|
|
|
|
)DOC")
|
|
|
|
|
.def_property(
|
|
|
|
|
"use_cuda",
|
|
|
|
|
[](const ExecutionStrategy &self) { return self.use_cuda_; },
|
|
|
|
|
[](ExecutionStrategy &self, bool use_cuda) {
|
|
|
|
|
self.use_cuda_ = use_cuda;
|
|
|
|
|
|
|
|
|
|
}) // FIXME(chengduo): No doc is added for 'use_cuda' because it may
   // confuse users: ParallelExecutor also has a parameter named
   // 'use_cuda', and in the current implementation ParallelExecutor's
   // 'use_cuda' overwrites ExecutionStrategy's 'use_cuda'.
|
|
|
|
|
.def_property(
|
|
|
|
|
"allow_op_delay",
|
|
|
|
|
[](const ExecutionStrategy &self) { return self.allow_op_delay_; },
|
|
|
|
|
[](ExecutionStrategy &self, bool allow_op_delay) {
|
|
|
|
|
self.allow_op_delay_ = allow_op_delay;
|
|
|
|
|
|
|
|
|
|
},
|
|
|
|
|
R"DOC(The type is BOOL, allow_op_delay represents whether to delay the
|
|
|
|
|
communication operators to run, it may make the execution faster.
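
For example, a minimal sketch:

.. code-block:: python

    exec_strategy = fluid.ExecutionStrategy()
    # Opt in to delayed communication; switch back to False if the
    # program hangs.
    exec_strategy.allow_op_delay = True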
|
|
|
|
|
)DOC")
|
|
|
|
|
.def_property(
|
|
|
|
|
"num_iteration_per_drop_scope",
|
|
|
|
|
[](const ExecutionStrategy &self) {
|
|
|
|
@@ -706,7 +722,19 @@ All parameter, weight, gradient are variables in Paddle.
|
|
|
|
|
},
|
|
|
|
|
[](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) {
|
|
|
|
|
self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope;
|
|
|
|
|
|
|
|
|
|
},
|
|
|
|
|
R"DOC(The type is INT, num_iteration_per_drop_scope indicates how
|
|
|
|
|
many iterations to clean up the temp variables which
|
|
|
|
|
is generated during execution. It may make the execution faster,
|
|
|
|
|
because the temp variable's shape maybe the same between two iterations. Default 100.
|
|
|
|
|
|
|
|
|
|
NOTES:
|
|
|
|
|
1. If you fetch data when calling the 'run', the ParallelExecutor
|
|
|
|
|
will clean up the temp variables at the end of the current iteration.
|
|
|
|
|
2. In some NLP model, it may cause the GPU memory is insufficient,
|
|
|
|
|
in this case, you should reduce `num_iteration_per_drop_scope`.
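
For example, a minimal sketch (assuming `loss` comes from an already built
program, as in the class-level example above):

.. code-block:: python

    exec_strategy = fluid.ExecutionStrategy()
    # Drop the local scopes every 10 iterations to bound the memory held
    # by temporary variables, e.g. for variable-length NLP inputs.
    exec_strategy.num_iteration_per_drop_scope = 10
    train_exe = fluid.ParallelExecutor(use_cuda=True,
                                       loss_name=loss.name,
                                       exec_strategy=exec_strategy)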
|
|
|
|
|
)DOC");
|
|
|
|
|
|
|
|
|
|
exec_strategy.def_property(
|
|
|
|
|
"use_experimental_executor",
|
|
|
|
|
[](const ExecutionStrategy &self) {
|
|
|
|
@@ -721,20 +749,17 @@ All parameter, weight, gradient are variables in Paddle.
|
|
|
|
|
BuildStrategy allows the user to more precisely control how to
build the SSA Graph in ParallelExecutor by setting its properties.
|
|
|
|
|
|
|
|
|
|
The available properties include:
|
|
|
|
|
reduce_strategy (str): There are two reduce strategies, 'AllReduce'
and 'Reduce'. If you want all parameters to be optimized
on all devices, choose 'AllReduce'; if you choose
'Reduce', the parameters are evenly distributed across
devices for optimization, and the optimized parameters are
then broadcast to the other devices. Default 'AllReduce'.
|
|
|
|
|
gradient_scale_strategy (str): There are three ways of defining loss@grad,
'CoeffNumDevice', 'One' and 'Customized'. By default, ParallelExecutor
sets the loss@grad according to the number of devices. If you want
to customize loss@grad, you can choose 'Customized'.
Default 'CoeffNumDevice'.
|
|
|
|
|
debug_graphviz_path (str): The path to which the SSA Graph will be written
in graphviz format. It is useful for debugging. Default "".
|
|
|
|
|
Examples:
|
|
|
|
|
.. code-block:: python
|
|
|
|
|
|
|
|
|
|
build_strategy = fluid.BuildStrategy()
|
|
|
|
|
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
|
|
|
|
|
|
|
|
|
|
train_exe = fluid.ParallelExecutor(use_cuda=True,
|
|
|
|
|
loss_name=loss.name,
|
|
|
|
|
build_strategy=build_strategy)
|
|
|
|
|
|
|
|
|
|
train_loss, = train_exe.run([loss.name], feed=feed_dict)
|
|
|
|
|
)DOC");
|
|
|
|
|
|
|
|
|
|
py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
|
|
|
|
@@ -753,31 +778,51 @@ All parameter, weight, gradient are variables in Paddle.
|
|
|
|
|
[](const BuildStrategy &self) { return self.reduce_; },
|
|
|
|
|
[](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
|
|
|
|
|
self.reduce_ = strategy;
|
|
|
|
|
|
|
|
|
|
},
|
|
|
|
|
R"DOC(The type is STR, there are two reduce strategies in ParallelExecutor,
|
|
|
|
|
'AllReduce' and 'Reduce'. If you want that all the parameters'
|
|
|
|
|
optimization are done on all devices independently, you should choose 'AllReduce';
|
|
|
|
|
if you choose 'Reduce', all the parameters' optimization will be evenly distributed
|
|
|
|
|
to different devices, and then broadcast the optimized parameter to other devices.
|
|
|
|
|
In some models, `Reduce` is faster. Default 'AllReduce'. )DOC")
|
|
|
|
|
.def_property(
|
|
|
|
|
"gradient_scale_strategy",
|
|
|
|
|
[](const BuildStrategy &self) { return self.gradient_scale_; },
|
|
|
|
|
[](BuildStrategy &self,
|
|
|
|
|
BuildStrategy::GradientScaleStrategy strategy) {
|
|
|
|
|
self.gradient_scale_ = strategy;
|
|
|
|
|
|
|
|
|
|
},
|
|
|
|
|
R"DOC(The type is STR, there are three ways of defining :math:`loss@grad` in
|
|
|
|
|
ParallelExecutor, 'CoeffNumDevice', 'One' and 'Customized'. By default,
|
|
|
|
|
ParallelExecutor sets the :math:`loss@grad` according to the number of devices.
|
|
|
|
|
If you want to customize :math:`loss@grad`, you can choose 'Customized'.
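
For example, a minimal sketch (this assumes the GradientScaleStrategy enum
is exported on BuildStrategy in the same way as ReduceStrategy above):

.. code-block:: python

    build_strategy = fluid.BuildStrategy()
    # Keep :math:`loss@grad` fixed at 1 instead of scaling it by the
    # device count.
    build_strategy.gradient_scale_strategy = \
        fluid.BuildStrategy.GradientScaleStrategy.One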
|
|
|
|
|
)DOC")
|
|
|
|
|
.def_property(
|
|
|
|
|
"debug_graphviz_path",
|
|
|
|
|
[](const BuildStrategy &self) { return self.debug_graphviz_path_; },
|
|
|
|
|
[](BuildStrategy &self, const std::string &path) {
|
|
|
|
|
self.debug_graphviz_path_ = path;
|
|
|
|
|
|
|
|
|
|
},
|
|
|
|
|
R"DOC(The type is STR, debug_graphviz_path indicate the path that
|
|
|
|
|
writing the SSA Graph to file in the form of graphviz, you.
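
For example, a minimal sketch (the output path is only an illustration):

.. code-block:: python

    build_strategy = fluid.BuildStrategy()
    # Dump the SSA Graph so it can be rendered with graphviz tools
    # such as `dot`.
    build_strategy.debug_graphviz_path = "/tmp/ssa_graph.dot"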
|
|
|
|
|
)DOC")
|
|
|
|
|
.def_property(
|
|
|
|
|
"enable_data_balance",
|
|
|
|
|
[](const BuildStrategy &self) { return self.enable_data_balance_; },
|
|
|
|
|
[](BuildStrategy &self, bool b) { self.enable_data_balance_ = b; })
|
|
|
|
|
.def_property("fuse_elewise_add_act_ops",
|
|
|
|
|
[](const BuildStrategy &self) {
|
|
|
|
|
return self.fuse_elewise_add_act_ops_;
|
|
|
|
|
},
|
|
|
|
|
[](BuildStrategy &self, bool b) {
|
|
|
|
|
self.fuse_elewise_add_act_ops_ = b;
|
|
|
|
|
})
|
|
|
|
|
[](BuildStrategy &self, bool b) {
|
|
|
|
|
self.enable_data_balance_ = b;
|
|
|
|
|
}) // FIXME(chengduo): enable_data_balance seems not important
|
|
|
|
|
.def_property(
|
|
|
|
|
"fuse_elewise_add_act_ops",
|
|
|
|
|
[](const BuildStrategy &self) {
|
|
|
|
|
return self.fuse_elewise_add_act_ops_;
|
|
|
|
|
},
|
|
|
|
|
[](BuildStrategy &self, bool b) {
|
|
|
|
|
self.fuse_elewise_add_act_ops_ = b;
|
|
|
|
|
},
|
|
|
|
|
R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether
|
|
|
|
|
to fuse elementwise_add_op and activation_op,
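
For example, a minimal sketch:

.. code-block:: python

    build_strategy = fluid.BuildStrategy()
    # Fuse adjacent elementwise_add and activation operators into a
    # single fused operator.
    build_strategy.fuse_elewise_add_act_ops = True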
|
|
|
|
|
)DOC")
|
|
|
|
|
.def("_create_passes_from_strategy",
|
|
|
|
|
[](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
|
|
|
|
|
return self.CreatePassesFromStrategy();
|
|
|
|
|