You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Paddle/python/paddle/fluid/dygraph/parallel.py

190 lines
6.1 KiB

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except jin compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import six
import numpy as np
from .. import core
from . import layers
from . import parallel_helper
from .. import framework
from ..layers import collective
from . import to_variable
__all__ = ["prepare_context"]
ParallelStrategy = core.ParallelStrategy
def prepare_context(strategy=None):
if strategy is None:
strategy = ParallelStrategy()
strategy.nranks = Env().nranks
strategy.local_rank = Env().local_rank
strategy.trainer_endpoints = Env().trainer_endpoints
strategy.current_endpoint = Env().current_endpoint
if strategy.nranks < 2:
return
assert framework.in_dygraph_mode() is True,\
"dygraph.parallel.prepare_context should be used with dygrahp mode."
place = framework._current_expected_place()
assert place is not None, \
"dygraph.parallel.prepare_context should be used in fluid.dygraph.guard(place) guard."
if isinstance(place, core.CUDAPlace):
parallel_helper._set_parallel_ctx(
core.NCCLParallelContext(strategy, place))
else:
# TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation
assert ("Only support CUDAPlace for now.")
parallel_helper._init_parallel_ctx()
return strategy
class Env(object):
def __init__(self):
self._nranks = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
self._local_rank = int(os.getenv("PADDLE_TRAINER_ID", "0"))
self._dev_id = int(os.getenv("FLAGS_selected_gpus", "0"))
self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
"").split(",")
self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "")
@property
def nranks(self):
return self._nranks
@property
def local_rank(self):
return self._local_rank
@property
def dev_id(self):
return self._dev_id
@property
def current_endpoint(self):
return self._current_endpoint
@property
def trainer_endpoints(self):
return self._trainer_endpoints
class DataParallel(layers.Layer):
"""
Runs the module with data parallelism.
Currently, DataParallel only supports to run the dynamic graph
with multi-process. The usage is:
`python -m paddle.distributed.launch --gpus 2 dynamic_graph_test.py`.
And the content of `dynamic_graph_test.py` is the code of examples.
Examples:
.. code-block:: python
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dygraph
from paddle.fluid.optimizer import AdamOptimizer
from paddle.fluid.dygraph.nn import FC
from paddle.fluid.dygraph.base import to_variable
place = fluid.CUDAPlace(0)
with fluid.dygraph.guard(place=place):
# prepare the data parallel context
strategy=dygraph.parallel.prepare_context()
fc_layer = FC("FC", 10, act="softmax")
adam = fluid.optimizer.AdamOptimizer()
# make the module become the data parallelism module
fc_layer = dygraph.parallel.DataParallel(fc_layer, strategy)
x_data = np.random.random(size=[10, 1]).astype(np.float32)
data = to_variable(x_data)
hidden = fc_layer(data)
avg_loss = fluid.layers.mean(hidden)
# scale the loss according to the number of trainers.
avg_loss = fc_layer.scale_loss(avg_loss)
avg_loss.backward()
# collect the gradients of trainers.
fc_layer.apply_collective_grads()
adam.minimize(avg_loss)
fc_layer.clear_gradients()
Args:
layers(Layer): The module that should be executed by data parallel.
strategy(ParallelStrategy): The strategy of data parallelism.
Returns:
Layer: The data paralleled module.
"""
def __init__(self, layers, strategy):
super(DataParallel,
self).__init__(layers.full_name() + "_data_parallel")
self._layers = layers
self._strategy = strategy
def forward(self, *inputs, **kwargs):
return self._layers(*inputs, **kwargs)
def scale_loss(self, loss):
"""
Scale the loss. In data parallel mode, the loss should be scale with
the number of trainers. If not in data parallel mode, return the loss
directly.
Args:
loss(Layer): The loss of the current Model.
Returns:
Layer: the scaled loss.
"""
if not self._is_data_parallel_mode():
return loss
loss_scale = to_variable(
np.array([self._strategy.nranks]).astype("float32"))
loss_scale.stop_gradient = True
loss = loss / loss_scale
return loss
def apply_collective_grads(self):
"""
AllReduce the Parameters' gradient.
"""
if not self._is_data_parallel_mode():
return
for param in self._layers.parameters():
# NOTE(zcd): The grad_ivar maybe no generated.
if param.trainable and param._ivar._grad_ivar():
g_var = framework.Variable(
block=self._helper.main_program.current_block(),
name=param._ivar._grad_name(),
stop_gradient=True,
ivar=param._ivar._grad_ivar())
collective._allreduce(g_var, g_var, sync_mode=True)
def _is_data_parallel_mode(self):
return self._strategy.nranks > 1