@@ -14,7 +14,7 @@
import os
import six
import numpy as np
from collections import OrderedDict
from .. import core
from . import layers
from . import parallel_helper
@@ -36,7 +36,7 @@ def prepare_context(strategy=None):
    strategy.current_endpoint = Env().current_endpoint
    if strategy.nranks < 2:
        return
    assert framework.in_dygraph_mode() is True,\
    assert framework.in_dygraph_mode() is True, \
        "dygraph.parallel.prepare_context should be used with dygraph mode."
    place = framework._current_expected_place()
    assert place is not None, \
@@ -168,6 +168,37 @@ class DataParallel(layers.Layer):
        loss = loss / loss_scale
        return loss
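        # Note: the loss is divided by loss_scale (set earlier in scale_loss,
        # presumably to the trainer count, strategy.nranks), so that summing
        # the gradients with allreduce across trainers yields an average.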

    def _coalesce_tensors(self, var_groups):
        from ..layers import nn
        coalesced_grads_and_grad_vars = []
        for group_id, grad_vars in var_groups.items():
            flattened_vars = []
            g_var_shapes = []
            for g_var in grad_vars:
                g_var_shapes.append(g_var.shape)
                flattened_vars.append(
                    nn.reshape(
                        x=g_var, shape=[np.prod(g_var.shape)], inplace=True))
            coalesced_grad = nn.concat(flattened_vars)
            coalesced_grads_and_grad_vars.append(
                [coalesced_grad, grad_vars, g_var_shapes])
        return coalesced_grads_and_grad_vars
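        # Each entry pairs one contiguous buffer with the original gradient
        # Variables and their shapes: every gradient in a group is flattened by
        # nn.reshape and packed into a single tensor by nn.concat, so a whole
        # group can later be allreduced in one collective call instead of one
        # call per gradient.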

    def _split_tensors(self, coalesced_grads_and_grad_vars):
        from ..layers import nn
        for coalesced_grad, origin_grad_vars, grad_shapes in coalesced_grads_and_grad_vars:
            grad_var_len = [np.prod(g_shape) for g_shape in grad_shapes]
            splited_vars = nn.split(
                coalesced_grad, num_or_sections=grad_var_len, dim=0)
            reshaped_grad_vars = []
            for g_var, g_shape in zip(splited_vars, grad_shapes):
                reshaped_grad_vars.append(
                    nn.reshape(
                        x=g_var, shape=g_shape, inplace=True))
            for origin_g_var, reshaped_g_var in zip(origin_grad_vars,
                                                    reshaped_grad_vars):
                nn.assign(input=reshaped_g_var, output=origin_g_var)
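        # Inverse of _coalesce_tensors: the allreduced buffer is split back by
        # the recorded flattened lengths, each piece is reshaped to its original
        # shape, and nn.assign copies the result into the original gradient
        # Variable so the optimizer sees the aggregated gradients in place.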

    def apply_collective_grads(self):
        """
        AllReduce the Parameters' gradient.
@@ -175,6 +206,8 @@ class DataParallel(layers.Layer):
        if not self._is_data_parallel_mode():
            return

        grad_var_set = set()
        grad_vars = []
        for param in self._layers.parameters():
            # NOTE(zcd): The grad_ivar may not be generated.
            if param.trainable and param._ivar._grad_ivar():
@@ -183,7 +216,36 @@ class DataParallel(layers.Layer):
                    name=param._ivar._grad_name(),
                    stop_gradient=True,
                    ivar=param._ivar._grad_ivar())
                collective._allreduce(g_var, g_var, sync_mode=True)
                grad_vars.append(g_var)
                assert g_var not in grad_var_set
                grad_var_set.add(g_var)

        # FIXME(zcd): the type of the var should be LoDTensor, i.e.
        # the gradients should be dense, otherwise, the following
        # logic should be updated.
        # 128 MB as a group
        mega_bytes = 128 * 1024 * 1024
        group_idx = 0
        memory_counter = 0
        grad_var_groups = OrderedDict()
        dtype = grad_vars[0].dtype
        for g_var in grad_vars:
            # Note: the dtype of the same group should be the same.
            bytes = np.prod(g_var.shape) * core.size_of_dtype(g_var.dtype)
            if memory_counter < mega_bytes and dtype == g_var.dtype:
                memory_counter += bytes
            else:
                memory_counter = bytes
                dtype = g_var.dtype  # start a new group and track its dtype
                group_idx += 1
            grad_var_groups.setdefault(group_idx, []).append(g_var)
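        # Grouping sketch: gradients are packed greedily into buckets of at
        # most ~128 MB that share a dtype. For example, a float32 gradient of
        # shape [1024, 1024] contributes 1024 * 1024 * 4 = 4 MB, so roughly 32
        # such gradients end up in one bucket and are allreduced together.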

        coalesced_grads_and_vars = self._coalesce_tensors(grad_var_groups)

        for coalesced_grad, g_vars, g_shapes in coalesced_grads_and_vars:
            collective._allreduce(
                coalesced_grad, coalesced_grad, sync_mode=False)

        self._split_tensors(coalesced_grads_and_vars)
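        # One allreduce is issued per coalesced bucket instead of one per
        # parameter; _split_tensors then copies the aggregated results back
        # into the original per-parameter gradient Variables.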

    def _is_data_parallel_mode(self):
        return self._strategy.nranks > 1
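
# Illustrative usage sketch (not part of the patch): one dygraph data-parallel
# training step with the helpers above. `MyLayer`, `train_reader` and the
# learning rate are assumed placeholders for the example.
import paddle.fluid as fluid

place = fluid.CUDAPlace(0)  # one GPU per trainer process
with fluid.dygraph.guard(place):
    strategy = fluid.dygraph.parallel.prepare_context()
    model = fluid.dygraph.parallel.DataParallel(MyLayer("my_layer"), strategy)
    optimizer = fluid.optimizer.SGDOptimizer(learning_rate=1e-3)

    for data in train_reader():
        loss = model(data)
        loss = model.scale_loss(loss)   # divide by nranks so allreduce averages
        loss.backward()
        model.apply_collective_grads()  # coalesced allreduce of all gradients
        optimizer.minimize(loss)
        model.clear_gradients()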