Remove DataParallel.scale_loss & apply_collective_grads (#27603)

* remove DataParallel.scale_loss & apply_collective_grads

* move apply_collective_grads into minimize

* fix failing unit tests
Chen Weihang 4 years ago committed by GitHub
parent 7b46fb0f14
commit dec53a9c79

File diff suppressed because it is too large.

@@ -13,12 +13,15 @@
# limitations under the License.
import inspect
import numpy as np
import paddle
from .. import framework
from .. import core
from ..framework import Variable, Parameter, ParamBase
from .base import switch_to_static_graph
import numpy as np
from .math_op_patch import monkey_patch_math_varbase
from .parallel import scale_loss
def monkey_patch_varbase():
@@ -165,7 +168,12 @@ def monkey_patch_varbase():
"""
if framework.in_dygraph_mode():
self._run_backward(framework._dygraph_tracer(), retain_graph)
if paddle.distributed.get_world_size() > 1:
scaled_loss = scale_loss(self)
scaled_loss._run_backward(framework._dygraph_tracer(),
retain_graph)
else:
self._run_backward(framework._dygraph_tracer(), retain_graph)
else:
raise ValueError(
"Variable.backward() is only available in DyGraph mode")

@@ -19,8 +19,10 @@ import six
import logging
from collections import defaultdict
import paddle
from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard
from paddle.fluid.dygraph.parallel import apply_collective_grads
from . import framework
from . import layers
@@ -40,7 +42,6 @@ from paddle.fluid.layers import tensor
from functools import reduce
from .wrapped_decorator import signature_safe_contextmanager
from .. import compat as cpt
import paddle
__all__ = [
'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad',
@@ -771,8 +772,14 @@ class Optimizer(object):
self._dtype = loss.dtype
if framework.in_dygraph_mode():
parameter_list = parameter_list if parameter_list \
else self._parameter_list
if paddle.distributed.get_world_size() > 1:
apply_collective_grads(parameter_list)
params_grads = []
for param in self._parameter_list:
for param in parameter_list:
if not param.trainable:
continue
if param._grad_ivar() is not None:
@@ -939,6 +946,7 @@ class Optimizer(object):
parameter_list = parameter_list if parameter_list \
else self._parameter_list
params_grads = self.backward(
loss,
startup_program=startup_program,
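With these optimizer changes, the gradient all-reduce is triggered inside `backward()`/`minimize()` instead of by user code. A rough sketch of what `apply_collective_grads(parameter_list)` amounts to follows; it is an approximation only — the real helper in paddle/fluid/dygraph/parallel.py coalesces dense gradients into large buffers and handles sparse gradients separately, and the `paddle.distributed.all_reduce` call here stands in for the lower-level collective it actually uses.

```python
# Approximate sketch of the collective-gradient step that minimize() now runs
# when paddle.distributed.get_world_size() > 1. Not the real implementation:
# the actual helper coalesces gradients before communicating and treats
# sparse gradients separately.
import paddle

def apply_collective_grads_sketch(parameters):
    for param in parameters:
        if not param.trainable or param._grad_ivar() is None:
            continue
        # sum this gradient across all trainers; combined with the 1/world_size
        # loss scaling in backward(), this yields the averaged gradient
        paddle.distributed.all_reduce(param._grad_ivar())
```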

@@ -435,13 +435,7 @@ class TestParallelDyGraphRunnerBase(object):
"loss at step %d: %f" % (step_id, loss.numpy()))
out_losses.append(loss.numpy())
# FIXME(Yancey1989): scale the loss inplace
if args.update_method == "nccl2":
loss = model.scale_loss(loss)
loss.backward()
if args.update_method == "nccl2":
model.apply_collective_grads()
opt.minimize(loss)
model.clear_gradients()
@@ -477,12 +471,7 @@ class TestParallelDyGraphRunnerBase(object):
loss = self.run_one_loop(model, opt, data)
out_losses.append(loss.numpy())
if args.update_method == "nccl2":
loss = model.scale_loss(loss)
loss.backward()
if args.update_method == "nccl2":
model.apply_collective_grads()
opt.minimize(loss)
model.clear_gradients()
@@ -521,12 +510,7 @@ class TestParallelDyGraphRunnerBase(object):
loss = self.run_one_loop(model, opt, data)
out_losses.append(loss.numpy())
if args.update_method == "nccl2":
loss = model.scale_loss(loss)
loss.backward()
if args.update_method == "nccl2":
model.apply_collective_grads()
opt.step()
opt.clear_grad()
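The three test loops above all shrink the same way: the conditional `scale_loss` / `apply_collective_grads` calls around `backward()` disappear. As a hedged usage sketch (the model and optimizer here are placeholders, not part of the commit), a data-parallel dygraph step now reduces to:

```python
# Illustrative training step after this change; LinearNet-style model and SGD
# are stand-ins chosen only for brevity.
import paddle
import paddle.distributed as dist

def train():
    dist.init_parallel_env()                           # one process per card
    model = paddle.DataParallel(paddle.nn.Linear(10, 1))
    opt = paddle.optimizer.SGD(learning_rate=0.01,
                               parameters=model.parameters())
    for _ in range(3):
        x = paddle.randn([4, 10], dtype='float32')
        loss = paddle.mean(model(x))
        loss.backward()        # loss is scaled by 1/world_size inside backward()
        opt.step()             # gradients are all-reduced inside step()/minimize()
        opt.clear_grad()       # no explicit scale_loss()/apply_collective_grads() needed
```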

@@ -22,6 +22,7 @@ import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.dygraph.parallel import DataParallel
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph.parallel import _coalesce_tensors, _split_tensors, _reshape_inplace
class MyLayer(fluid.Layer):
@@ -57,8 +58,8 @@ class TestImperativeParallelCoalesceSplit(unittest.TestCase):
orig_var_shapes.append(var.shape)
# execute interface
coalesced_vars = test_layer._coalesce_tensors(var_groups)
test_layer._split_tensors(coalesced_vars)
coalesced_vars = _coalesce_tensors(var_groups)
_split_tensors(coalesced_vars)
# compare
for orig_var_shape, var in zip(orig_var_shapes, vars):
@@ -74,7 +75,7 @@ class TestImperativeParallelCoalesceSplit(unittest.TestCase):
new_shape = [5, 10]
x_data = np.random.random(ori_shape).astype("float32")
x = to_variable(x_data)
test_layer._reshape_inplace(x, new_shape)
_reshape_inplace(x, new_shape)
self.assertEqual(x.shape, new_shape)
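The coalesce/split/reshape helpers are now exercised as module-level functions of `paddle.fluid.dygraph.parallel` rather than as `DataParallel` methods. Below is a minimal standalone sketch mirroring the updated test, under the assumption that `var_groups` is an OrderedDict mapping a group id to a list of tensors, as the test uses; shapes are illustrative.

```python
# Minimal sketch of calling the now module-level helpers directly.
from collections import OrderedDict
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph.parallel import (_coalesce_tensors, _split_tensors,
                                           _reshape_inplace)

with fluid.dygraph.guard():
    tensors = [to_variable(np.random.random(shape).astype("float32"))
               for shape in ([2, 3], [4, 9], [10, 1])]
    var_groups = OrderedDict()
    var_groups.setdefault(0, tensors)            # a single group, as in the test

    coalesced = _coalesce_tensors(var_groups)    # concat each group into one buffer
    _split_tensors(coalesced)                    # restore the original per-tensor shapes

    x = to_variable(np.random.random([10, 16]).astype("float32"))
    _reshape_inplace(x, [4, 40])                 # in-place reshape, 10*16 == 4*40
    assert x.shape == [4, 40]
```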

@@ -17,6 +17,9 @@ from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable
import paddle
from paddle.fluid.dygraph.parallel import apply_collective_grads
__all__ = ["Adam"]
@@ -276,7 +279,9 @@ class Adam(Optimizer):
adam.step()
adam.clear_grad()
"""
parameter_list = self._parameter_list
if paddle.distributed.get_world_size() > 1:
apply_collective_grads(self._parameter_list)
self._dtype = None
params_grads = []
for param in self._parameter_list:

@@ -16,6 +16,8 @@ from .optimizer import Optimizer
from .adam import Adam
from ..fluid import framework
import paddle
from paddle.fluid.dygraph.parallel import apply_collective_grads
__all__ = ['AdamW']
@@ -184,6 +186,9 @@ class AdamW(Adam):
startup_program=None,
parameters=None,
no_grad_set=None):
parameters = parameters if parameters \
else self._parameter_list
params_grads = self.backward(
loss=loss,
startup_program=startup_program,
@@ -206,7 +211,9 @@ class AdamW(Adam):
@framework.dygraph_only
def step(self):
parameter_list = self._parameter_list
if paddle.distributed.get_world_size() > 1:
apply_collective_grads(self._parameter_list)
self._dtype = None
params_grads = []
for param in self._parameter_list:
@@ -224,7 +231,7 @@ class AdamW(Adam):
updated_param = paddle.fluid.layers.elementwise_sub(
x=param, y=scaled_param)
param.set_value(updated_param.numpy())
optimize_ops = self._apply_optimize(
self._apply_optimize(
loss=None, startup_program=None, params_grads=params_grads)
def __str__(self):

@@ -19,9 +19,10 @@ import six
import logging
from collections import defaultdict
import paddle
from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard
import paddle
from paddle.fluid.dygraph.parallel import apply_collective_grads
from ..fluid import framework
from ..fluid import layers
@@ -675,8 +676,14 @@ class Optimizer(object):
self._dtype = loss.dtype
if framework.in_dygraph_mode():
parameter_list = parameters if parameters \
else self._parameter_list
if paddle.distributed.get_world_size() > 1:
apply_collective_grads(parameter_list)
params_grads = []
for param in self._parameter_list:
for param in parameter_list:
if not param.trainable:
continue
if param._grad_ivar() is not None:
@@ -871,6 +878,7 @@ class Optimizer(object):
parameter_list = parameters if parameters \
else self._parameter_list
params_grads = self.backward(
loss,
startup_program=startup_program,
@@ -907,7 +915,9 @@ class Optimizer(object):
adam.step()
adam.clear_grad()
"""
parameter_list = self._parameter_list
if paddle.distributed.get_world_size() > 1:
apply_collective_grads(self._parameter_list)
self._dtype = None
params_grads = []
for param in self._parameter_list:
@@ -917,5 +927,5 @@ class Optimizer(object):
grad_var = param._grad_ivar()
params_grads.append((param, grad_var))
optimize_ops = self._apply_optimize(
self._apply_optimize(
loss=None, startup_program=None, params_grads=params_grads)
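Taken together, the fluid and 2.0 optimizers now share the same guard: when `paddle.distributed.get_world_size() > 1`, `apply_collective_grads` runs once inside `backward()`/`step()` before gradients are collected. That world size only exceeds one when a process per card has been launched; a hedged launch sketch using `paddle.distributed.spawn` (reusing the illustrative `train()` step sketched after the test_dist_base hunks; `paddle.distributed.launch` on the command line is the other common route) is:

```python
# Launch sketch: start one process per card so that get_world_size() > 1 and
# the new in-optimizer all-reduce path is taken. `train` is the illustrative
# step function defined in the earlier sketch.
import paddle.distributed as dist

if __name__ == '__main__':
    dist.spawn(train, nprocs=2)
```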
