Remove DataParallel.scale_loss & apply_collective_grads (#27603)

* remove data parallel scale loss & apply collective_grads

* move apply in minimize

* fix failed unittests
Chen Weihang, committed by GitHub
parent 7b46fb0f14
commit dec53a9c79
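With this change, loss scaling moves into Tensor.backward() and the gradient all-reduce into the optimizer, so a dygraph data-parallel training step no longer needs the manual DataParallel calls. A minimal before/after sketch of the user-side loop (model, opt, and loss are placeholder names, not taken from this diff):

    # before this PR (dygraph data parallel with nccl2)
    loss = model.scale_loss(loss)
    loss.backward()
    model.apply_collective_grads()
    opt.minimize(loss)
    model.clear_gradients()

    # after this PR: scaling and all-reduce happen inside backward()/minimize()
    loss.backward()
    opt.minimize(loss)
    model.clear_gradients()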

File diff suppressed because it is too large.

@@ -13,12 +13,15 @@
 # limitations under the License.

 import inspect
+import numpy as np
+
+import paddle
 from .. import framework
 from .. import core
 from ..framework import Variable, Parameter, ParamBase
 from .base import switch_to_static_graph
-import numpy as np
 from .math_op_patch import monkey_patch_math_varbase
+from .parallel import scale_loss


 def monkey_patch_varbase():
@@ -165,7 +168,12 @@ def monkey_patch_varbase():
         """
         if framework.in_dygraph_mode():
-            self._run_backward(framework._dygraph_tracer(), retain_graph)
+            if paddle.distributed.get_world_size() > 1:
+                scaled_loss = scale_loss(self)
+                scaled_loss._run_backward(framework._dygraph_tracer(),
+                                          retain_graph)
+            else:
+                self._run_backward(framework._dygraph_tracer(), retain_graph)
         else:
             raise ValueError(
                 "Variable.backward() is only available in DyGraph mode")

@@ -19,8 +19,10 @@ import six
 import logging
 from collections import defaultdict

+import paddle
 from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
 from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard
+from paddle.fluid.dygraph.parallel import apply_collective_grads

 from . import framework
 from . import layers
@@ -40,7 +42,6 @@ from paddle.fluid.layers import tensor
 from functools import reduce
 from .wrapped_decorator import signature_safe_contextmanager
 from .. import compat as cpt
-import paddle

 __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad',
@@ -771,8 +772,14 @@ class Optimizer(object):
         self._dtype = loss.dtype
         if framework.in_dygraph_mode():
+            parameter_list = parameter_list if parameter_list \
+                else self._parameter_list
+
+            if paddle.distributed.get_world_size() > 1:
+                apply_collective_grads(parameter_list)
+
             params_grads = []
-            for param in self._parameter_list:
+            for param in parameter_list:
                 if not param.trainable:
                     continue
                 if param._grad_ivar() is not None:
@@ -939,6 +946,7 @@ class Optimizer(object):
         parameter_list = parameter_list if parameter_list \
             else self._parameter_list
+
         params_grads = self.backward(
             loss,
             startup_program=startup_program,
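In the dygraph branch of Optimizer.backward() above, gradients are now all-reduced across trainers via apply_collective_grads before being gathered into params_grads. A conceptual sketch of what that step is assumed to amount to (the real helper in paddle.fluid.dygraph.parallel also coalesces tensors so that fewer collective calls are issued):

    import paddle

    def allreduce_grads_sketch(parameter_list):
        # assumed behavior: sum every trainable gradient across all trainers in place
        for param in parameter_list:
            if param.trainable and param._grad_ivar() is not None:
                paddle.distributed.all_reduce(param._grad_ivar())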

@@ -435,13 +435,7 @@ class TestParallelDyGraphRunnerBase(object):
                 "loss at step %d: %f" % (step_id, loss.numpy()))
             out_losses.append(loss.numpy())

-            # FIXME(Yancey1989): scale the loss inplace
-            if args.update_method == "nccl2":
-                loss = model.scale_loss(loss)
             loss.backward()
-            if args.update_method == "nccl2":
-                model.apply_collective_grads()
-
             opt.minimize(loss)
             model.clear_gradients()
@@ -477,12 +471,7 @@ class TestParallelDyGraphRunnerBase(object):
             loss = self.run_one_loop(model, opt, data)
             out_losses.append(loss.numpy())

-            if args.update_method == "nccl2":
-                loss = model.scale_loss(loss)
             loss.backward()
-            if args.update_method == "nccl2":
-                model.apply_collective_grads()
-
             opt.minimize(loss)
             model.clear_gradients()
@@ -521,12 +510,7 @@ class TestParallelDyGraphRunnerBase(object):
             loss = self.run_one_loop(model, opt, data)
             out_losses.append(loss.numpy())

-            if args.update_method == "nccl2":
-                loss = model.scale_loss(loss)
             loss.backward()
-            if args.update_method == "nccl2":
-                model.apply_collective_grads()
-
             opt.step()
             opt.clear_grad()

@@ -22,6 +22,7 @@ import paddle.fluid as fluid
 from paddle.fluid import core
 from paddle.fluid.dygraph.parallel import DataParallel
 from paddle.fluid.dygraph.base import to_variable
+from paddle.fluid.dygraph.parallel import _coalesce_tensors, _split_tensors, _reshape_inplace


 class MyLayer(fluid.Layer):
@@ -57,8 +58,8 @@ class TestImperativeParallelCoalesceSplit(unittest.TestCase):
             orig_var_shapes.append(var.shape)

         # execute interface
-        coalesced_vars = test_layer._coalesce_tensors(var_groups)
-        test_layer._split_tensors(coalesced_vars)
+        coalesced_vars = _coalesce_tensors(var_groups)
+        _split_tensors(coalesced_vars)

         # compare
         for orig_var_shape, var in zip(orig_var_shapes, vars):
@@ -74,7 +75,7 @@ class TestImperativeParallelCoalesceSplit(unittest.TestCase):
         new_shape = [5, 10]
         x_data = np.random.random(ori_shape).astype("float32")
         x = to_variable(x_data)
-        test_layer._reshape_inplace(x, new_shape)
+        _reshape_inplace(x, new_shape)
         self.assertEqual(x.shape, new_shape)

@@ -17,6 +17,9 @@ from ..fluid import core
 from ..fluid import framework
 from ..fluid.framework import Variable

+import paddle
+from paddle.fluid.dygraph.parallel import apply_collective_grads
+
 __all__ = ["Adam"]
@@ -276,7 +279,9 @@ class Adam(Optimizer):
                 adam.step()
                 adam.clear_grad()
         """
-        parameter_list = self._parameter_list
+        if paddle.distributed.get_world_size() > 1:
+            apply_collective_grads(self._parameter_list)
+
         self._dtype = None
         params_grads = []
         for param in self._parameter_list:

@@ -16,6 +16,8 @@ from .optimizer import Optimizer
 from .adam import Adam
 from ..fluid import framework
 import paddle
+from paddle.fluid.dygraph.parallel import apply_collective_grads
+
 __all__ = ['AdamW']
@@ -184,6 +186,9 @@ class AdamW(Adam):
                  startup_program=None,
                  parameters=None,
                  no_grad_set=None):
+        parameters = parameters if parameters \
+            else self._parameter_list
+
         params_grads = self.backward(
             loss=loss,
             startup_program=startup_program,
@@ -206,7 +211,9 @@
     @framework.dygraph_only
     def step(self):
-        parameter_list = self._parameter_list
+        if paddle.distributed.get_world_size() > 1:
+            apply_collective_grads(self._parameter_list)
+
         self._dtype = None
         params_grads = []
         for param in self._parameter_list:
@@ -224,7 +231,7 @@ class AdamW(Adam):
                 updated_param = paddle.fluid.layers.elementwise_sub(
                     x=param, y=scaled_param)
                 param.set_value(updated_param.numpy())
-        optimize_ops = self._apply_optimize(
+        self._apply_optimize(
             loss=None, startup_program=None, params_grads=params_grads)

     def __str__(self):

@@ -19,9 +19,10 @@ import six
 import logging
 from collections import defaultdict

+import paddle
 from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
 from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard
-import paddle
+from paddle.fluid.dygraph.parallel import apply_collective_grads

 from ..fluid import framework
 from ..fluid import layers
@@ -675,8 +676,14 @@ class Optimizer(object):
         self._dtype = loss.dtype
         if framework.in_dygraph_mode():
+            parameter_list = parameters if parameters \
+                else self._parameter_list
+
+            if paddle.distributed.get_world_size() > 1:
+                apply_collective_grads(parameter_list)
+
             params_grads = []
-            for param in self._parameter_list:
+            for param in parameter_list:
                 if not param.trainable:
                     continue
                 if param._grad_ivar() is not None:
@@ -871,6 +878,7 @@ class Optimizer(object):
         parameter_list = parameters if parameters \
             else self._parameter_list
+
         params_grads = self.backward(
             loss,
             startup_program=startup_program,
@@ -907,7 +915,9 @@
                 adam.step()
                 adam.clear_grad()
         """
-        parameter_list = self._parameter_list
+        if paddle.distributed.get_world_size() > 1:
+            apply_collective_grads(self._parameter_list)
+
         self._dtype = None
         params_grads = []
         for param in self._parameter_list:
@@ -917,5 +927,5 @@ class Optimizer(object):
                 grad_var = param._grad_ivar()
                 params_grads.append((param, grad_var))

-        optimize_ops = self._apply_optimize(
+        self._apply_optimize(
             loss=None, startup_program=None, params_grads=params_grads)
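Taken together with the updated tests above, a 2.0-style data-parallel step reduces to backward/step/clear_grad, with scaling and all-reduce handled internally. A minimal sketch, assuming the parallel environment is already initialized and that paddle.DataParallel is the 2.0 alias of the fluid wrapper; layer, loader, and opt are placeholders:

    import paddle

    model = paddle.DataParallel(layer)   # wrap a user-defined Layer (placeholder)
    for batch in loader():               # placeholder data source
        loss = model(batch).mean()
        loss.backward()                  # loss is scaled internally when world_size > 1
        opt.step()                       # grads are all-reduced internally before the update
        opt.clear_grad()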
