@@ -14,11 +14,13 @@
from paddle.fluid.optimizer import Optimizer
from paddle.fluid.regularizer import L1DecayRegularizer
from paddle.fluid.regularizer import L2DecayRegularizer
from paddle.fluid.regularizer import append_regularization_ops
from paddle.fluid import framework
from paddle.fluid import core
from paddle.fluid.framework import program_guard
from paddle.fluid.clip import append_gradient_clip_ops
from paddle.fluid import unique_name
from paddle.fluid import layers
from paddle.fluid.layer_helper import LayerHelper
import warnings

__all__ = ['Momentum']
@@ -61,6 +63,9 @@ class Momentum(Optimizer):
            some derived class of ``GradientClipBase``. There are three clipping strategies
            (:ref:`api_fluid_clip_GradientClipByGlobalNorm`, :ref:`api_fluid_clip_GradientClipByNorm`,
            :ref:`api_fluid_clip_GradientClipByValue`). Default None, meaning there is no gradient clipping.
        multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is False.
        rescale_grad (float, optional): Multiply the gradient with `rescale_grad` before updating. \
            Often choose to be ``1.0/batch_size``.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.
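A minimal usage sketch of the two new arguments, assuming the class is exported as paddle.fluid.contrib.optimizer.Momentum and that `batch_size` is just a placeholder value:

from paddle.fluid.contrib.optimizer import Momentum

batch_size = 256  # placeholder
optimizer = Momentum(
    learning_rate=0.01,
    momentum=0.9,
    multi_precision=True,           # keep an FP32 master copy of FP16 weights
    rescale_grad=1.0 / batch_size)  # gradient is multiplied by this inside the op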
@@ -105,6 +110,8 @@ class Momentum(Optimizer):
                 use_nesterov=False,
                 regularization=None,
                 grad_clip=None,
                 multi_precision=False,
                 rescale_grad=1.0,
                 name=None):
        assert learning_rate is not None
        assert momentum is not None
@@ -124,11 +131,68 @@ class Momentum(Optimizer):
        if (isinstance(regularization, L2DecayRegularizer)):
            self._regularization_method = "l2_decay"
            self._regularization_coeff = regularization._regularization_coeff
        self._multi_precision = multi_precision
        self._rescale_grad = rescale_grad
        self._master_weights = {}

    def _create_master_weight(self, param):
        assert isinstance(self.helper, LayerHelper)

        var_name = param.name + "_fp32_master"
        var_name = unique_name.generate(var_name)
        var = layers.create_global_var(
            name=var_name,
            shape=param.shape,
            value=0,
            dtype='float32',
            persistable=True)
        block = self.helper.startup_program.global_block()
        block.append_op(
            type="cast",
            inputs={"X": [param]},
            outputs={"Out": [var]},
            attrs={
                "in_dtype": param.dtype,
                "out_dtype": core.VarDesc.VarType.FP32
            })
        self._master_weights[param.name] = var
        return var
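
    # Note on _create_master_weight above: each FP16 parameter gets a
    # persistable FP32 shadow variable named "<param>_fp32_master" (made
    # unique with unique_name.generate), seeded in the startup program by
    # casting the parameter's initial value, and cached in
    # self._master_weights keyed by the original parameter name.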

    def _get_accumulator(self, name, param):
        """Utility function to fetch an accumulator for a parameter

        Args:
            name: name of the accumulator
            param: parameter variable for which accumulator is to be fetched

        Returns:
            accumulator variable for the parameter
        """
        if self._name is not None:
            name = self._name + "_" + name
        find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
        target_param = self._master_weights[
            param.name] if find_master else param
        target_name = target_param.name
        if (name not in self._accumulators or
                target_name not in self._accumulators[name]):
            raise Exception("Accumulator {} does not exist for parameter {}".
                            format(name, target_name))
        return self._accumulators[name][target_name]
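
    # The lookup above pairs with _create_accumulators below: when
    # multi_precision is enabled, the velocity accumulator is registered
    # against the FP32 master weight, so accumulator lookups for an FP16
    # parameter are redirected to the master weight's name rather than the
    # raw parameter name.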

    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)

        for p in parameters:
            if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
                master_p = self._create_master_weight(p)
                self._add_accumulator(self._velocity_acc_str, master_p)
                continue
            if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision:
                warnings.warn(
                    "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence. "
                    "Consider using multi_precision=True option of the Momentum optimizer."
                )
            self._add_accumulator(self._velocity_acc_str, p)

    def _append_optimize_op(self, block, param_and_grad):
@@ -136,6 +200,10 @@ class Momentum(Optimizer):
        velocity_acc = self._get_accumulator(self._velocity_acc_str,
                                             param_and_grad[0])
        find_master = self._multi_precision and param_and_grad[
            0].dtype == core.VarDesc.VarType.FP16
        master_weight = (self._master_weights[param_and_grad[0].name]
                         if find_master else None)
        lr = self._create_param_lr(param_and_grad)

        if framework.in_dygraph_mode():
@@ -151,7 +219,9 @@ class Momentum(Optimizer):
            "mu": self._momentum,
            "use_nesterov": self._use_nesterov,
            "regularization_method": self._regularization_method,
            "regularization_coeff": self._regularization_coeff,
            "multi_precision": find_master,
            "rescale_grad": self._rescale_grad
        }
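        # For reference, a rough sketch of what the fused momentum kernel
        # computes with these attributes (not the exact kernel code):
        #   grad = grad * rescale_grad
        #   grad = grad + regularization_coeff * param     # if l2_decay is fused
        #   velocity_out = mu * velocity + grad
        #   param_out = param - lr * (grad + mu * velocity_out)  # Nesterov
        #   param_out = param - lr * velocity_out                # plain momentum
        # With multi_precision, the arithmetic runs on the FP32 master copy.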
        inputs = {
            "Param": [param_and_grad[0]],
@@ -159,11 +229,15 @@ class Momentum(Optimizer):
            "Velocity": [velocity_acc],
            "LearningRate": [lr]
        }

        outputs = {
            "ParamOut": [param_and_grad[0]],
            "VelocityOut": [velocity_acc]
        }

        if find_master:
            inputs["MasterParam"] = master_weight
            outputs["MasterParamOut"] = master_weight
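
        # When a master weight exists, the momentum op also reads and writes
        # it: the update is applied to the FP32 MasterParam and the FP16
        # ParamOut is refreshed from it, so rounding error does not accumulate
        # in the low-precision weights across steps.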

        # create the momentum optimize op
        momentum_op = block.append_op(
            type=self.type,