@@ -16,24 +16,6 @@ from __future__ import print_function
 
 from ... import core
 from ... import layers
-from ... import framework
-
-
-def append_cast_op(i, o, prog):
-    """
-    Append a cast op in a given Program to cast input `i` to data type `o.dtype`.
-
-    Args:
-        i (Variable): The input Variable.
-        o (Variable): The output Variable.
-        prog (Program): The Program to append cast op.
-    """
-    prog.global_block().append_op(
-        type="cast",
-        inputs={"X": i},
-        outputs={"Out": o},
-        attrs={"in_dtype": i.dtype,
-               "out_dtype": o.dtype})
 
 
 def _rename_arg(op, old_name, new_name):
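
For reference, the cast that append_cast_op emitted can also be expressed with the public fluid 1.x layers API. A minimal standalone sketch (illustrative only, not part of this diff):

    import paddle.fluid as fluid

    prog = fluid.Program()
    with fluid.program_guard(prog):
        x = fluid.layers.data(name='x', shape=[8], dtype='float32')
        # layers.cast appends a "cast" op to the current block, filling the
        # in_dtype/out_dtype attributes from x and the requested target dtype.
        y = fluid.layers.cast(x, dtype='float16')
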
@@ -75,7 +57,7 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
         op (Operator): The operator to insert cast op.
         idx (int): The index of current operator.
         src_dtype (VarType): The input variable dtype of cast op.
-        desr_dtype (VarType): The output variable dtype of cast op.
+        dest_dtype (VarType): The output variable dtype of cast op.
 
     Returns:
         num_cast_op (int): The number of cast ops that have been inserted.
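
The src_dtype and dest_dtype arguments are VarType enum values; for example, a cast from FP32 down to FP16 would pass the pair below (a small sketch assuming the fluid core bindings):

    from paddle.fluid import core

    src_dtype = core.VarDesc.VarType.FP32   # dtype of the original variable
    dest_dtype = core.VarDesc.VarType.FP16  # dtype the inserted cast converts to
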
@@ -261,7 +243,7 @@ def rewrite_program(main_prog, amp_lists):
 def update_role_var_grad(main_prog, params_grads):
     """
     Update op_role_var attr for some ops to make sure the gradients
-    transfered across gpus is FP16.
+    transferred across GPUs is FP16.
     1. Check whether the op that outputs gradient is cast or not.
     2. If op is cast and gradient is FP32, remove the op_role_var
        and find the prev op which outputs FP16 gradient
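
A condensed, hypothetical sketch of the two numbered steps (not the verbatim function body; assumes params_grads pairs as produced by append_backward and the fluid core bindings):

    from paddle.fluid import core

    def find_fp16_grad_names(params_grads):
        """Hypothetical helper mirroring steps 1-2 of the docstring above."""
        names = []
        for _param, g in params_grads:
            op = g.op
            # Step 1: the op producing this gradient is a cast whose output
            # (the gradient g) is FP32.
            if g.dtype == core.VarDesc.VarType.FP32 and op.type == 'cast':
                # Step 2: the cast's input is the FP16 gradient created by the
                # previous op; that is the gradient to transfer across GPUs.
                names.append(op.input(op.input_names[0])[0])
        return names
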
@@ -293,7 +275,8 @@ def update_role_var_grad(main_prog, params_grads):
                 attr_val.extend(op_for_fp16_grad.attr(op_role_var_attr_name))
             op_for_fp16_grad._set_attr(op_role_var_attr_name, attr_val)
 
-            # maximize the allreduce overlap
+            # Maximize the all_reduce overlap, and perform the cast
+            # operation after gradients transfer.
             op._set_attr('op_role', OPTIMIZE)
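
The OPTIMIZE constant used here is the op-role enum from the core bindings; a brief sketch of how it is typically obtained (an assumption about the surrounding imports, which are not shown in this diff):

    from paddle.fluid import core

    OPTIMIZE = core.op_proto_and_checker_maker.OpRole.Optimize
    # Tagging the cast with the Optimize role schedules it after the backward
    # ops, so the FP32 cast no longer delays the gradient all_reduce.
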
@@ -303,7 +286,7 @@ def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps,
     """
     Update loss scaling according to overall gradients. If all gradients is
     finite after incr_every_n_steps, loss scaling will increase by incr_ratio.
-    Otherwisw, loss scaling will decrease by decr_ratio after
+    Otherwise, loss scaling will decrease by decr_ratio after
     decr_every_n_nan_or_inf steps and each step some gradients are infinite.
 
     Args:
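
A minimal pure-Python sketch of the dynamic loss-scaling rule the docstring describes (hypothetical function; the real implementation builds the equivalent logic out of fluid ops and Variables):

    def update_loss_scaling_sketch(is_overall_finite, loss_scaling,
                                   num_good_steps, num_bad_steps,
                                   incr_every_n_steps, decr_every_n_nan_or_inf,
                                   incr_ratio, decr_ratio):
        if is_overall_finite:
            num_good_steps += 1
            num_bad_steps = 0
            # All gradients stayed finite long enough: grow the loss scale.
            if num_good_steps == incr_every_n_steps:
                loss_scaling *= incr_ratio
                num_good_steps = 0
        else:
            num_good_steps = 0
            num_bad_steps += 1
            # Too many steps with inf/nan gradients: shrink the loss scale.
            if num_bad_steps == decr_every_n_nan_or_inf:
                loss_scaling *= decr_ratio
                num_bad_steps = 0
        return loss_scaling, num_good_steps, num_bad_steps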