@@ -121,6 +121,104 @@ def lambda_decay(global_step, learning_rate, lr_lambda):
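# NOTE: lambda_decay, named in the hunk header, is a pure-Python reference
# helper defined above this hunk; its body is not shown in this diff, but it
# presumably mirrors fluid.dygraph.LambdaDecay and returns
# learning_rate * lr_lambda(global_step).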
class TestLearningRateDecayDygraph(unittest.TestCase):
    def test_LR_state_dict(self):
        with fluid.dygraph.guard():
            x = np.random.uniform(-1, 1, [3, 10]).astype("float32")
            linear = fluid.dygraph.Linear(10, 10)
            input = fluid.dygraph.to_variable(x)

            Exponential_scheduler = fluid.dygraph.ExponentialDecay(
                learning_rate=0.1,
                decay_steps=10000,
                decay_rate=0.5,
                staircase=True)
            Step_scheduler = fluid.dygraph.StepDecay(0.5, step_size=3)
            Reducelr_scheduler = fluid.dygraph.ReduceLROnPlateau(
                learning_rate=1.0, decay_rate=0.5, patience=5, cooldown=3)

            adam1 = fluid.optimizer.Adam(
                learning_rate=Exponential_scheduler,
                parameter_list=linear.parameters())
            adam2 = fluid.optimizer.Adam(
                learning_rate=Step_scheduler,
                parameter_list=linear.parameters())
            adam3 = fluid.optimizer.Adam(
                learning_rate=Reducelr_scheduler,
                parameter_list=linear.parameters())
            print(adam3.state_dict())

            for epoch in range(10):
                out = linear(input)
                loss = fluid.layers.reduce_mean(out)
                loss.backward()
                adam1.minimize(loss)
                adam2.minimize(loss)
                adam3.minimize(loss)
                linear.clear_gradients()
                Step_scheduler.epoch()
                Reducelr_scheduler.step(loss)
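            # Note the stepping conventions exercised above: ExponentialDecay
            # advances its step_num when the optimizer evaluates it inside
            # minimize(), StepDecay is bumped once per epoch via .epoch(), and
            # ReduceLROnPlateau tracks the monitored metric via .step(loss).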
            fluid.dygraph.save_dygraph(linear.state_dict(), "save_path")

            Exponential_scheduler_test = fluid.dygraph.ExponentialDecay(
                learning_rate=0.1,
                decay_steps=10000,
                decay_rate=0.5,
                staircase=True)
            Step_scheduler_test = fluid.dygraph.StepDecay(0.5, step_size=3)
            Reducelr_scheduler_test = fluid.dygraph.ReduceLROnPlateau(
                learning_rate=1.0, decay_rate=0.5, patience=5, cooldown=3)

            fluid.dygraph.save_dygraph(adam1.state_dict(), "save_path")
            _, opt_state = fluid.dygraph.load_dygraph("save_path")
            adam_test = fluid.optimizer.Adam(
                learning_rate=Exponential_scheduler_test,
                parameter_list=linear.parameters())
            adam_test.set_dict(opt_state)
            self.assertEqual(adam_test._learning_rate.step_num,
                             adam1._learning_rate.step_num,
                             "step_num is different before and after set_dict")
            fluid.dygraph.save_dygraph(adam2.state_dict(), "save_path")
            _, opt_state = fluid.dygraph.load_dygraph("save_path")
            adam_test = fluid.optimizer.Adam(
                learning_rate=Step_scheduler_test,
                parameter_list=linear.parameters())
            adam_test.set_dict(opt_state)
            self.assertEqual(adam_test._learning_rate.epoch_num,
                             adam2._learning_rate.epoch_num,
                             "epoch_num is different before and after set_dict")
            self.assertEqual(
                adam_test._learning_rate(),
                adam2._learning_rate(),
                "current learning rate is different before and after set_dict")
            fluid.dygraph.save_dygraph(adam3.state_dict(), "save_path")
            _, opt_state = fluid.dygraph.load_dygraph("save_path")
            adam_test = fluid.optimizer.Adam(
                learning_rate=Reducelr_scheduler_test,
                parameter_list=linear.parameters())
            adam_test.set_dict(opt_state)
            self.assertEqual(adam_test._learning_rate.best_loss,
                             adam3._learning_rate.best_loss.numpy()[0],
                             "best_loss is different before and after set_dict")
            self.assertEqual(
                adam_test._learning_rate.cooldown_counter,
                adam3._learning_rate.cooldown_counter,
                "cooldown_counter is different before and after set_dict")
            self.assertEqual(
                adam_test._learning_rate.num_bad_epochs,
                adam3._learning_rate.num_bad_epochs,
                "num_bad_epochs is different before and after set_dict")
            self.assertEqual(adam_test._learning_rate.epoch_num,
                             adam3._learning_rate.epoch_num,
                             "epoch_num is different before and after set_dict")
            self.assertEqual(
                adam_test._learning_rate(),
                adam3._learning_rate(),
                "current learning rate is different before and after set_dict")

    def test_NoamDecay(self):
        with fluid.dygraph.guard():
            d_model = 0.01
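            # NoamDecay presumably follows the "Attention Is All You Need"
            # warmup schedule, lr = d_model**-0.5 * min(step**-0.5,
            # step * warmup_steps**-1.5); the test drives it with a toy
            # d_model of 0.01.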
@@ -169,17 +267,22 @@ class TestLearningRateDecayDygraph(unittest.TestCase):
            learning_rate = 0.5
            milestones = [2, 4, 8]
            decay_rate = 0.2
            linear = fluid.dygraph.Linear(10, 10)

            scheduler = fluid.dygraph.MultiStepDecay(learning_rate, milestones,
                                                     decay_rate)
            adam = fluid.optimizer.AdamOptimizer(
                learning_rate=scheduler, parameter_list=linear.parameters())

            for epoch in range(10):
                right_result = multi_step_decay(epoch, learning_rate,
                                                milestones, decay_rate)
                fluid_result = adam.current_step_lr()
                scheduler.epoch()
                self.assertAlmostEqual(
                    right_result,
                    fluid_result,
                    msg='Failed lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'.
                    format(epoch, right_result, fluid_result))
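            # multi_step_decay is the pure-Python reference helper defined near
            # the top of this file; presumably it computes
            #   learning_rate * decay_rate ** (number of milestones <= epoch)
            # i.e. the rate is scaled once for every milestone already passed.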
            with self.assertRaises(ValueError):
@@ -194,7 +297,7 @@ class TestLearningRateDecayDygraph(unittest.TestCase):
                lr = fluid.dygraph.MultiStepDecay("test", [20, 30, 50])
            with self.assertRaises(ValueError):
                lr = fluid.dygraph.MultiStepDecay(-1, [20, 30, 50])

    def test_StepDecay(self):
        with fluid.dygraph.guard():
@@ -211,15 +314,14 @@ class TestLearningRateDecayDygraph(unittest.TestCase):
                self.assertAlmostEqual(
                    right_result,
                    fluid_result,
                    msg='Failed lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'.
                    format(epoch, right_result, fluid_result))

            with self.assertRaises(TypeError):
                lr = fluid.dygraph.StepDecay(learning_rate, "test", 0.1)
            with self.assertRaises(ValueError):
                lr = fluid.dygraph.StepDecay(learning_rate, 20, 2)
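            # StepDecay semantics (sketch): lr = learning_rate *
            # decay_rate ** (epoch_num // step_size). A string step_size
            # raises TypeError, and decay_rate is presumably validated as
            # < 1.0, which is why the value 2 above raises ValueError.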

    def test_LambdaDecay(self):
        with fluid.dygraph.guard():