New save load interface (#20148)
* add new save load interface; test=develop
* add new save interface; test=develop
* add save load interface;
* fix save load error;
* fix dygraph set dict bug;
* add save load unit test; test=develop
* fix test_imperative_optimizer bug; test=develop
* fix unit test optimizer bug; test=develop
* fix code coverage; test=develop
* fix coverage; test=develop
* add documentation for APIs; test=develop
* fix unit test error; test=develop
* fix save load unit test error; test=develop
* fix error message; test=develop
* change set_parameter set_optimizer to save_dygraph; test=develop
* add load_graph check; test=develop
* fix api spec; test=develop
parent e2c7b6821a
commit fa43e80e19
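The headline change is renaming the imperative save/load entry points (set_parameter/set_optimizer) to save_dygraph/load_dygraph. A minimal round-trip sketch of the new interface follows; the FC layer, input data, and "save_dir/fc" prefix are illustrative, and the save_dygraph/load_dygraph/set_dict names are assumed from this PR rather than copied from the diff below.

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import FC
from paddle.fluid.dygraph.base import to_variable

with fluid.dygraph.guard():
    fc = FC("fc", 10)
    # Run one forward pass so the layer actually creates its parameters.
    fc(to_variable(np.ones([4, 10], dtype="float32")))

    # save_dygraph persists a state_dict under the given path prefix
    # (assumed API from this PR; the file suffix is appended automatically).
    fluid.dygraph.save_dygraph(fc.state_dict(), "save_dir/fc")

    # load_dygraph returns (parameter dict, optimizer dict); the second is
    # None when no optimizer state was saved under this prefix.
    para_state_dict, _ = fluid.dygraph.load_dygraph("save_dir/fc")
    fc.set_dict(para_state_dict)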
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,54 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <fstream>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/imperative/type_defs.h"

namespace paddle {
namespace framework {

// Save/load the tensors named in vec_tensor_name_list from/to the given
// scope (static graph path).
bool SaveStaticNameListToDisk(
    const std::string& file_name,
    const std::vector<std::string>& vec_tensor_name_list, const Scope& scope);

bool LoadStaticNameListFromDisk(
    const std::string& file_name,
    const std::vector<std::string>& vec_tensor_name_list, const Scope& scope);

// Save/load a list of imperative (dygraph) VarBase variables.
bool SaveDygraphVarBaseListToDisk(
    const std::string& file_name,
    const std::vector<std::shared_ptr<imperative::VarBase>>& vec_var_base_list);

const std::vector<std::shared_ptr<imperative::VarBase>>
LoadDygraphVarBaseListFromDisk(const std::string& file_name);

// Save/load a name -> tensor map to/from a single file.
bool SaveTensorToDisk(const std::string& file_name,
                      const std::map<std::string, Tensor*>& map_tensor);

bool LoadTensorFromDisk(
    const std::string& file_name,
    std::map<std::string, std::shared_ptr<Tensor>>* map_tensor);

}  // namespace framework
}  // namespace paddle
@@ -0,0 +1,78 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdlib.h>
#include <time.h>
#include <iostream>
#include <memory>

#include "gtest/gtest.h"
#include "paddle/fluid/framework/save_load_util.h"
#include "paddle/fluid/platform/macros.h"

namespace paddle {
namespace framework {
TEST(test_save_load_util, test_save_load) {
  srand(time(NULL));
  auto cpu_place = platform::CPUPlace();
  Tensor tensor1;
  tensor1.Resize({1000, 1000});
  auto src_data_1 = tensor1.mutable_data<float>(cpu_place);
  Tensor tensor2;
  tensor2.Resize({5000, 1000});
  auto src_data_2 = tensor2.mutable_data<float>(cpu_place);

  // Fill both tensors with pseudo-random values.
  for (int64_t i = 0; i < tensor1.numel(); ++i) {
    float temp = (rand() % 10000) * 1.0 / 50000 - 1.0;  // NOLINT
    src_data_1[i] = temp;
  }

  for (int64_t i = 0; i < tensor2.numel(); ++i) {
    float temp = (rand() % 10000) * 1.0 / 50000 - 1.0;  // NOLINT
    src_data_2[i] = temp;
  }

  std::map<std::string, Tensor*> map_tensor;
  map_tensor["t1"] = &tensor1;
  map_tensor["t2"] = &tensor2;

  // Round-trip the named tensors through a file on disk.
  SaveTensorToDisk("test_1", map_tensor);

  std::map<std::string, std::shared_ptr<Tensor>> load_map_tensor;
  LoadTensorFromDisk("test_1", &load_map_tensor);

  ASSERT_TRUE(load_map_tensor.find("t1") != load_map_tensor.end());
  ASSERT_TRUE(load_map_tensor.find("t2") != load_map_tensor.end());

  auto new_tensor_1 = load_map_tensor["t1"];
  auto new_tensor_2 = load_map_tensor["t2"];

  // Loaded data must match the original element by element.
  float* ptr_1 = tensor1.data<float>();
  float* ptr_1_new = new_tensor_1->data<float>();

  for (int64_t i = 0; i < tensor1.numel(); ++i) {
    ASSERT_EQ(ptr_1[i], ptr_1_new[i]);
  }

  float* ptr_2 = tensor2.data<float>();
  float* ptr_2_new = new_tensor_2->data<float>();

  for (int64_t i = 0; i < tensor2.numel(); ++i) {
    ASSERT_EQ(ptr_2[i], ptr_2_new[i]);
  }
}
}  // namespace framework
}  // namespace paddle
File diff suppressed because it is too large
@@ -1,172 +0,0 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import numpy as np

import paddle
import paddle.fluid as fluid
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid import Conv2D, Pool2D, FC, core
from paddle.fluid.dygraph.base import to_variable


class SimpleImgConvPool(fluid.Layer):
    def __init__(self,
                 name_scope,
                 num_filters,
                 filter_size,
                 pool_size,
                 pool_stride,
                 pool_padding=0,
                 pool_type='max',
                 global_pooling=False,
                 conv_stride=1,
                 conv_padding=0,
                 conv_dilation=1,
                 conv_groups=1,
                 act=None,
                 use_cudnn=False,
                 param_attr=None,
                 bias_attr=None):
        super(SimpleImgConvPool, self).__init__(name_scope)

        self._conv2d = Conv2D(
            self.full_name(),
            num_filters=num_filters,
            filter_size=filter_size,
            stride=conv_stride,
            padding=conv_padding,
            dilation=conv_dilation,
            groups=conv_groups,
            param_attr=None,
            bias_attr=None,
            use_cudnn=use_cudnn)

        self._pool2d = Pool2D(
            self.full_name(),
            pool_size=pool_size,
            pool_type=pool_type,
            pool_stride=pool_stride,
            pool_padding=pool_padding,
            global_pooling=global_pooling,
            use_cudnn=use_cudnn)

    def forward(self, inputs):
        x = self._conv2d(inputs)
        x = self._pool2d(x)
        return x


class MNIST(fluid.Layer):
    def __init__(self, name_scope):
        super(MNIST, self).__init__(name_scope)

        self._simple_img_conv_pool_1 = SimpleImgConvPool(
            self.full_name(), 20, 5, 2, 2, act="relu")

        self._simple_img_conv_pool_2 = SimpleImgConvPool(
            self.full_name(), 50, 5, 2, 2, act="relu")

        pool_2_shape = 50 * 4 * 4
        SIZE = 10
        scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
        self._fc = FC(self.full_name(),
                      10,
                      param_attr=fluid.param_attr.ParamAttr(
                          initializer=fluid.initializer.NormalInitializer(
                              loc=0.0, scale=scale)),
                      act="softmax")

    def forward(self, inputs):
        x = self._simple_img_conv_pool_1(inputs)
        x = self._simple_img_conv_pool_2(x)
        x = self._fc(x)
        return x


class TestDygraphCheckpoint(unittest.TestCase):
    def reader_decorator(self, reader):
        def _reader_impl():
            for item in reader():
                image = np.array(item[0]).reshape(1, 28, 28)
                label = np.array(item[1]).astype('int64').reshape(1)
                yield image, label

        return _reader_impl

    def test_save_load_persistables(self):
        seed = 90
        epoch_num = 1
        batch_size = 128

        with fluid.dygraph.guard():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

            mnist = MNIST("mnist")
            sgd = SGDOptimizer(learning_rate=1e-3)

            batch_py_reader = fluid.io.PyReader(capacity=1)
            batch_py_reader.decorate_sample_list_generator(
                paddle.batch(
                    self.reader_decorator(paddle.dataset.mnist.train()),
                    batch_size=batch_size,
                    drop_last=True),
                places=fluid.CPUPlace())

            dy_param_init_value = {}

            for epoch in range(epoch_num):
                for batch_id, data in enumerate(batch_py_reader()):
                    img = data[0]
                    label = data[1]
                    label.stop_gradient = True

                    cost = mnist(img)
                    loss = fluid.layers.cross_entropy(cost, label)
                    avg_loss = fluid.layers.mean(loss)

                    dy_out = avg_loss.numpy()

                    avg_loss.backward()
                    sgd.minimize(avg_loss)
                    fluid.dygraph.save_persistables(mnist.state_dict(),
                                                    "save_dir")
                    mnist.clear_gradients()

                    for param in mnist.parameters():
                        dy_param_init_value[param.name] = param.numpy()

                    restore, _ = fluid.dygraph.load_persistables("save_dir")

                    self.assertRaises(IOError, fluid.dygraph.load_persistables,
                                      "not_exist_dir")

                    mnist.load_dict(restore)

                    self.assertEqual(len(dy_param_init_value), len(restore))
                    for ky, value in restore.items():
                        self.assertTrue(
                            np.allclose(value.numpy(), dy_param_init_value[
                                value.name]))
                        self.assertTrue(np.isfinite(value.numpy()).all())
                        self.assertFalse(np.isnan(value.numpy()).any())

                    if batch_id > 10:
                        break


if __name__ == '__main__':
    unittest.main()
File diff suppressed because it is too large
@@ -1,196 +0,0 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np

import paddle
import paddle.fluid as fluid
from paddle.fluid.optimizer import SGDOptimizer, Adam
from paddle.fluid.dygraph.nn import FC
from paddle.fluid.dygraph.base import to_variable


class MLP(fluid.Layer):
    def __init__(self, name_scope):
        super(MLP, self).__init__(name_scope)

        self._fc1 = FC(self.full_name(), 10)
        self._fc2 = FC(self.full_name(), 10)

    def forward(self, inputs):
        y = self._fc1(inputs)
        y = self._fc2(y)
        return y


class TestImperativeOptimizerBase(unittest.TestCase):
    def setUp(self):
        self.batch_num = 20

    def get_optimizer(self):
        raise NotImplementedError()

    def _check_mlp(self):
        seed = 90
        with fluid.dygraph.guard():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

            mlp = MLP('mlp')
            optimizer = self.get_optimizer()
            optimizer2 = SGDOptimizer(
                learning_rate=fluid.layers.natural_exp_decay(
                    learning_rate=0.1,
                    decay_steps=10000,
                    decay_rate=0.5,
                    staircase=True))
            train_reader = paddle.batch(
                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)

            for batch_id, data in enumerate(train_reader()):
                dy_x_data = np.array(
                    [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
                    128, 1)

                img = to_variable(dy_x_data)
                label = to_variable(y_data)
                label._stop_gradient = True

                cost = mlp(img)
                avg_loss = fluid.layers.reduce_mean(cost)

                avg_loss.backward()
                optimizer.minimize(avg_loss)
                optimizer2.minimize(avg_loss)
                mlp.clear_gradients()
                fluid.dygraph.save_persistables(mlp.state_dict(), "save_dir_2",
                                                [optimizer, optimizer2])
                if batch_id == 2:
                    break

        with fluid.dygraph.guard():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

            mlp_load = MLP('mlp')
            optimizer_load1 = self.get_optimizer()
            optimizer_load2 = SGDOptimizer(
                learning_rate=fluid.layers.natural_exp_decay(
                    learning_rate=0.1,
                    decay_steps=10000,
                    decay_rate=0.5,
                    staircase=True))
            parameters, optimizers = fluid.dygraph.load_persistables(
                "save_dir_2")
            mlp_load.load_dict(parameters)
            optimizer_load1.load(optimizers)
            optimizer_load2.load(optimizers)

            self.assertTrue(optimizer._learning_rate.__dict__ ==
                            optimizer_load1._learning_rate.__dict__)
            self.assertTrue(optimizer2._learning_rate.__dict__ ==
                            optimizer_load2._learning_rate.__dict__)


class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
    def get_optimizer(self):
        bd = [3, 6, 9]
        optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
            boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]))
        return optimizer

    def test_sgd(self):
        self._check_mlp()


class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
    def get_optimizer(self):
        optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay(
            learning_rate=0.1,
            decay_steps=10000,
            decay_rate=0.5,
            staircase=True))
        return optimizer

    def test_sgd(self):
        self._check_mlp()


class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
    def get_optimizer(self):
        optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay(
            learning_rate=0.1,
            decay_steps=10000,
            decay_rate=0.5,
            staircase=True))
        return optimizer

    def test_sgd(self):
        self._check_mlp()


class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
    def get_optimizer(self):
        optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay(
            learning_rate=0.1,
            decay_steps=10000,
            decay_rate=0.5,
            staircase=True))
        return optimizer

    def test_adam(self):
        self._check_mlp()


class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
    def get_optimizer(self):
        optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay(
            learning_rate=0.1, decay_steps=5, cycle=self.cycle))
        return optimizer

    def test_sgd_cycle(self):
        self.cycle = True
        self._check_mlp()

    def test_sgd(self):
        self.cycle = False
        self._check_mlp()


class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
    def get_optimizer(self):
        optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay(
            learning_rate=0.1, step_each_epoch=10000, epochs=120))
        return optimizer

    def test_sgd(self):
        self._check_mlp()


class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase):
    def get_optimizer(self):
        optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay(
            d_model=512, warmup_steps=8000))
        return optimizer

    def test_sgd(self):
        self._check_mlp()


if __name__ == '__main__':
    unittest.main()
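The deleted test above restored optimizer state through load_persistables and optimizer.load, both of which this PR removes. Below is a minimal sketch of the replacement flow; the Optimizer.state_dict()/set_dict() pair and the save_dygraph/load_dygraph functions are assumed from this PR's new interface, and the layer, input data, and "save_dir_2/model" prefix are illustrative.

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.nn import FC
from paddle.fluid.dygraph.base import to_variable

with fluid.dygraph.guard():
    fc = FC("fc", 10)
    sgd = SGDOptimizer(learning_rate=1e-3)

    # One training step so there is parameter and optimizer state to save.
    loss = fluid.layers.reduce_mean(
        fc(to_variable(np.ones([4, 10], dtype="float32"))))
    loss.backward()
    sgd.minimize(loss)
    fc.clear_gradients()

    # Parameters and optimizer state are saved separately under one prefix
    # (assumed behavior of save_dygraph from this PR).
    fluid.dygraph.save_dygraph(fc.state_dict(), "save_dir_2/model")
    fluid.dygraph.save_dygraph(sgd.state_dict(), "save_dir_2/model")

    para_state_dict, opti_state_dict = fluid.dygraph.load_dygraph(
        "save_dir_2/model")
    fc.set_dict(para_state_dict)
    sgd.set_dict(opti_state_dict)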
File diff suppressed because it is too large