Add fleet checkpoint on local fs and remote fs (such as hdfs) for EDL (#22586)

gongweibao 5 years ago committed by GitHub
parent 0c23e3ff4d
commit 24a063f6ac

@@ -1,4 +1,4 @@
cc_library(fs SRCS fs.cc DEPS string_helper glog boost)
cc_library(shell SRCS shell.cc DEPS string_helper glog)
cc_library(shell SRCS shell.cc DEPS string_helper glog timer)
cc_test(test_fs SRCS test_fs.cc DEPS fs shell)

@@ -13,6 +13,8 @@
// limitations under the License.
#include "paddle/fluid/framework/io/shell.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/timer.h"
namespace paddle {
namespace framework {
@@ -296,23 +298,48 @@ std::pair<std::shared_ptr<FILE>, std::shared_ptr<FILE>> shell_p2open(
#endif
}
std::string shell_get_command_output(const std::string& cmd) {
std::string shell_get_command_output(const std::string& cmd, int time_out,
int sleep_inter, bool print_cmd) {
#if defined _WIN32 || defined __APPLE__
return "";
PADDLE_THROW(platform::errors::Unimplemented(
"This function(shell_get_command_output) is not implemented under _WIN32 "
"or __APPLE__."));
#else
int err_no = 0;
platform::Timer timer;
do {
if (print_cmd) {
LOG(INFO) << "exec cmd:[" << cmd << "]";
}
err_no = 0;
std::shared_ptr<FILE> pipe = shell_popen(cmd, "r", &err_no);
string::LineFileReader reader;
if (reader.getdelim(&*pipe, 0)) {
pipe = nullptr;
char* buf = reader.getdelim(&*pipe, 0);
if (err_no == 0) {
if (buf) {
return reader.get();
}
return "";
}
if (sleep_inter > 0) {
usleep(sleep_inter);
}
timer.Pause();
if (time_out > 0 && timer.ElapsedMS() >= time_out) {
PADDLE_THROW(paddle::platform::errors::ExecutionTimeout(
"shell_get_command_output execute error errno:%d and try until "
"timeout.",
errno));
return "";
}
} while (err_no == -1);
timer.Resume();
pipe = nullptr;
} while (err_no);
return "";
#endif
}

@@ -65,7 +65,12 @@ inline void shell_execute(const std::string& cmd) {
} while (err_no == -1);
}
extern std::string shell_get_command_output(const std::string& cmd);
// time_out: in milliseconds; default -1 means wait forever.
// sleep_inter: in milliseconds; default -1 means do not sleep between retries.
extern std::string shell_get_command_output(const std::string& cmd,
int time_out = -1,
int sleep_inter = -1,
bool print_cmd = false);
} // namespace framework
} // namespace paddle

@@ -1494,8 +1494,10 @@ All parameter, weight, gradient are variables in Paddle.
m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
m.def("is_compiled_with_dist", IsCompiledWithDIST);
m.def("run_cmd", [](const std::string &cmd) -> const std::string {
return paddle::framework::shell_get_command_output(cmd);
m.def("run_cmd", [](const std::string &cmd, int time_out = -1,
int sleep_inter = -1) -> const std::string {
return paddle::framework::shell_get_command_output(cmd, time_out,
sleep_inter);
});
#ifdef PADDLE_WITH_CUDA
m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {

@@ -0,0 +1,79 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from utils import get_cluster, logger
import os
def get_cloud_cluster(args_node_ips, args_node_ip, args_port, selected_gpus):
"""
args_node_ips, args_node_ip: string
"""
# You can get the IP info automatically when using paddlecloud multi-node mode.
node_ips = os.getenv("PADDLE_TRAINERS")
assert node_ips is not None, "PADDLE_TRAINERS should not be None"
node_ip = os.getenv("POD_IP")
assert node_ip is not None, "POD_IP should not be None"
node_rank = os.getenv("PADDLE_TRAINER_ID")
assert node_rank is not None, "PADDLE_TRAINER_ID should not be None"
node_ips = node_ips.split(",")
num_nodes = len(node_ips)
node_rank = int(node_rank)
if node_ip != "127.0.0.1" and node_ip != args_node_ip:
logger.warning("Please NOTE: When using paddlecloud, node_ip is \
automatically got from POD_IP. Your input node_ip: {} doesn't equals to \
node_ip: {} from paddlecloud environment.".format(args_node_ip, node_ip))
if args_node_ips != "127.0.0.1" and args_node_ips != ",".join(node_ips):
logger.warning(
"Please NOTE: When using paddlecloud, cluster_node_ips is \
automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node).\
Your input cluster_node_ips: {} doesn't equals to IPs: {} from \
paddlecloud environment.".format(args_node_ips, node_ips))
started_port = args_port
print("num_nodes:", num_nodes)
if num_nodes > 1:
try:
paddle_port = int(os.getenv("PADDLE_PORT", ""))
paddle_port_num = int(os.getenv("TRAINER_PORTS_NUM", ""))
if paddle_port_num >= len(
selected_gpus) and paddle_port != args_port:
logger.warning("Use Cloud specified port:{}.".format(
paddle_port))
started_port = paddle_port
except Exception as e:
print(e)
pass
if started_port is None:
started_port = 6170
logger.debug("parsed from args:node_ips:{} \
node_ip:{} node_rank:{} started_port:{}"
.format(node_ips, node_ip, node_rank, started_port))
ports = [x for x in range(started_port, started_port + len(selected_gpus))]
cluster, pod = get_cluster(node_ips, node_ip, ports, selected_gpus)
return cluster, cluster.pods[node_rank]
def get_trainers_num():
return int(os.getenv("PADDLE_TRAINERS_NUM", "1"))

@@ -0,0 +1,223 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import sys
import abc
import os
from pathlib import PurePosixPath
import shutil
class FS(object):
@abc.abstractmethod
def list_dirs(self, fs_path):
pass
@abc.abstractmethod
def ls_dir(self, fs_path):
pass
@abc.abstractmethod
def stat(self, fs_path):
pass
@abc.abstractmethod
def upload(self, local_path, fs_path):
pass
@abc.abstractmethod
def download(self, fs_path, local_path):
pass
@abc.abstractmethod
def mkdir(self, fs_path):
pass
@abc.abstractmethod
def mv(self, fs_src_path, fs_dst_path):
pass
@abc.abstractmethod
def rmr(self, fs_path):
pass
@abc.abstractmethod
def rm(self, fs_path):
pass
@abc.abstractmethod
def delete(self, fs_path):
pass
@abc.abstractmethod
def need_upload_download(self):
pass
class LocalFS(FS):
def list_dirs(self, fs_path):
if not self.stat(fs_path):
return []
return [
f for f in os.listdir(fs_path) if os.path.isdir(fs_path + "/" + f)
]
def ls_dir(self, fs_path):
return [f for f in os.listdir(fs_path)]
def stat(self, fs_path):
return os.path.exists(fs_path)
def mkdir(self, fs_path):
assert not os.path.isfile(fs_path), "{} is already a file".format(
fs_path)
os.system("mkdir -p {}".format(fs_path))
def mv(self, fs_src_path, fs_dst_path):
os.rename(fs_src_path, fs_dst_path)
def rmr(self, fs_path):
shutil.rmtree(fs_path)
def rm(self, fs_path):
os.remove(fs_path)
def delete(self, fs_path):
if not self.stat(fs_path):
return
if os.path.isfile(fs_path):
return self.rm(fs_path)
return self.rmr(fs_path)
def need_upload_download(self):
return False
class BDFS(FS):
def __init__(self,
hdfs_name,
hdfs_ugi,
time_out=20 * 60 * 1000,
sleep_inter=1000):
self._base_cmd = "hadoop fs -Dfs.default.name=\"{}\" -Dhadoop.job.ugi=\"{}\"".format(
hdfs_name, hdfs_ugi)
self._time_out = time_out
self._sleep_inter = sleep_inter
def _run_cmd(self, cmd):
ret = fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
if len(ret) <= 0:
return []
lines = ret.splitlines()
return lines
def list_dirs(self, fs_path):
if not self.stat(fs_path):
return []
dirs, _ = self.ls_dir(fs_path)
return dirs
def ls_dir(self, fs_path):
"""
List the directories and files under fs_path, returning only base names (the fs_path prefix is not included).
"""
cmd = "{} -ls {}".format(self._base_cmd, fs_path)
lines = self._run_cmd(cmd)
dirs = []
files = []
for line in lines:
arr = line.split()
if len(arr) != 8:
continue
if fs_path not in arr[7]:
continue
p = PurePosixPath(arr[7])
if arr[0][0] == 'd':
dirs.append(p.name)
else:
files.append(p.name)
return dirs, files
def is_dir(self, fs_path):
cmd = "{} -test -d {} ; echo $?".format(self._base_cmd, fs_path)
test = self._run_cmd(cmd)
if test[0].strip() == "0":
return True
return False
def stat(self, fs_path):
cmd = "{} -test -e {} ; echo $?".format(self._base_cmd, fs_path)
test = self._run_cmd(cmd)
if test[0].strip() == "0":
return True
return False
def upload(self, local_path, fs_path):
cmd = "{} -put {} {}".format(self._base_cmd, local_path, fs_path)
fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
def download(self, fs_path, local_path):
cmd = "{} -get {} {}/".format(self._base_cmd, fs_path, local_path)
fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
def mkdir(self, fs_path):
if not self.stat(fs_path):
cmd = "{} -mkdir {}".format(self._base_cmd, fs_path)
fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
def mv(self, fs_src_path, fs_dst_path):
cmd = "{} -mv {} {}".format(self._base_cmd, fs_src_path, fs_dst_path)
fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
def rmr(self, fs_path):
if not self.stat(fs_path):
return
cmd = "{} -rmr {}".format(self._base_cmd, fs_path)
return fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
def rm(self, fs_path):
if not self.stat(fs_path):
return
cmd = "{} -rm {}".format(self._base_cmd, fs_path)
return fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
def delete(self, fs_path):
if not self.stat(fs_path):
return
is_dir = self.is_dir(fs_path)
if is_dir:
return self.rmr(fs_path)
return self.rm(fs_path)
def need_upload_download(self):
return True
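Taken together, LocalFS and BDFS give checkpointing a single interface over local disk and HDFS. A short usage sketch (the HDFS name, ugi, and paths below are placeholders):

from paddle.distributed.fs_wrapper import LocalFS, BDFS

local_fs = LocalFS()
local_fs.mkdir("./my_paddle_model")
print(local_fs.list_dirs("."))            # sub-directories of the current directory
print(local_fs.need_upload_download())    # False: files are read and written in place

hdfs = BDFS("hdfs://nameservice", "user,passwd",
            time_out=20 * 60 * 1000,      # per-command timeout in ms
            sleep_inter=1000)             # sleep between retries in ms
if not hdfs.stat("/user/demo/checkpoints"):
    hdfs.mkdir("/user/demo/checkpoints")
print(hdfs.need_upload_download())        # True: data goes through a local cache dir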

File diff suppressed because it is too large.

File diff suppressed because it is too large.

@@ -26,10 +26,14 @@ from paddle.fluid.incubate.fleet.base.fleet_base import Mode
from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
from paddle.fluid import compiler
from paddle.distributed.fs_wrapper import LocalFS, BDFS
import os
import sys
import six
import json
import re
import shutil
class LambConfig(object):
@@ -42,6 +46,21 @@ class DistFCConfig(object):
pass
class TrainStatus(object):
def __init__(self, epoch_no=-1):
# completed epoch
self._epoch_no = epoch_no
def next(self):
return self._epoch_no + 1
def __eq__(self, t):
return self._epoch_no == t._epoch_no
def __ne__(self, t):
return not self == t
class Collective(Fleet):
def __init__(self):
super(Collective, self).__init__(Mode.COLLECTIVE)
@@ -51,6 +70,8 @@ class Collective(Fleet):
self._origin_program = None
self._transpiled_program = None
self.main_program = None
self._checkoint_prefix = "__paddle_fleet_checkpoint__"
self._param_file_name = "_paddle_fleet_param__"
def init_worker(self):
logging.warn(
@@ -103,7 +124,11 @@ class Collective(Fleet):
executor, main_program, None, None,
export_for_deployment)
def save_persistables(self, executor, dirname, main_program=None):
def save_persistables(self,
executor,
dirname,
main_program=None,
filename=None):
"""
This function filters out all variables with `persistable==True` from
the given `main_program` and then saves these variables to the folder
@@ -125,7 +150,182 @@
"In fleet.save_inference_model() function, main_program " \
"must be as Program type."
io.save_persistables(executor, dirname, main_program, None)
io.save_persistables(executor, dirname, main_program, filename=filename)
def _save_train_status(self, path, train_status):
d = {}
d["epoch_no"] = train_status._epoch_no
file_name = "{}/fleet_train_status".format(path)
with open(file_name, 'w') as f:
json.dump(d, f)
def _load_train_status(self, path):
file_name = "{}/fleet_train_status".format(path)
r = TrainStatus()
if not os.path.isfile(file_name):
return r
d = {}
with open(file_name, 'r') as f:
d = json.load(f)
assert "epoch_no" in d, "Can't find epoch_no in dict from train_status file:{}".format(
d)
r._epoch_no = d["epoch_no"]
assert r._epoch_no >= 0, "Data in checkpoint file is not valid:{}".format(
d)
return r
def _get_last_checkpoint_no(self, root_path, fs):
"""
Only inspect the first directory level under root_path.
"""
max_no = -1
d = {}
dirs = fs.list_dirs(root_path)
for dir in dirs:
g = dir.split(".")
if len(g) != 2:
continue
if g[0] != "__paddle_fleet_checkpoint__":
continue
try:
n = int(g[1])
if n > max_no:
max_no = n
except:
continue
return max_no
def clean_redundant_check_points(self,
root_path,
fs=LocalFS(),
checkpoint_num=1):
max_no = self._get_last_checkpoint_no(root_path, fs)
if max_no < 0:
return
if checkpoint_num < 1:
checkpoint_num = 1
dirs = fs.list_dirs(root_path)
for dir in dirs:
g = dir.split(".")
if len(g) != 2:
continue
if g[0] != self._checkoint_prefix:
continue
try:
n = int(g[1])
if n <= max_no - checkpoint_num:
path = "{}/{}.{}".format(root_path, self._checkoint_prefix,
n)
fs.rmr(path)
except Exception as e:
print(e)
continue
def save_check_point(self,
executor,
path,
train_status,
main_program=None,
fs=LocalFS(),
local_cache_path=".cache",
remain_all_checkpoint=True):
"""
This function saves persistables and the current epoch number to path.
"""
if main_program == None:
main_program = self._transpiled_program
if not fs.stat(path):
fs.mkdir(path)
max_no = self._get_last_checkpoint_no(path, fs=fs)
if max_no < 0:
max_no = -1
real_path = "{}/{}.{}".format(path, self._checkoint_prefix, max_no + 1)
tmp_path = "{}.tmp".format(real_path)
saved_path = tmp_path
local_fs = LocalFS()
cache_path = None
if fs.need_upload_download():
cache_path = "{}/{}.{}.saved_cache".format(
local_cache_path, self._checkoint_prefix, max_no + 1)
if not local_fs.stat(cache_path):
local_fs.mkdir(cache_path)
saved_path = cache_path
self.save_persistables(
executor=executor,
dirname=saved_path,
main_program=main_program,
filename=self._param_file_name)
self._save_train_status(path=saved_path, train_status=train_status)
if fs.need_upload_download():
fs.delete(tmp_path)
fs.upload(cache_path, tmp_path)
fs.mv(tmp_path, real_path)
if not remain_all_checkpoint:
self.clean_redundant_check_points(path)
def load_check_point(self,
executor,
path,
trainer_id,
main_program=None,
fs=LocalFS(),
local_cache_path=".cache",
ignore_empty=True):
"""
This function loads persistables and the current epoch number from path.
"""
max_no = self._get_last_checkpoint_no(path, fs)
if not ignore_empty:
assert max_no >= 0, "Can't find checkpoint"
if max_no < 0:
return None
local_fs = LocalFS()
if fs.need_upload_download():
cache_path = "{}/{}.{}.load_cache.{}".format(
local_cache_path, self._checkoint_prefix, max_no, trainer_id)
if local_fs.stat(cache_path):
local_fs.delete(cache_path)
real_path = "{}/{}.{}".format(path, self._checkoint_prefix, max_no)
load_path = real_path
if fs.need_upload_download():
fs.download(real_path, cache_path)
load_path = cache_path
if main_program == None:
main_program = self._transpiled_program
io.load_persistables(
executor=executor,
dirname=load_path,
main_program=main_program,
filename=self._param_file_name)
return self._load_train_status(load_path)
fleet = Collective()
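A condensed, runnable sketch of the new checkpoint flow; it mirrors the unit test shown later in this diff. Checkpoint N is written to <path>/__paddle_fleet_checkpoint__.<N>, load_check_point restores the newest one, and clean_redundant_check_points prunes older directories.

import os
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.collective import fleet, TrainStatus
from paddle.distributed.fs_wrapper import LocalFS

# Single-trainer collective environment, as in the unit test below.
os.environ["TRAINING_ROLE"] = "TRAINER"
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"
fleet.init(role_maker.PaddleCloudRoleMaker(is_collective=True))

# A tiny model so the distributed optimizer can transpile a program.
image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
predict = fluid.layers.fc(input=image, size=10, act='softmax')
avg_loss = fluid.layers.mean(fluid.layers.cross_entropy(input=predict, label=label))
fleet.distributed_optimizer(fluid.optimizer.AdamOptimizer(0.001)).minimize(avg_loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

fs = LocalFS()                           # or BDFS(...) to checkpoint to HDFS
status = TrainStatus(2)                  # last completed epoch
fleet.save_check_point(exe, "./my_paddle_model", train_status=status, fs=fs)

loaded = fleet.load_check_point(exe, "./my_paddle_model", trainer_id=0, fs=fs)
assert loaded == status
fleet.clean_redundant_check_points("./my_paddle_model", fs=fs, checkpoint_num=1)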

@@ -28,6 +28,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_communicator_geo)
list(APPEND MIXED_DIST_TEST_OPS test_communicator_half_async)
list(APPEND MIXED_DIST_TEST_OPS test_communicator_sync)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_checkpoint)
foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
list(REMOVE_ITEM TEST_OPS ${TEST_OP})
endforeach()
@@ -301,6 +302,7 @@ if(WITH_DISTRIBUTE)
if(WITH_GPU)
# NOTE. test_launch only work in gpu collective mode
bash_test_modules(test_launch MODULES test_launch.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
py_test_modules(test_fleet_checkpoint MODULES test_fleet_checkpoint)
endif()
bash_test_modules(test_launch_ps MODULES test_launch_ps.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})

@@ -0,0 +1,77 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet, TrainStatus
import os
from paddle.distributed.fs_wrapper import LocalFS, BDFS
class FleetTest(unittest.TestCase):
def _test_check_point(self, fs, dir_path):
file_name = "persistables"
os.environ["TRAINING_ROLE"] = "TRAINER"
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
feeder = fluid.DataFeeder(
feed_list=[image, label], place=fluid.CPUPlace())
predict = fluid.layers.fc(input=image, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=predict, label=label)
avg_loss = fluid.layers.mean(loss)
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001)
dist_optimizer = fleet.distributed_optimizer(optimizer)
dist_optimizer.minimize(avg_loss)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
status = TrainStatus(2)
fleet.save_check_point(exe, dir_path, train_status=status, fs=fs)
n1 = fleet._get_last_checkpoint_no(dir_path, fs=fs)
status2 = fleet.load_check_point(exe, dir_path, trainer_id=0, fs=fs)
self.assertEqual(status2, status)
fleet.save_check_point(exe, dir_path, train_status=status, fs=fs)
n2 = fleet._get_last_checkpoint_no(dir_path, fs=fs)
self.assertEqual(n2, n1 + 1)
fleet.clean_redundant_check_points(dir_path, fs=fs)
def test_hdfs_check_point(self):
try:
fs = BDFS("xxxx", "xxxx", 1 * 1000, 1 * 1000)
dir_path = "/user/Paddle_Data/gongweibao/edl_test/my_paddle_model"
self._test_check_point(fs, dir_path)
except Exception as e:
print(e)
def test_local_check_point(self):
fs = LocalFS()
dir_path = "./my_paddle_model"
self._test_check_point(fs, dir_path)
if __name__ == '__main__':
unittest.main()

@@ -6,6 +6,7 @@ launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
python ${launch_py} multi_process.py
# use paddlecloud
echo "begin test use paddlecloud"
cluster_node_ips="10.0.0.1"
node_ip="10.0.0.1"
export PADDLE_TRAINERS_NUM=2
@@ -14,7 +15,7 @@ export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0
export PADDLE_PORT=35019
export PADDLE_PORTS_NUM=2
export TRAINER_PORTS_NUM=2
distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip} --selected_gpus=0,1 --log_dir=testlog"
CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py
@@ -47,8 +48,9 @@ if [ -f $file_1 ]; then
rm $file_1
fi
unset PADDLE_PORT
unset PADDLE_PORTS_NUM
unset TRAINER_PORTS_NUM
echo ""
echo "paddle.distributed.launch async poll process test"

@@ -19,3 +19,4 @@ decorator
prettytable
objgraph
astor
pathlib
