Add fleet checkpoint on local fs and remote fs(such as hdfs) for EDL (#22586)
parent
0c23e3ff4d
commit
24a063f6ac
@ -1,4 +1,4 @@
|
|||||||
cc_library(fs SRCS fs.cc DEPS string_helper glog boost)
|
cc_library(fs SRCS fs.cc DEPS string_helper glog boost)
|
||||||
cc_library(shell SRCS shell.cc DEPS string_helper glog)
|
cc_library(shell SRCS shell.cc DEPS string_helper glog timer)
|
||||||
|
|
||||||
cc_test(test_fs SRCS test_fs.cc DEPS fs shell)
|
cc_test(test_fs SRCS test_fs.cc DEPS fs shell)
|
||||||
|
@ -0,0 +1,79 @@
|
|||||||
|
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
from utils import get_cluster, logger
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
def get_cloud_cluster(args_node_ips, args_node_ip, args_port, selected_gpus):
    """Build the cluster/pod description from paddlecloud environment variables.

    Args:
        args_node_ips (str): comma-separated node IPs given on the command line.
        args_node_ip (str): this node's IP given on the command line.
        args_port (int): user-specified starting port (one port is reserved
            per selected GPU).
        selected_gpus (list): GPU ids to use on this node.

    Returns:
        tuple: ``(cluster, pod)`` where ``pod`` is this node's entry,
        as produced by ``get_cluster``.

    Raises:
        AssertionError: when a required paddlecloud variable is unset.
    """
    # You can automatically get ip info while using paddlecloud multi nodes mode.
    node_ips = os.getenv("PADDLE_TRAINERS")
    assert node_ips is not None, "PADDLE_TRAINERS should not be None"

    node_ip = os.getenv("POD_IP")
    assert node_ip is not None, "POD_IP should not be None"

    node_rank = os.getenv("PADDLE_TRAINER_ID")
    assert node_rank is not None, "PADDLE_TRAINER_ID should not be None"

    node_ips = node_ips.split(",")
    num_nodes = len(node_ips)
    node_rank = int(node_rank)

    # The cloud environment always wins over command-line arguments; warn
    # the user when they disagree so the override is visible.
    if node_ip != "127.0.0.1" and node_ip != args_node_ip:
        logger.warning("Please NOTE: When using paddlecloud, node_ip is \
automatically got from POD_IP. Your input node_ip: {} doesn't equals to \
node_ip: {} from paddlecloud environment.".format(args_node_ip, node_ip))

    if args_node_ips != "127.0.0.1" and args_node_ips != ",".join(node_ips):
        logger.warning(
            "Please NOTE: When using paddlecloud, cluster_node_ips is \
automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node).\
Your input cluster_node_ips: {} doesn't equals to IPs: {} from \
paddlecloud environment.".format(args_node_ips, node_ips))

    started_port = args_port
    if num_nodes > 1:
        # Best effort: prefer the cloud-assigned port, but only when enough
        # ports were reserved for the selected GPUs.
        try:
            paddle_port = int(os.getenv("PADDLE_PORT", ""))
            paddle_port_num = int(os.getenv("TRAINER_PORTS_NUM", ""))

            if paddle_port_num >= len(
                    selected_gpus) and paddle_port != args_port:
                logger.warning("Use Cloud specified port:{}.".format(
                    paddle_port))
                started_port = paddle_port

        # Only the int() conversions can fail here (variable unset or
        # malformed); fall back to the user-supplied port in that case.
        except (TypeError, ValueError) as e:
            print(e)

    if started_port is None:
        started_port = 6170

    logger.debug("parsed from args:node_ips:{} \
        node_ip:{} node_rank:{} started_port:{}"
                 .format(node_ips, node_ip, node_rank, started_port))

    # One port per selected GPU, starting at started_port.
    ports = list(range(started_port, started_port + len(selected_gpus)))
    cluster, pod = get_cluster(node_ips, node_ip, ports, selected_gpus)
    return cluster, cluster.pods[node_rank]
|
||||||
|
|
||||||
|
|
||||||
|
def get_trainers_num():
    """Return the trainer count from PADDLE_TRAINERS_NUM (defaults to 1)."""
    trainers_num = os.getenv("PADDLE_TRAINERS_NUM", "1")
    return int(trainers_num)
|
@ -0,0 +1,223 @@
|
|||||||
|
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import paddle.fluid as fluid
|
||||||
|
import sys
|
||||||
|
import abc
|
||||||
|
import os
|
||||||
|
from pathlib import PurePosixPath
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
|
||||||
|
class FS(object):
    """Abstract file-system interface used by fleet checkpointing.

    Concrete backends (local disk, HDFS, ...) implement these operations.
    Note: the class does not use ABCMeta, so instantiation is not blocked;
    unimplemented methods simply return None.
    """

    @abc.abstractmethod
    def list_dirs(self, fs_path):
        """List the sub-directory names under ``fs_path``."""

    @abc.abstractmethod
    def ls_dir(self, fs_path):
        """List the entries under ``fs_path``."""

    @abc.abstractmethod
    def stat(self, fs_path):
        """Return whether ``fs_path`` exists."""

    @abc.abstractmethod
    def upload(self, local_path, fs_path):
        """Copy ``local_path`` from local disk to ``fs_path`` on this fs."""

    @abc.abstractmethod
    def download(self, fs_path, local_path):
        """Copy ``fs_path`` from this fs to ``local_path`` on local disk."""

    @abc.abstractmethod
    def mkdir(self, fs_path):
        """Create the directory ``fs_path``."""

    @abc.abstractmethod
    def mv(self, fs_src_path, fs_dst_path):
        """Move/rename ``fs_src_path`` to ``fs_dst_path``."""

    @abc.abstractmethod
    def rmr(self, fs_path):
        """Recursively remove the directory ``fs_path``."""

    @abc.abstractmethod
    def rm(self, fs_path):
        """Remove the file ``fs_path``."""

    @abc.abstractmethod
    def delete(self, fs_path):
        """Remove ``fs_path`` whether it is a file or a directory."""

    @abc.abstractmethod
    def need_upload_download(self):
        """Return True when checkpoints must be staged via upload/download."""
|
||||||
|
|
||||||
|
|
||||||
|
class LocalFS(FS):
    """FS implementation backed by the local file system."""

    def list_dirs(self, fs_path):
        """Return the names of sub-directories directly under ``fs_path``.

        Returns [] when ``fs_path`` does not exist.
        """
        if not self.stat(fs_path):
            return []

        return [
            f for f in os.listdir(fs_path)
            if os.path.isdir(os.path.join(fs_path, f))
        ]

    def ls_dir(self, fs_path):
        """Return all entry names (files and directories) under ``fs_path``."""
        return list(os.listdir(fs_path))

    def stat(self, fs_path):
        """Return True if ``fs_path`` exists."""
        return os.path.exists(fs_path)

    def mkdir(self, fs_path):
        """Create ``fs_path`` (including parents); it must not be a file."""
        assert not os.path.isfile(fs_path), "{} is already a file".format(
            fs_path)
        # os.makedirs with exist_ok keeps the previous "mkdir -p" semantics
        # while avoiding a shell round-trip (and any quoting/injection issues
        # with os.system on untrusted path strings).
        os.makedirs(fs_path, exist_ok=True)

    def mv(self, fs_src_path, fs_dst_path):
        """Rename/move ``fs_src_path`` to ``fs_dst_path``."""
        os.rename(fs_src_path, fs_dst_path)

    def rmr(self, fs_path):
        """Recursively remove the directory ``fs_path``."""
        shutil.rmtree(fs_path)

    def rm(self, fs_path):
        """Remove the file ``fs_path``."""
        os.remove(fs_path)

    def delete(self, fs_path):
        """Remove ``fs_path`` (file or directory); no-op when absent."""
        if not self.stat(fs_path):
            return

        if os.path.isfile(fs_path):
            return self.rm(fs_path)

        return self.rmr(fs_path)

    def need_upload_download(self):
        """Local paths are directly accessible; no staging step is needed."""
        return False
|
||||||
|
|
||||||
|
|
||||||
|
class BDFS(FS):
    """FS implementation driven by the ``hadoop fs`` command-line client.

    Every operation shells out via ``fluid.core.run_cmd``; ``time_out`` and
    ``sleep_inter`` are forwarded to it on each call.
    NOTE(review): units look like milliseconds (20 * 60 * 1000 default) —
    confirm against run_cmd's contract.
    """

    def __init__(self,
                 hdfs_name,
                 hdfs_ugi,
                 time_out=20 * 60 * 1000,
                 sleep_inter=1000):
        # Base command carries the cluster address (fs.default.name) and the
        # user credentials (hadoop.job.ugi); each method appends its
        # sub-command to it.
        self._base_cmd = "hadoop fs -Dfs.default.name=\"{}\" -Dhadoop.job.ugi=\"{}\"".format(
            hdfs_name, hdfs_ugi)
        self._time_out = time_out
        self._sleep_inter = sleep_inter

    def _run_cmd(self, cmd):
        # Run a shell command and return its stdout as a list of lines
        # ([] when there was no output).
        ret = fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
        if len(ret) <= 0:
            return []

        lines = ret.splitlines()
        return lines

    def list_dirs(self, fs_path):
        # Directory names only; [] when fs_path does not exist.
        if not self.stat(fs_path):
            return []

        dirs, _ = self.ls_dir(fs_path)
        return dirs

    def ls_dir(self, fs_path):
        """
        list directory under fs_path, and only give the pure name, not include the fs_path
        """
        cmd = "{} -ls {}".format(self._base_cmd, fs_path)
        lines = self._run_cmd(cmd)

        dirs = []
        files = []
        for line in lines:
            arr = line.split()
            # A well-formed `hadoop fs -ls` entry has 8 whitespace-separated
            # columns; anything else (e.g. the "Found N items" banner) is
            # skipped.
            if len(arr) != 8:
                continue

            # Column 7 is the full path; skip lines for other paths.
            if fs_path not in arr[7]:
                continue

            p = PurePosixPath(arr[7])
            # Column 0 is the permission string; a leading 'd' marks a
            # directory.
            if arr[0][0] == 'd':
                dirs.append(p.name)
            else:
                files.append(p.name)

        return dirs, files

    def is_dir(self, fs_path):
        # `hadoop fs -test -d` exits 0 for a directory; echo the exit code
        # so it can be read from stdout.
        cmd = "{} -test -d {} ; echo $?".format(self._base_cmd, fs_path)

        test = self._run_cmd(cmd)
        if test[0].strip() == "0":
            return True

        return False

    def stat(self, fs_path):
        # `hadoop fs -test -e` exits 0 when the path exists.
        cmd = "{} -test -e {} ; echo $?".format(self._base_cmd, fs_path)

        test = self._run_cmd(cmd)
        if test[0].strip() == "0":
            return True

        return False

    def upload(self, local_path, fs_path):
        # Copy a local file/dir to HDFS. Errors from the command are not
        # inspected here (best effort).
        cmd = "{} -put {} {}".format(self._base_cmd, local_path, fs_path)
        fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)

    def download(self, fs_path, local_path):
        # Copy an HDFS path into the local directory local_path.
        cmd = "{} -get {} {}/".format(self._base_cmd, fs_path, local_path)
        fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)

    def mkdir(self, fs_path):
        # Create the directory only when it does not already exist.

        if not self.stat(fs_path):
            cmd = "{} -mkdir {}".format(self._base_cmd, fs_path)
            fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)

    def mv(self, fs_src_path, fs_dst_path):
        # Move/rename within HDFS.
        cmd = "{} -mv {} {}".format(self._base_cmd, fs_src_path, fs_dst_path)
        fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)

    def rmr(self, fs_path):
        # Recursive remove; no-op when the path does not exist.
        if not self.stat(fs_path):
            return

        cmd = "{} -rmr {}".format(self._base_cmd, fs_path)
        return fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)

    def rm(self, fs_path):
        # Remove a single file; no-op when the path does not exist.
        if not self.stat(fs_path):
            return

        cmd = "{} -rm {}".format(self._base_cmd, fs_path)
        return fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)

    def delete(self, fs_path):
        # Remove fs_path whether it is a file or a directory; no-op when
        # absent.
        if not self.stat(fs_path):
            return

        is_dir = self.is_dir(fs_path)
        if is_dir:
            return self.rmr(fs_path)

        return self.rm(fs_path)

    def need_upload_download(self):
        # HDFS paths are remote, so checkpoints must be staged through
        # upload/download.
        return True
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,77 @@
|
|||||||
|
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
import paddle.fluid as fluid
|
||||||
|
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
|
||||||
|
from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet, TrainStatus
|
||||||
|
import os
|
||||||
|
from paddle.distributed.fs_wrapper import LocalFS, BDFS
|
||||||
|
|
||||||
|
|
||||||
|
class FleetTest(unittest.TestCase):
    """Exercises fleet checkpoint save/load/cleanup on a given FS backend."""

    def _test_check_point(self, fs, dir_path):
        # Shared driver: build a tiny MNIST-style network with a fleet
        # distributed optimizer, then save/load checkpoints through `fs`
        # at `dir_path` and verify the checkpoint numbering.
        file_name = "persistables"

        # Pretend to be trainer 0 of a single-node collective job.
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32')
        label = fluid.data(name='label', shape=[None, 1], dtype='int64')
        feeder = fluid.DataFeeder(
            feed_list=[image, label], place=fluid.CPUPlace())
        predict = fluid.layers.fc(input=image, size=10, act='softmax')
        loss = fluid.layers.cross_entropy(input=predict, label=label)
        avg_loss = fluid.layers.mean(loss)
        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001)

        dist_optimizer = fleet.distributed_optimizer(optimizer)
        dist_optimizer.minimize(avg_loss)

        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(fluid.default_startup_program())

        # Save a checkpoint, reload it, and check the restored status
        # round-trips.
        status = TrainStatus(2)
        fleet.save_check_point(exe, dir_path, train_status=status, fs=fs)
        n1 = fleet._get_last_checkpoint_no(dir_path, fs=fs)

        status2 = fleet.load_check_point(exe, dir_path, trainer_id=0, fs=fs)
        self.assertEqual(status2, status)

        # A second save must bump the checkpoint number by exactly one.
        fleet.save_check_point(exe, dir_path, train_status=status, fs=fs)
        n2 = fleet._get_last_checkpoint_no(dir_path, fs=fs)
        self.assertEqual(n2, n1 + 1)

        fleet.clean_redundant_check_points(dir_path, fs=fs)

    def test_hdfs_check_point(self):
        # Best effort: the hardcoded HDFS cluster is usually unreachable in
        # CI, so any failure is printed and swallowed rather than failing
        # the suite.
        try:
            fs = BDFS("xxxx", "xxxx", 1 * 1000, 1 * 1000)
            dir_path = "/user/Paddle_Data/gongweibao/edl_test/my_paddle_model"
            self._test_check_point(fs, dir_path)
        except Exception as e:
            print(e)

    def test_local_check_point(self):
        # Same scenario against the local file system; failures here are real.
        fs = LocalFS()
        dir_path = "./my_paddle_model"
        self._test_check_point(fs, dir_path)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the checkpoint tests when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
|
Loading…
Reference in new issue