Add fleet checkpoint on local fs and remote fs (such as hdfs) for EDL (#22586)

gongweibao 5 years ago committed by GitHub
parent 0c23e3ff4d
commit 24a063f6ac

@@ -1,4 +1,4 @@
cc_library(fs SRCS fs.cc DEPS string_helper glog boost)
cc_library(shell SRCS shell.cc DEPS string_helper glog)
cc_library(shell SRCS shell.cc DEPS string_helper glog timer)
cc_test(test_fs SRCS test_fs.cc DEPS fs shell)

@@ -13,6 +13,8 @@
// limitations under the License.
#include "paddle/fluid/framework/io/shell.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/timer.h"
namespace paddle {
namespace framework {
@@ -296,23 +298,48 @@ std::pair<std::shared_ptr<FILE>, std::shared_ptr<FILE>> shell_p2open(
#endif
}
std::string shell_get_command_output(const std::string& cmd) {
std::string shell_get_command_output(const std::string& cmd, int time_out,
int sleep_inter, bool print_cmd) {
#if defined _WIN32 || defined __APPLE__
return "";
PADDLE_THROW(platform::errors::Unimplemented(
"This function(shell_get_command_output) is not implemented under _WIN32 "
"or __APPLE__."));
#else
int err_no = 0;
platform::Timer timer;
do {
if (print_cmd) {
LOG(INFO) << "exec cmd:[" << cmd << "]";
}
err_no = 0;
std::shared_ptr<FILE> pipe = shell_popen(cmd, "r", &err_no);
string::LineFileReader reader;
if (reader.getdelim(&*pipe, 0)) {
pipe = nullptr;
char* buf = reader.getdelim(&*pipe, 0);
if (err_no == 0) {
if (buf) {
return reader.get();
}
return "";
}
if (sleep_inter > 0) {
usleep(sleep_inter);
}
timer.Pause();
if (time_out > 0 && timer.ElapsedMS() >= time_out) {
PADDLE_THROW(paddle::platform::errors::ExecutionTimeout(
"shell_get_command_output execute error errno:%d and try until "
"timeout.",
errno));
return "";
}
} while (err_no == -1);
timer.Resume();
pipe = nullptr;
} while (err_no);
return "";
#endif
}

@@ -65,7 +65,12 @@ inline void shell_execute(const std::string& cmd) {
} while (err_no == -1);
}
extern std::string shell_get_command_output(const std::string& cmd);
// time_out: in milliseconds; default -1 means wait forever.
// sleep_inter: in milliseconds; default -1 means do not sleep between retries.
extern std::string shell_get_command_output(const std::string& cmd,
int time_out = -1,
int sleep_inter = -1,
bool print_cmd = false);
} // namespace framework
} // namespace paddle

@@ -1494,8 +1494,10 @@ All parameter, weight, gradient are variables in Paddle.
m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
m.def("is_compiled_with_dist", IsCompiledWithDIST);
m.def("run_cmd", [](const std::string &cmd) -> const std::string {
return paddle::framework::shell_get_command_output(cmd);
m.def("run_cmd", [](const std::string &cmd, int time_out = -1,
int sleep_inter = -1) -> const std::string {
return paddle::framework::shell_get_command_output(cmd, time_out,
sleep_inter);
});
#ifdef PADDLE_WITH_CUDA
m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {

@@ -0,0 +1,79 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from utils import get_cluster, logger
import os
def get_cloud_cluster(args_node_ips, args_node_ip, args_port, selected_gpus):
"""
args_node_ips, args_node_ip: string
"""
# You can get the IP info automatically when using paddlecloud multi-node mode.
node_ips = os.getenv("PADDLE_TRAINERS")
assert node_ips is not None, "PADDLE_TRAINERS should not be None"
node_ip = os.getenv("POD_IP")
assert node_ip is not None, "POD_IP should not be None"
node_rank = os.getenv("PADDLE_TRAINER_ID")
assert node_rank is not None, "PADDLE_TRAINER_ID should not be None"
node_ips = node_ips.split(",")
num_nodes = len(node_ips)
node_rank = int(node_rank)
if node_ip != "127.0.0.1" and node_ip != args_node_ip:
logger.warning("Please NOTE: When using paddlecloud, node_ip is \
automatically got from POD_IP. Your input node_ip: {} doesn't equals to \
node_ip: {} from paddlecloud environment.".format(args_node_ip, node_ip))
if args_node_ips != "127.0.0.1" and args_node_ips != ",".join(node_ips):
logger.warning(
"Please NOTE: When using paddlecloud, cluster_node_ips is \
automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node).\
Your input cluster_node_ips: {} doesn't equals to IPs: {} from \
paddlecloud environment.".format(args_node_ips, node_ips))
started_port = args_port
print("num_nodes:", num_nodes)
if num_nodes > 1:
try:
paddle_port = int(os.getenv("PADDLE_PORT", ""))
paddle_port_num = int(os.getenv("TRAINER_PORTS_NUM", ""))
if paddle_port_num >= len(
selected_gpus) and paddle_port != args_port:
logger.warning("Use Cloud specified port:{}.".format(
paddle_port))
started_port = paddle_port
except Exception as e:
print(e)
pass
if started_port is None:
started_port = 6170
logger.debug("parsed from args:node_ips:{} \
node_ip:{} node_rank:{} started_port:{}"
.format(node_ips, node_ip, node_rank, started_port))
ports = [x for x in range(started_port, started_port + len(selected_gpus))]
cluster, pod = get_cluster(node_ips, node_ip, ports, selected_gpus)
return cluster, cluster.pods[node_rank]
def get_trainers_num():
return int(os.getenv("PADDLE_TRAINERS_NUM", "1"))

@@ -0,0 +1,223 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import sys
import abc
import os
from pathlib import PurePosixPath
import shutil
class FS(object):
@abc.abstractmethod
def list_dirs(self, fs_path):
pass
@abc.abstractmethod
def ls_dir(self, fs_path):
pass
@abc.abstractmethod
def stat(self, fs_path):
pass
@abc.abstractmethod
def upload(self, local_path, fs_path):
pass
@abc.abstractmethod
def download(self, fs_path, local_path):
pass
@abc.abstractmethod
def mkdir(self, fs_path):
pass
@abc.abstractmethod
def mv(self, fs_src_path, fs_dst_path):
pass
@abc.abstractmethod
def rmr(self, fs_path):
pass
@abc.abstractmethod
def rm(self, fs_path):
pass
@abc.abstractmethod
def delete(self, fs_path):
pass
@abc.abstractmethod
def need_upload_download(self):
pass
class LocalFS(FS):
def list_dirs(self, fs_path):
if not self.stat(fs_path):
return []
return [
f for f in os.listdir(fs_path) if os.path.isdir(fs_path + "/" + f)
]
def ls_dir(self, fs_path):
return [f for f in os.listdir(fs_path)]
def stat(self, fs_path):
return os.path.exists(fs_path)
def mkdir(self, fs_path):
assert not os.path.isfile(fs_path), "{} is already a file".format(
fs_path)
os.system("mkdir -p {}".format(fs_path))
def mv(self, fs_src_path, fs_dst_path):
os.rename(fs_src_path, fs_dst_path)
def rmr(self, fs_path):
shutil.rmtree(fs_path)
def rm(self, fs_path):
os.remove(fs_path)
def delete(self, fs_path):
if not self.stat(fs_path):
return
if os.path.isfile(fs_path):
return self.rm(fs_path)
return self.rmr(fs_path)
def need_upload_download(self):
return False
class BDFS(FS):
def __init__(self,
hdfs_name,
hdfs_ugi,
time_out=20 * 60 * 1000,
sleep_inter=1000):
self._base_cmd = "hadoop fs -Dfs.default.name=\"{}\" -Dhadoop.job.ugi=\"{}\"".format(
hdfs_name, hdfs_ugi)
self._time_out = time_out
self._sleep_inter = sleep_inter
def _run_cmd(self, cmd):
ret = fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
if len(ret) <= 0:
return []
lines = ret.splitlines()
return lines
def list_dirs(self, fs_path):
if not self.stat(fs_path):
return []
dirs, _ = self.ls_dir(fs_path)
return dirs
def ls_dir(self, fs_path):
"""
List the directories and files under fs_path, returning only base names (the fs_path prefix is not included).
"""
cmd = "{} -ls {}".format(self._base_cmd, fs_path)
lines = self._run_cmd(cmd)
dirs = []
files = []
for line in lines:
arr = line.split()
if len(arr) != 8:
continue
if fs_path not in arr[7]:
continue
p = PurePosixPath(arr[7])
if arr[0][0] == 'd':
dirs.append(p.name)
else:
files.append(p.name)
return dirs, files
def is_dir(self, fs_path):
cmd = "{} -test -d {} ; echo $?".format(self._base_cmd, fs_path)
test = self._run_cmd(cmd)
if test[0].strip() == "0":
return True
return False
def stat(self, fs_path):
cmd = "{} -test -e {} ; echo $?".format(self._base_cmd, fs_path)
test = self._run_cmd(cmd)
if test[0].strip() == "0":
return True
return False
def upload(self, local_path, fs_path):
cmd = "{} -put {} {}".format(self._base_cmd, local_path, fs_path)
fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
def download(self, fs_path, local_path):
cmd = "{} -get {} {}/".format(self._base_cmd, fs_path, local_path)
fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
def mkdir(self, fs_path):
if not self.stat(fs_path):
cmd = "{} -mkdir {}".format(self._base_cmd, fs_path)
fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
def mv(self, fs_src_path, fs_dst_path):
cmd = "{} -mv {} {}".format(self._base_cmd, fs_src_path, fs_dst_path)
fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
def rmr(self, fs_path):
if not self.stat(fs_path):
return
cmd = "{} -rmr {}".format(self._base_cmd, fs_path)
return fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
def rm(self, fs_path):
if not self.stat(fs_path):
return
cmd = "{} -rm {}".format(self._base_cmd, fs_path)
return fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
def delete(self, fs_path):
if not self.stat(fs_path):
return
is_dir = self.is_dir(fs_path)
if is_dir:
return self.rmr(fs_path)
return self.rm(fs_path)
def need_upload_download(self):
return True
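Taken together, LocalFS and BDFS give checkpointing a single interface over local disk and HDFS. A short usage sketch (the HDFS name, ugi, and paths below are placeholders):

from paddle.distributed.fs_wrapper import LocalFS, BDFS

local_fs = LocalFS()
local_fs.mkdir("./my_paddle_model")
print(local_fs.list_dirs("."))            # sub-directories of the current directory
print(local_fs.need_upload_download())    # False: files are read and written in place

hdfs = BDFS("hdfs://nameservice", "user,passwd",
            time_out=20 * 60 * 1000,      # per-command timeout in ms
            sleep_inter=1000)             # sleep between retries in ms
if not hdfs.stat("/user/demo/checkpoints"):
    hdfs.mkdir("/user/demo/checkpoints")
print(hdfs.need_upload_download())        # True: data goes through a local cache dir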

File diff suppressed because it is too large.

File diff suppressed because it is too large.

@@ -26,10 +26,14 @@ from paddle.fluid.incubate.fleet.base.fleet_base import Mode
from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
from paddle.fluid import compiler
from paddle.distributed.fs_wrapper import LocalFS, BDFS
import os
import sys
import six
import json
import re
import shutil
class LambConfig(object):
@@ -42,6 +46,21 @@ class DistFCConfig(object):
pass
class TrainStatus(object):
def __init__(self, epoch_no=-1):
# completed epoch
self._epoch_no = epoch_no
def next(self):
return self._epoch_no + 1
def __eq__(self, t):
return self._epoch_no == t._epoch_no
def __ne__(self, t):
return not self == t
class Collective(Fleet):
def __init__(self):
super(Collective, self).__init__(Mode.COLLECTIVE)
@@ -51,6 +70,8 @@ class Collective(Fleet):
self._origin_program = None
self._transpiled_program = None
self.main_program = None
self._checkoint_prefix = "__paddle_fleet_checkpoint__"
self._param_file_name = "_paddle_fleet_param__"
def init_worker(self):
logging.warn(
@@ -103,7 +124,11 @@ class Collective(Fleet):
executor, main_program, None, None,
export_for_deployment)
def save_persistables(self, executor, dirname, main_program=None):
def save_persistables(self,
executor,
dirname,
main_program=None,
filename=None):
"""
This function filters out all variables with `persistable==True` from
the given `main_program` and then saves these variables to the folder
@@ -125,7 +150,182 @@
"In fleet.save_inference_model() function, main_program " \
"must be as Program type."
io.save_persistables(executor, dirname, main_program, None)
io.save_persistables(executor, dirname, main_program, filename=filename)
def _save_train_status(self, path, train_status):
d = {}
d["epoch_no"] = train_status._epoch_no
file_name = "{}/fleet_train_status".format(path)
with open(file_name, 'w') as f:
json.dump(d, f)
def _load_train_status(self, path):
file_name = "{}/fleet_train_status".format(path)
r = TrainStatus()
if not os.path.isfile(file_name):
return r
d = {}
with open(file_name, 'r') as f:
d = json.load(f)
assert "epoch_no" in d, "Can't find epoch_no in dict from train_status file:{}".format(
d)
r._epoch_no = d["epoch_no"]
assert r._epoch_no >= 0, "Data in checkpoint file is not valid:{}".format(
d)
return r
def _get_last_checkpoint_no(self, root_path, fs):
"""
Only inspect the first directory level under root_path.
"""
max_no = -1
d = {}
dirs = fs.list_dirs(root_path)
for dir in dirs:
g = dir.split(".")
if len(g) != 2:
continue
if g[0] != "__paddle_fleet_checkpoint__":
continue
try:
n = int(g[1])
if n > max_no:
max_no = n
except:
continue
return max_no
def clean_redundant_check_points(self,
root_path,
fs=LocalFS(),
checkpoint_num=1):
max_no = self._get_last_checkpoint_no(root_path, fs)
if max_no < 0:
return
if checkpoint_num < 1:
checkpoint_num = 1
dirs = fs.list_dirs(root_path)
for dir in dirs:
g = dir.split(".")
if len(g) != 2:
continue
if g[0] != self._checkoint_prefix:
continue
try:
n = int(g[1])
if n <= max_no - checkpoint_num:
path = "{}/{}.{}".format(root_path, self._checkoint_prefix,
n)
fs.rmr(path)
except Exception as e:
print(e)
continue
def save_check_point(self,
executor,
path,
train_status,
main_program=None,
fs=LocalFS(),
local_cache_path=".cache",
remain_all_checkpoint=True):
"""
This function saves persistables and the current epoch number to path.
"""
if main_program == None:
main_program = self._transpiled_program
if not fs.stat(path):
fs.mkdir(path)
max_no = self._get_last_checkpoint_no(path, fs=fs)
if max_no < 0:
max_no = -1
real_path = "{}/{}.{}".format(path, self._checkoint_prefix, max_no + 1)
tmp_path = "{}.tmp".format(real_path)
saved_path = tmp_path
local_fs = LocalFS()
cache_path = None
if fs.need_upload_download():
cache_path = "{}/{}.{}.saved_cache".format(
local_cache_path, self._checkoint_prefix, max_no + 1)
if not local_fs.stat(cache_path):
local_fs.mkdir(cache_path)
saved_path = cache_path
self.save_persistables(
executor=executor,
dirname=saved_path,
main_program=main_program,
filename=self._param_file_name)
self._save_train_status(path=saved_path, train_status=train_status)
if fs.need_upload_download():
fs.delete(tmp_path)
fs.upload(cache_path, tmp_path)
fs.mv(tmp_path, real_path)
if not remain_all_checkpoint:
self.clean_redundant_check_points(path)
def load_check_point(self,
executor,
path,
trainer_id,
main_program=None,
fs=LocalFS(),
local_cache_path=".cache",
ignore_empty=True):
"""
This function loads persistables and the current epoch number from path.
"""
max_no = self._get_last_checkpoint_no(path, fs)
if not ignore_empty:
assert max_no >= 0, "Can't find checkpoint"
if max_no < 0:
return None
local_fs = LocalFS()
if fs.need_upload_download():
cache_path = "{}/{}.{}.load_cache.{}".format(
local_cache_path, self._checkoint_prefix, max_no, trainer_id)
if local_fs.stat(cache_path):
local_fs.delete(cache_path)
real_path = "{}/{}.{}".format(path, self._checkoint_prefix, max_no)
load_path = real_path
if fs.need_upload_download():
fs.download(real_path, cache_path)
load_path = cache_path
if main_program == None:
main_program = self._transpiled_program
io.load_persistables(
executor=executor,
dirname=load_path,
main_program=main_program,
filename=self._param_file_name)
return self._load_train_status(load_path)
fleet = Collective()
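A condensed, runnable sketch of the new checkpoint flow; it mirrors the unit test shown later in this diff. Checkpoint N is written to <path>/__paddle_fleet_checkpoint__.<N>, load_check_point restores the newest one, and clean_redundant_check_points prunes older directories.

import os
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.collective import fleet, TrainStatus
from paddle.distributed.fs_wrapper import LocalFS

# Single-trainer collective environment, as in the unit test below.
os.environ["TRAINING_ROLE"] = "TRAINER"
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"
fleet.init(role_maker.PaddleCloudRoleMaker(is_collective=True))

# A tiny model so the distributed optimizer can transpile a program.
image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
predict = fluid.layers.fc(input=image, size=10, act='softmax')
avg_loss = fluid.layers.mean(fluid.layers.cross_entropy(input=predict, label=label))
fleet.distributed_optimizer(fluid.optimizer.AdamOptimizer(0.001)).minimize(avg_loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

fs = LocalFS()                           # or BDFS(...) to checkpoint to HDFS
status = TrainStatus(2)                  # last completed epoch
fleet.save_check_point(exe, "./my_paddle_model", train_status=status, fs=fs)

loaded = fleet.load_check_point(exe, "./my_paddle_model", trainer_id=0, fs=fs)
assert loaded == status
fleet.clean_redundant_check_points("./my_paddle_model", fs=fs, checkpoint_num=1)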

@@ -28,6 +28,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_communicator_geo)
list(APPEND MIXED_DIST_TEST_OPS test_communicator_half_async)
list(APPEND MIXED_DIST_TEST_OPS test_communicator_sync)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_checkpoint)
foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
list(REMOVE_ITEM TEST_OPS ${TEST_OP})
endforeach()
@@ -301,6 +302,7 @@ if(WITH_DISTRIBUTE)
if(WITH_GPU)
# NOTE. test_launch only work in gpu collective mode
bash_test_modules(test_launch MODULES test_launch.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
py_test_modules(test_fleet_checkpoint MODULES test_fleet_checkpoint)
endif()
bash_test_modules(test_launch_ps MODULES test_launch_ps.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})

@@ -0,0 +1,77 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet, TrainStatus
import os
from paddle.distributed.fs_wrapper import LocalFS, BDFS
class FleetTest(unittest.TestCase):
def _test_check_point(self, fs, dir_path):
file_name = "persistables"
os.environ["TRAINING_ROLE"] = "TRAINER"
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
feeder = fluid.DataFeeder(
feed_list=[image, label], place=fluid.CPUPlace())
predict = fluid.layers.fc(input=image, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=predict, label=label)
avg_loss = fluid.layers.mean(loss)
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001)
dist_optimizer = fleet.distributed_optimizer(optimizer)
dist_optimizer.minimize(avg_loss)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
status = TrainStatus(2)
fleet.save_check_point(exe, dir_path, train_status=status, fs=fs)
n1 = fleet._get_last_checkpoint_no(dir_path, fs=fs)
status2 = fleet.load_check_point(exe, dir_path, trainer_id=0, fs=fs)
self.assertEqual(status2, status)
fleet.save_check_point(exe, dir_path, train_status=status, fs=fs)
n2 = fleet._get_last_checkpoint_no(dir_path, fs=fs)
self.assertEqual(n2, n1 + 1)
fleet.clean_redundant_check_points(dir_path, fs=fs)
def test_hdfs_check_point(self):
try:
fs = BDFS("xxxx", "xxxx", 1 * 1000, 1 * 1000)
dir_path = "/user/Paddle_Data/gongweibao/edl_test/my_paddle_model"
self._test_check_point(fs, dir_path)
except Exception as e:
print(e)
def test_local_check_point(self):
fs = LocalFS()
dir_path = "./my_paddle_model"
self._test_check_point(fs, dir_path)
if __name__ == '__main__':
unittest.main()

@@ -6,6 +6,7 @@ launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
python ${launch_py} multi_process.py
# use paddlecloud
echo "begin test use paddlecloud"
cluster_node_ips="10.0.0.1"
node_ip="10.0.0.1"
export PADDLE_TRAINERS_NUM=2
@@ -14,7 +15,7 @@ export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0
export PADDLE_PORT=35019
export PADDLE_PORTS_NUM=2
export TRAINER_PORTS_NUM=2
distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip} --selected_gpus=0,1 --log_dir=testlog"
CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py
@@ -47,8 +48,9 @@ if [ -f $file_1 ]; then
rm $file_1
fi
unset PADDLE_PORT
unset PADDLE_PORTS_NUM
unset TRAINER_PORTS_NUM
echo ""
echo "paddle.distributed.launch async poll process test"

@@ -19,3 +19,4 @@ decorator
prettytable
objgraph
astor
pathlib
