Clean up the redundant files and unify the launch interface. (#28928)
parent 47af5c3c9d
commit 1358397e97
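Both the console script and the module form of the unified launcher appear in the tests below; for orientation, a minimal sketch of the equivalent invocations (train.py stands in for any training script and is not part of this change):

# collective (GPU) mode: console script, or the module form
fleetrun --gpus=0,1 train.py
python -m paddle.distributed.fleet.launch --gpus=0,1 train.py

# parameter-server mode: same launcher, PS flags
fleetrun --server_num=2 --worker_num=2 train.py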
@@ -1,165 +0,0 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
from __future__ import unicode_literals
import subprocess
import sys
import os
import copy
from argparse import ArgumentParser, REMAINDER


def parse_args():
    # Optional arguments for the launch helper
    parser = ArgumentParser(description="Distributed training")
    parser.add_argument(
        "--cluster_node_ips",
        type=str,
        default="127.0.0.1",
        help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..")

    parser.add_argument(
        "--node_ip",
        type=str,
        default="127.0.0.1",
        help="The current node ip. ")

    parser.add_argument(
        "--start_port",
        type=int,
        default=6170,
        help="The trainer's start port on a single node")

    parser.add_argument(
        "--print_config",
        type=bool,
        default=True,
        help="Print the config or not")

    parser.add_argument(
        "--endpoints", type=str, default="", help="User defined endpoints")

    parser.add_argument(
        "--worker_num", type=int, default=2, help="number of workers")

    parser.add_argument(
        "--server_num", type=int, default=2, help="number of servers")

    parser.add_argument(
        "--log_dir",
        default="logs",
        type=str,
        help="The path for each process's log.If it's not set, the log will printed to default pipe."
    )

    # positional
    parser.add_argument(
        "training_script",
        type=str,
        help="The full path to the single GPU training "
        "program/script to be launched in parallel, "
        "followed by all the arguments for the "
        "training script")

    # rest from the training program
    parser.add_argument('training_script_args', nargs=REMAINDER)
    return parser.parse_args()


def start_procs(args):
    worker_num = args.worker_num
    server_num = args.server_num
    start_port = args.start_port
    default_env = os.environ.copy()
    current_env = copy.copy(default_env)
    current_env.pop("http_proxy", None)
    current_env.pop("https_proxy", None)
    procs = []
    cmds = []
    log_fns = []
    ports = range(start_port, start_port + server_num, 1)
    default_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
    user_endpoints = ""
    if args.endpoints == "":
        user_endpoints = default_endpoints
    else:
        user_endpoints = args.endpoints
    user_endpoints_ips = [x.split(":")[0] for x in user_endpoints.split(",")]
    user_endpoints_port = [x.split(":")[1] for x in user_endpoints.split(",")]
    for i in range(server_num):
        current_env.update({
            "PADDLE_PSERVERS_IP_PORT_LIST": user_endpoints,
            "PADDLE_PORT": user_endpoints_port[i],
            "TRAINING_ROLE": "PSERVER",
            "PADDLE_TRAINERS_NUM": str(worker_num),
            "POD_IP": user_endpoints_ips[i]
        })

        cmd = [sys.executable, "-u", args.training_script
               ] + args.training_script_args
        cmds.append(cmd)
        if args.log_dir is not None:
            os.system("mkdir -p {}".format(args.log_dir))
            fn = open("%s/serverlog.%d" % (args.log_dir, i), "w")
            log_fns.append(fn)
            proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
        else:
            proc = subprocess.Popen(cmd, env=current_env)
        procs.append(proc)

    for i in range(worker_num):
        current_env.update({
            "PADDLE_PSERVERS_IP_PORT_LIST": user_endpoints,
            "PADDLE_TRAINERS_NUM": str(worker_num),
            "TRAINING_ROLE": "TRAINER",
            "PADDLE_TRAINER_ID": str(i)
        })
        cmd = [sys.executable, "-u", args.training_script
               ] + args.training_script_args
        cmds.append(cmd)
        if args.log_dir is not None:
            os.system("mkdir -p {}".format(args.log_dir))
            fn = open("%s/workerlog.%d" % (args.log_dir, i), "w")
            log_fns.append(fn)
            proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
        else:
            proc = subprocess.Popen(cmd, env=current_env)
        procs.append(proc)

    # only wait worker to finish here
    for i, proc in enumerate(procs):
        if i < server_num:
            continue
        procs[i].wait()
        if len(log_fns) > 0:
            log_fns[i].close()

    print("all workers exit, going to finish parameter server", file=sys.stderr)
    for i in range(server_num):
        if len(log_fns) > 0:
            log_fns[i].close()
        procs[i].terminate()
    print("all parameter server are killed", file=sys.stderr)


def launch():
    args = parse_args()
    if args.print_config:
        start_procs(args)


# server num, worker num
if __name__ == "__main__":
    launch()
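The deleted helper above communicated with its child processes purely through environment variables, and the unified launcher keeps that contract (the same names appear throughout the tests in this diff). A hedged shell sketch of how a launched process can branch on its assigned role:

# sketch only: inspect the role assigned by the launcher
if [ "${TRAINING_ROLE}" = "PSERVER" ]; then
    echo "server ${POD_IP}:${PADDLE_PORT} in ${PADDLE_PSERVERS_IP_PORT_LIST}"
else
    echo "trainer ${PADDLE_TRAINER_ID} of ${PADDLE_TRAINERS_NUM}"
fi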
@@ -0,0 +1,26 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import sys
import paddle.fluid as fluid

print("compile with cuda:", fluid.core.is_compiled_with_cuda())
print("get_cuda_device_count:", fluid.core.get_cuda_device_count())

if fluid.core.is_compiled_with_cuda() and fluid.core.get_cuda_device_count(
) > 0:
    sys.exit(0)
else:
    sys.exit(1)
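This probe reports only through its exit status, so shell tests can gate GPU-specific cases on it; the nproc test script below uses exactly this pattern:

# gate GPU-only cases on the probe's exit status
if python detected_gpu.py; then
    export CUDA_VISIBLE_DEVICES=0,1   # GPU variant
else
    export CUDA_VISIBLE_DEVICES=""    # CPU fallback
fi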
@@ -0,0 +1,38 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import time


def train(prefix):
    selected_gpus = os.getenv("FLAGS_selected_gpus")
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
    worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
    current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
    worker_endpoints = worker_endpoints_env
    trainers_num = len(worker_endpoints.split(','))

    name = "selected_gpus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\
        .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint, trainer_id)

    print(name)
    with open("{}.check_{}.log".format(prefix, trainer_id), "w") as f:
        f.write(name)


if __name__ == '__main__':
    prefix = sys.argv[1]
    train(prefix)
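This helper writes one <prefix>.check_<trainer_id>.log per rank, recording the environment the launcher provided; the shell tests then grep those logs. A minimal sketch of the round trip (the file name nproc_process.py is inferred from the tests below, since this view does not show file names):

# launch two ranks on one node, then verify rank 0's view of the cluster
python -m paddle.distributed.launch --nproc_per_node=2 nproc_process.py demo
grep -q "trainer_id:0" demo.check_0.log && echo "rank 0 ok"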
@@ -1,132 +0,0 @@
#!/bin/bash
set -e


function test_launch_ps(){
    fleetrun --server_num=2 --worker_num=2 fleet_ps_training.py 2> ut.elog
    if grep -q "server are killed" ut.elog; then
        echo "test pserver launch succeed"
    else
        echo "test pserver launch failed"
        exit -1
    fi

    fleetrun --servers="127.0.0.1:6780,127.0.0.1:6781" --workers="127.0.0.1:6782,127.0.0.1:6783" fleet_ps_training.py 2> ut.elog
    if grep -q "server are killed" ut.elog; then
        echo "test pserver launch succeed"
    else
        echo "test pserver launch failed"
        exit -1
    fi

    fleetrun --servers="127.0.0.1:6780,127.0.0.1:6781" --workers="127.0.0.1,127.0.0.1" fleet_ps_training.py 2> ut.elog
    if grep -q "server are killed" ut.elog; then
        echo "test pserver launch succeed"
    else
        echo "test pserver launch failed"
        exit -1
    fi
}

function test_launch_ps_heter(){
    fleetrun --server_num=2 --worker_num=2 --heter_worker_num=2 fleet_ps_training.py 2> ut.elog
    if grep -q "server are killed" ut.elog; then
        echo "test heter pserver launch succeed"
    else
        echo "test pserver launch failed"
        exit -1
    fi
}

if [[ ${WITH_GPU} == "OFF" ]]; then
    echo "in cpu test mode"
    test_launch_ps
    exit 0
fi

echo "No.1 unittest"
test_launch_ps
test_launch_ps_heter
# use default values
echo "No.2 unittest"
fleetrun multi_process.py fleetrun

# use paddlecloud
echo "begin test use paddlecloud"
cluster_node_ips="127.0.0.1,127.0.0.2"
export PADDLE_TRAINERS_NUM=2
export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0

export PADDLE_PORT=35789
export TRAINER_PORTS_NUM=2

echo "No.3 unittest"
distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog"
CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun

str1="selected_gpus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1"
file_0="multi_process_fleetrun.check_0.log"
file_1="multi_process_fleetrun.check_1.log"

echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
    echo "find trainer 0"
else
    echo "not find trainer 0"
    exit -1
fi

if grep -q "$str2" "$file_1"; then
    echo "find trainer 1"
else
    echo "not find trainer 1"
    exit -1
fi

# test async poll process
if [ -f $file_0 ]; then
    rm $file_0
fi
if [ -f $file_1 ]; then
    rm $file_1
fi

# test use DISTRIBUTED_TRAINER_ENDPOINTS env in paddlecloud
unset PADDLE_PORT
export DISTRIBUTED_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171

echo "No.4 unittest"
echo "paddle.distributed.launch async poll process test"
if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun abort; then
    echo "train abort as planned"
fi

abort_str1="abort>>> selected_gpus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0"

if grep -q "$abort_str1" "$file_0"; then
    echo "trainer 0 abort as planned"
else
    echo "trainer 0 not abort as planned"
    exit -1
fi

if [ ! -f $file_1 ]; then
    echo "trainer 1 terminate as planned"
else
    echo "trainer 1 not terminate as planned"
    exit -1
fi

#test for random ports
file_0_0="test_launch_filelock_0_0.log"
file_1_0="test_launch_filelock_1_0.log"
rm -rf $file_0_0 $file_0_1

distributed_args="--gpus=0,1 --log_dir=testlog"
export PADDLE_LAUNCH_LOG="test_launch_filelock_0"
echo "No.5 unittest"
CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} find_ports.py
str_0="worker_endpoints:127.0.0.1:6070,127.0.0.1:6071"
@@ -0,0 +1,54 @@
#!/bin/bash

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -e

# test use DISTRIBUTED_TRAINER_ENDPOINTS env in paddlecloud
unset PADDLE_PORT
export DISTRIBUTED_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171
export cluster_node_ips="127.0.0.1,127.0.0.2"
export PADDLE_TRAINERS_NUM=2
export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0

export TRAINER_PORTS_NUM=2

file_0="multi_process_fleetrun.check_0.log"
file_1="multi_process_fleetrun.check_1.log"

distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog"

echo "paddle.distributed.fleet.launch async poll process test"
if ! CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.fleet.launch ${distributed_args} multi_process.py fleetrun abort; then
    echo "train abort as planned"
fi

abort_str1="abort>>> selected_gpus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0"

if grep -q "$abort_str1" "$file_0"; then
    echo "trainer 0 abort as planned"
else
    echo "trainer 0 not abort as planned"
    exit -1
fi

if [ ! -f $file_1 ]; then
    echo "trainer 1 terminate as planned"
else
    echo "trainer 1 not terminate as planned"
    exit -1
fi
@@ -0,0 +1,59 @@
#!/bin/bash

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -e

# use paddlecloud
echo "begin test use paddlecloud"
cluster_node_ips="127.0.0.1,127.0.0.2"
export PADDLE_TRAINERS_NUM=2
export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0

export PADDLE_PORT=35789
export TRAINER_PORTS_NUM=2

distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog"
CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.fleet.launch ${distributed_args} multi_process.py fleetrun

str1="selected_gpus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1"
file_0="multi_process_fleetrun.check_0.log"
file_1="multi_process_fleetrun.check_1.log"

echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
    echo "find trainer 0"
else
    echo "not find trainer 0"
    exit -1
fi

if grep -q "$str2" "$file_1"; then
    echo "find trainer 1"
else
    echo "not find trainer 1"
    exit -1
fi

# test async poll process
if [ -f $file_0 ]; then
    rm $file_0
fi
if [ -f $file_1 ]; then
    rm $file_1
fi
@@ -0,0 +1,116 @@
#!/bin/bash

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -e
export FLAGS_START_PORT=35789

#local_ip=`ip route get 1 | awk '{print $NF;exit}'`
file_0="fleet_nproc_0.check_0.log"

function test_nproc_0(){
    gpus=$1
    rm -f ${file_0}
    distributed_args="--log_dir=testlog --nproc_per_node=1"
    # nproc_per_node=1, each with 2 gpus
    python -m paddle.distributed.launch ${distributed_args} nproc_process.py fleet_nproc_0

    str0="selected_gpus:${gpus} worker_endpoints:127.0.0.1:35789 trainers_num:1 current_endpoint:127.0.0.1:35789 trainer_id:0"
    if grep -q "$str0" "$file_0"; then
        echo "find trainer 0"
    else
        echo "not find trainer 0"
        exit -1
    fi
}

# unittest1:gpu
if python detected_gpu.py ; then
    echo "begin ut 1:"
    export CUDA_VISIBLE_DEVICES=0,1
    test_nproc_0 "0,1"
fi

# unittest2:cpu
if ! python detected_gpu.py ; then
    echo "begin ut 2:"
    export CUDA_VISIBLE_DEVICES=""
    test_nproc_0 ""
fi


function test_nproc_1_gpu(){
    file_0="fleet_nproc_1.check_0.log"
    file_1="fleet_nproc_1.check_1.log"
    rm -f ${file_0} ${file_1}

    distributed_args="--log_dir=testlog --nproc_per_node=2"
    python -m paddle.distributed.launch ${distributed_args} nproc_process.py fleet_nproc_1

    str0="selected_gpus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790 trainers_num:2 current_endpoint:127.0.0.1:35789 trainer_id:0"
    if grep -q "$str0" "$file_0"; then
        echo "find trainer 0"
    else
        echo "not find trainer 0"
        exit -1
    fi

    str1="selected_gpus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790 trainers_num:2 current_endpoint:127.0.0.1:35790 trainer_id:1"
    if grep -q "$str1" "$file_1"; then
        echo "find trainer 1"
    else
        echo "not find trainer 1"
        exit -1
    fi
}

# unittest3: nproc_per_node=2, each with 1 gpus
if python detected_gpu.py ; then
    echo "begin ut 3:"
    export CUDA_VISIBLE_DEVICES=0,1
    test_nproc_1_gpu
fi

function test_nproc_1_cpu(){
    file_0="fleet_nproc_1.check_0.log"
    file_1="fleet_nproc_1.check_1.log"
    rm -f ${file_0} ${file_1}

    distributed_args="--log_dir=testlog --nproc_per_node=2"
    python -m paddle.distributed.launch ${distributed_args} nproc_process.py fleet_nproc_1

    str0="selected_gpus: worker_endpoints:127.0.0.1:35789,127.0.0.1:35790 trainers_num:2 current_endpoint:127.0.0.1:35789 trainer_id:0"
    if grep -q "$str0" "$file_0"; then
        echo "find trainer 0"
    else
        echo "not find trainer 0"
        exit -1
    fi

    str1="selected_gpus: worker_endpoints:127.0.0.1:35789,127.0.0.1:35790 trainers_num:2 current_endpoint:127.0.0.1:35790 trainer_id:1"
    if grep -q "$str1" "$file_1"; then
        echo "find trainer 1"
    else
        echo "not find trainer 1"
        exit -1
    fi
}

# unittest4: nproc_per_node=2, cpu
if ! python detected_gpu.py ; then
    echo "begin ut 4:"
    export CUDA_VISIBLE_DEVICES=""
    test_nproc_1_cpu
fi
@@ -0,0 +1,62 @@
#!/bin/bash

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -e

function test_launch_ps(){
    python -m paddle.distributed.fleet.launch --server_num=2 --worker_num=2 fleet_ps_training.py 2> ut.elog
    if grep -q "server are killed" ut.elog; then
        echo "test pserver launch succeed"
    else
        echo "test pserver launch failed"
        exit -1
    fi

    python -m paddle.distributed.fleet.launch --servers="127.0.0.1:6780,127.0.0.1:6781" --workers="127.0.0.1:6782,127.0.0.1:6783" fleet_ps_training.py 2> ut.elog
    if grep -q "server are killed" ut.elog; then
        echo "test pserver launch succeed"
    else
        echo "test pserver launch failed"
        exit -1
    fi

    python -m paddle.distributed.fleet.launch --servers="127.0.0.1:6780,127.0.0.1:6781" --workers="127.0.0.1,127.0.0.1" fleet_ps_training.py 2> ut.elog
    if grep -q "server are killed" ut.elog; then
        echo "test pserver launch succeed"
    else
        echo "test pserver launch failed"
        exit -1
    fi
}

function test_launch_ps_heter(){
    python -m paddle.distributed.fleet.launch --server_num=2 --worker_num=2 --heter_worker_num=2 fleet_ps_training.py 2> ut.elog
    if grep -q "server are killed" ut.elog; then
        echo "test heter pserver launch succeed"
    else
        echo "test pserver launch failed"
        exit -1
    fi
}

if [[ ${WITH_GPU} == "OFF" ]]; then
    echo "in cpu test mode"
    test_launch_ps
    exit 0
fi

test_launch_ps
test_launch_ps_heter
@@ -0,0 +1,27 @@
#!/bin/bash

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -e

# test for random ports
file_0_0="test_launch_filelock_0_0.log"
file_1_0="test_launch_filelock_1_0.log"
rm -rf $file_0_0 $file_1_0

distributed_args="--gpus=0,1 --log_dir=testlog"
export PADDLE_LAUNCH_LOG="test_launch_filelock_0"
CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.fleet.launch ${distributed_args} find_ports.py
str_0="worker_endpoints:127.0.0.1:6070,127.0.0.1:6071"
@@ -0,0 +1,20 @@
#!/bin/bash

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -e

# use default values
fleetrun multi_process.py fleetrun
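fleetrun is the console-script form of the module invocation used elsewhere in this diff, so the default-values case above can equivalently be written as (sketch):

python -m paddle.distributed.fleet.launch multi_process.py fleetrun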
@@ -1,85 +0,0 @@
#!/bin/bash
set -e
# use default values
# FIXME: random fails on Unknown command lines -c (or -m).
launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
python ${launch_py} multi_process.py launch

# use paddlecloud
echo "begin test use paddlecloud"
cluster_node_ips="10.0.0.1"
node_ip="10.0.0.1"
export PADDLE_TRAINERS_NUM=2
export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0

export PADDLE_PORT=35019
export TRAINER_PORTS_NUM=2

distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip} --selected_gpus=0,1 --log_dir=testlog"
CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py launch

str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
file_0="multi_process_launch.check_0.log"
file_1="multi_process_launch.check_1.log"

echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
    echo "find trainer 0"
else
    echo "not find trainer 0"
    exit -1
fi

if grep -q "$str2" "$file_1"; then
    echo "find trainer 1"
else
    echo "not find trainer 1"
    exit -1
fi

# test async poll process
if [ -f $file_0 ]; then
    rm $file_0
fi
if [ -f $file_1 ]; then
    rm $file_1
fi

# test use DISTRIBUTED_TRAINER_ENDPOINTS env in paddlecloud
unset PADDLE_PORT
export DISTRIBUTED_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171

echo ""
echo "paddle.distributed.launch async poll process test"
if ! CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py launch abort; then
    echo "train abort as planned"
fi

abort_str1="abort>>> selected_gpus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0"

if grep -q "$abort_str1" "$file_0"; then
    echo "trainer 0 abort as planned"
else
    echo "trainer 0 not abort as planned"
    exit -1
fi

if [ ! -f $file_1 ]; then
    echo "trainer 1 terminate as planned"
else
    echo "trainer 1 not terminate as planned"
    exit -1
fi

#test for random ports
file_0_0="test_launch_filelock_0_0.log"
file_1_0="test_launch_filelock_1_0.log"
rm -rf $file_0_0 $file_0_1

distributed_args="--selected_gpus=0,1 --log_dir=testlog"
export PADDLE_LAUNCH_LOG="test_launch_filelock_0"
CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} find_ports.py
str_0="worker_endpoints:127.0.0.1:6070,127.0.0.1:6071"
@@ -0,0 +1,120 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
import sys
import subprocess
import os
import time
import six
import copy
import unittest
import paddle.fluid as fluid

from argparse import ArgumentParser, REMAINDER
from paddle.distributed.utils import _print_arguments, get_gpus, get_cluster_from_args


def _parse_args():
    parser = ArgumentParser(
        description='''start paddle training using multi-process mode.
NOTE: your train program ***must*** run as distributed nccl2 mode,
see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
And your train program must read environment variables below in order to let different
process init properly:
    FLAGS_selected_gpus
    PADDLE_TRAINER_ID
    PADDLE_CURRENT_ENDPOINT
    PADDLE_TRAINERS_NUM
    PADDLE_TRAINER_ENDPOINTS
    POD_IP (current node ip address, not needed for local training)
''')

    # Optional arguments for the launch helper
    parser.add_argument(
        "--cluster_node_ips",
        type=str,
        default="127.0.0.1",
        help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..")
    parser.add_argument(
        "--node_ip",
        type=str,
        default="127.0.0.1",
        help="The current node ip. ")
    parser.add_argument(
        "--use_paddlecloud",
        action='store_true',
        help="whether to use paddlecloud platform to run your multi-process job. If false, no need to set this argument."
    )
    parser.add_argument(
        "--started_port",
        type=int,
        default=None,
        help="The trainer's started port on a single node")

    parser.add_argument(
        "--print_config",
        type=bool,
        default=True,
        help="Print the config or not")

    parser.add_argument(
        "--selected_gpus",
        type=str,
        default=None,
        help="It's for gpu training and the training process will run on the selected_gpus, "
        "each process is bound to a single GPU. And if it's not set, this module will use all the gpu cards for training."
    )

    parser.add_argument(
        "--log_level",
        type=int,
        default=20,  # logging.INFO, details are here: https://docs.python.org/3/library/logging.html#levels
        help="Logging level, default is logging.INFO")

    parser.add_argument(
        "--log_dir",
        type=str,
        help="The path for each process's log. If it's not set, the log will be printed to the default pipe."
    )

    # positional
    parser.add_argument(
        "training_script",
        type=str,
        help="The full path to the single GPU training "
        "program/script to be launched in parallel, "
        "followed by all the arguments for the "
        "training script")

    # rest from the training program
    parser.add_argument('training_script_args', nargs=REMAINDER)
    return parser.parse_args()


class TestCoverage(unittest.TestCase):
    def test_gpus(self):
        args = _parse_args()

        if args.print_config:
            _print_arguments(args)

        gpus = get_gpus(None)

        args.use_paddlecloud = True
        cluster, pod = get_cluster_from_args(args, "0")


if __name__ == '__main__':
    unittest.main()
@@ -1,12 +0,0 @@
#!/bin/bash
set -e
# use default values
launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch_ps.py
python ${launch_py} fleet_ps_training.py 2> ut.elog

if grep -q "server are killed" ut.elog; then
    echo "succeed"
else
    echo "failed"
    exit -1
fi