【paddle.fleet】add fleetrun command for distributed running (#25806)
* add fleetrun command for distributed running; test=develop
parent b717895f64
commit 3dd2e3801a
@@ -0,0 +1,85 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import paddle
from paddle.fleet.launch_utils import get_cluster, logger


def get_cloud_cluster(args_node_ips, selected_gpus, args_port=6170):
    """
    args_node_ips: comma-separated node IP string.
    """
    # IP info can be obtained automatically when using paddlecloud multi-node mode.
    node_ips = os.getenv("PADDLE_TRAINERS")
    assert node_ips is not None, "PADDLE_TRAINERS should not be None"

    node_ip = os.getenv("POD_IP")
    assert node_ip is not None, "POD_IP should not be None"

    node_rank = os.getenv("PADDLE_TRAINER_ID")
    assert node_rank is not None, "PADDLE_TRAINER_ID should not be None"

    node_ips = node_ips.split(",")
    num_nodes = len(node_ips)
    node_rank = int(node_rank)

    if args_node_ips != "127.0.0.1" and args_node_ips != ",".join(node_ips):
        logger.warning(
            "Please NOTE: When using paddlecloud, cluster_node_ips is "
            "automatically obtained from PADDLE_TRAINERS (multi-node) or "
            "POD_IP (single node). Your input cluster_node_ips: {} doesn't "
            "equal the IPs: {} from the paddlecloud environment.".format(
                args_node_ips, node_ips))

    started_port = args_port
    print("num_nodes:", num_nodes)
    if num_nodes > 1:
        try:
            paddle_port = int(os.getenv("PADDLE_PORT", ""))
            paddle_port_num = int(os.getenv("TRAINER_PORTS_NUM", ""))

            # Prefer the cloud-assigned port when the cloud provides enough
            # ports for the selected GPUs and it differs from the user's port.
            if paddle_port_num >= len(
                    selected_gpus) and paddle_port != args_port:
                logger.warning("Use Cloud specified port:{}.".format(
                    paddle_port))
                started_port = paddle_port

        except Exception as e:
            # PADDLE_PORT/TRAINER_PORTS_NUM missing or malformed: keep args_port.
            print(e)

    if started_port is None:
        started_port = 6170

    logger.debug("parsed from args: node_ips:{} "
                 "node_ip:{} node_rank:{} started_port:{}"
                 .format(node_ips, node_ip, node_rank, started_port))

    # One port per selected GPU on this node.
    ports = [x for x in range(started_port, started_port + len(selected_gpus))]
    cluster, pod = get_cluster(node_ips, node_ip, ports, selected_gpus)
    return cluster, cluster.pods[node_rank]


def use_paddlecloud():
    node_ips = os.getenv("PADDLE_TRAINERS")
    node_ip = os.getenv("POD_IP")
    node_rank = os.getenv("PADDLE_TRAINER_ID")
    if node_ips is None or node_ip is None or node_rank is None:
        return False
    else:
        return True


def get_trainers_num():
    return int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
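For context, here is a minimal sketch of one way a launcher could compose the helpers above. It is illustrative only and not part of this diff: the module path paddle.fleet.cloud_utils, the function launch_on_cloud() and its arguments are assumptions.

# Illustrative sketch only (not part of the diff). Assumes the file above is
# importable as paddle.fleet.cloud_utils; launch_on_cloud() is a hypothetical name.
from paddle.fleet import cloud_utils


def launch_on_cloud(node_ips="127.0.0.1", selected_gpus=("0", "1"), port=6170):
    if not cloud_utils.use_paddlecloud():
        # Outside paddlecloud, a real launcher would build the cluster from
        # user-supplied arguments instead of the environment.
        raise RuntimeError("paddlecloud environment variables are not set")

    # Builds the cluster description from PADDLE_TRAINERS/POD_IP/PADDLE_TRAINER_ID
    # and returns the pod that corresponds to this node's rank.
    cluster, pod = cloud_utils.get_cloud_cluster(node_ips, list(selected_gpus), port)
    print("trainers:", cloud_utils.get_trainers_num(), "pod rank:", pod.rank)
    return cluster, pod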
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,101 @@
#!/bin/bash
set -e


function test_launch_ps(){
    fleetrun --server_num=2 --worker_num=2 fleet_ps_training.py 2> ut.elog

    if grep -q "server are killed" ut.elog; then
        echo "test pserver launch succeeded"
    else
        echo "test pserver launch failed"
        exit -1
    fi
}

if [[ ${WITH_GPU} == "OFF" ]]; then
    test_launch_ps
    exit 0
fi

test_launch_ps
# use default values
fleetrun multi_process.py

# use paddlecloud
echo "begin paddlecloud test"
cluster_node_ips="127.0.0.1,127.0.0.2"
export PADDLE_TRAINERS_NUM=2
export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0

export PADDLE_PORT=35019
export TRAINER_PORTS_NUM=2

distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog"
CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py

str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
file_0="multi_process.check_0.log"
file_1="multi_process.check_1.log"

echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
    echo "found trainer 0"
else
    echo "trainer 0 not found"
    exit -1
fi

if grep -q "$str2" "$file_1"; then
    echo "found trainer 1"
else
    echo "trainer 1 not found"
    exit -1
fi

# test async poll process
if [ -f $file_0 ]; then
    rm $file_0
fi
if [ -f $file_1 ]; then
    rm $file_1
fi


unset PADDLE_PORT
unset TRAINER_PORTS_NUM

echo ""
echo "paddle.distributed.launch async poll process test"
if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py abort; then
    echo "train aborted as planned"
fi

abort_str1="abort>>> selected_gpus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0"

if grep -q "$abort_str1" "$file_0"; then
    echo "trainer 0 aborted as planned"
else
    echo "trainer 0 did not abort as planned"
    exit -1
fi

if [ ! -f $file_1 ]; then
    echo "trainer 1 terminated as planned"
else
    echo "trainer 1 did not terminate as planned"
    exit -1
fi

# test for random ports
file_0_0="test_launch_filelock_0_0.log"
file_1_0="test_launch_filelock_1_0.log"
rm -rf $file_0_0 $file_1_0

distributed_args="--gpus=0,1 --log_dir=testlog"
export PADDLE_LAUNCH_LOG="test_launch_filelock_0"
CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} find_ports.py
str_0="worker_endpoints:127.0.0.1:6070,127.0.0.1:6071"
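For reference, a hedged sketch of the kind of check program the grep assertions above target; the actual contents of multi_process.py are not shown in this diff, and the per-process environment variables read below (FLAGS_selected_gpus, PADDLE_TRAINER_ID, PADDLE_TRAINER_ENDPOINTS, PADDLE_CURRENT_ENDPOINT) are assumed to be exported by fleetrun, as the existing paddle.distributed.launch tooling does.

# Hypothetical stand-in for multi_process.py (not part of this diff): prints
# and logs a line in the format the shell test greps for.
import os
import sys


def train(prefix=""):
    selected_gpus = os.getenv("FLAGS_selected_gpus", "")
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "")
    current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "")
    # Derive the trainer count from the endpoint list (4 in the two-node,
    # two-GPU paddlecloud case exercised above).
    trainers_num = len(worker_endpoints.split(","))

    line = prefix + ("selected_gpus:{} worker_endpoints:{} trainers_num:{} "
                     "current_endpoint:{} trainer_id:{}".format(
                         selected_gpus, worker_endpoints, trainers_num,
                         current_endpoint, trainer_id))

    # The test greps multi_process.check_<rank>.log for this exact line.
    with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
        f.write(line)
    print(line)


if __name__ == "__main__":
    # Passing "abort" reproduces the "abort>>> " prefix checked by the
    # async-poll test; the real script additionally makes trainer 0 exit abnormally.
    abort = len(sys.argv) > 1 and sys.argv[1] == "abort"
    train("abort>>> " if abort else "")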