From 0d4ce6ac5dc8302688a3190dfbae9e2844f4c602 Mon Sep 17 00:00:00 2001
From: danleifeng <52735331+danleifeng@users.noreply.github.com>
Date: Fri, 7 Aug 2020 10:38:53 +0800
Subject: [PATCH] fix test_launch and test_fleet_launch bug; test=develop
 (#26015)

---
 .../fluid/tests/unittests/multi_process.py    | 23 +++++++++++--------
 .../tests/unittests/test_fleet_launch.sh      | 17 +++++++-------
 .../fluid/tests/unittests/test_launch.sh      | 10 ++++----
 3 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/multi_process.py b/python/paddle/fluid/tests/unittests/multi_process.py
index a67634adfc..f999ce803a 100644
--- a/python/paddle/fluid/tests/unittests/multi_process.py
+++ b/python/paddle/fluid/tests/unittests/multi_process.py
@@ -17,7 +17,7 @@ import sys
 import time
 
 
-def train():
+def train(prefix):
     selected_gpus = os.getenv("FLAGS_selected_gpus")
     trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
     worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
@@ -29,11 +29,12 @@ def train():
         .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
 
     print(name)
-    with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
+    with open("multi_process_{}.check_{}.log".format(prefix, trainer_id),
+              "w") as f:
         f.write(name)
 
 
-def train_abort():
+def train_abort(prefix):
     selected_gpus = os.getenv("FLAGS_selected_gpus")
     trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
     worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
@@ -49,8 +50,9 @@ def train_abort():
             name = "abort>>> selected_gpus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\
                 .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
             print(name)
-            with open("multi_process.check_{}.log".format(trainer_id),
-                      "w") as f:
+            with open(
+                    "multi_process_{}.check_{}.log".format(prefix, trainer_id),
+                    "w") as f:
                 f.write(name)
             raise
     else:
@@ -60,12 +62,15 @@ def train_abort():
             .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
 
         print(name)
-        with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
+        with open("multi_process_{}.check_{}.log".format(prefix, trainer_id),
+                  "w") as f:
             f.write(name)
 
 
 if __name__ == '__main__':
-    if len(sys.argv) == 2 and sys.argv[1] == "abort":
-        train_abort()
+    if len(sys.argv) == 3 and sys.argv[2] == "abort":
+        prefix = sys.argv[1]
+        train_abort(prefix)
     else:
-        train()
+        prefix = sys.argv[1]
+        train(prefix)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
index 577f9f6504..5e5c4e17f5 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
@@ -4,7 +4,6 @@ set -e
 
 function test_launch_ps(){
     fleetrun --server_num=2 --worker_num=2 fleet_ps_training.py 2> ut.elog
-
     if grep -q "server are killed" ut.elog; then
         echo "test pserver launch succeed"
     else
@@ -20,7 +19,7 @@ fi
 
 test_launch_ps
 # use default values
-fleetrun multi_process.py
+fleetrun multi_process.py fleetrun
 
 # use paddlecloud
 echo "begin test use paddlecloud"
@@ -30,16 +29,16 @@ export POD_IP=127.0.0.1
 export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
 export PADDLE_TRAINER_ID=0
 
-export PADDLE_PORT=35019
+export PADDLE_PORT=35789
 export TRAINER_PORTS_NUM=2
 
 distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog"
-CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py
+CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun
 
-str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
-str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
-file_0="multi_process.check_0.log"
-file_1="multi_process.check_1.log"
+str1="selected_gpus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0"
+str2="selected_gpus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1"
+file_0="multi_process_fleetrun.check_0.log"
+file_1="multi_process_fleetrun.check_1.log"
 
 echo "paddlecloud params test"
 if grep -q "$str1" "$file_0"; then
@@ -70,7 +69,7 @@ unset TRAINER_PORTS_NUM
 
 echo ""
 echo "paddle.distributed.launch async poll process test"
-if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py abort; then
+if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun abort; then
     echo "train abort as planned"
 fi
 
diff --git a/python/paddle/fluid/tests/unittests/test_launch.sh b/python/paddle/fluid/tests/unittests/test_launch.sh
index f1bf6395f1..98c907a551 100644
--- a/python/paddle/fluid/tests/unittests/test_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_launch.sh
@@ -3,7 +3,7 @@ set -e
 # use default values
 # FIXME: random fails on Unknown command lines -c (or -m).
 launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
-python ${launch_py} multi_process.py
+python ${launch_py} multi_process.py launch
 
 # use paddlecloud
 echo "begin test use paddlecloud"
@@ -18,12 +18,12 @@ export PADDLE_PORT=35019
 export TRAINER_PORTS_NUM=2
 
 distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip} --selected_gpus=0,1 --log_dir=testlog"
-CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py
+CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py launch
 
 str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
 str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
-file_0="multi_process.check_0.log"
-file_1="multi_process.check_1.log"
+file_0="multi_process_launch.check_0.log"
+file_1="multi_process_launch.check_1.log"
 
 echo "paddlecloud params test"
 if grep -q "$str1" "$file_0"; then
@@ -54,7 +54,7 @@ unset TRAINER_PORTS_NUM
 
 echo ""
 echo "paddle.distributed.launch async poll process test"
-if ! CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py abort; then
+if ! CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py launch abort; then
     echo "train abort as planned"
 fi
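
Note (not part of the patch): the snippet below is a minimal sketch of the naming contract introduced by the new positional "prefix" argument to multi_process.py. Each test script passes its own prefix ("fleetrun" or "launch"), so the two suites write and grep distinct check logs instead of sharing multi_process.check_<id>.log. The file name naming_sketch.py is hypothetical and only used for illustration.

import sys


def check_log_name(prefix, trainer_id):
    # Mirrors the file-name pattern used by the patched multi_process.py.
    return "multi_process_{}.check_{}.log".format(prefix, trainer_id)


if __name__ == '__main__':
    # e.g. `python naming_sketch.py fleetrun` prints
    # multi_process_fleetrun.check_0.log
    prefix = sys.argv[1] if len(sys.argv) > 1 else "launch"
    print(check_log_name(prefix, 0))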