From 0d4ce6ac5dc8302688a3190dfbae9e2844f4c602 Mon Sep 17 00:00:00 2001
From: danleifeng <52735331+danleifeng@users.noreply.github.com>
Date: Fri, 7 Aug 2020 10:38:53 +0800
Subject: [PATCH] fix test_launch and test_fleet_launch bug; test=develop
 (#26015)

---
 .../fluid/tests/unittests/multi_process.py    | 23 +++++++++++--------
 .../tests/unittests/test_fleet_launch.sh      | 17 +++++++-------
 .../fluid/tests/unittests/test_launch.sh      | 10 ++++----
 3 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/multi_process.py b/python/paddle/fluid/tests/unittests/multi_process.py
index a67634adfc..f999ce803a 100644
--- a/python/paddle/fluid/tests/unittests/multi_process.py
+++ b/python/paddle/fluid/tests/unittests/multi_process.py
@@ -17,7 +17,7 @@ import sys
 import time
 
 
-def train():
+def train(prefix):
     selected_gpus = os.getenv("FLAGS_selected_gpus")
     trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
     worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
@@ -29,11 +29,12 @@ def train():
         .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
 
     print(name)
-    with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
+    with open("multi_process_{}.check_{}.log".format(prefix, trainer_id),
+              "w") as f:
         f.write(name)
 
 
-def train_abort():
+def train_abort(prefix):
     selected_gpus = os.getenv("FLAGS_selected_gpus")
     trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
     worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
@@ -49,8 +50,9 @@ def train_abort():
             name = "abort>>> selected_gpus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\
                 .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
             print(name)
-            with open("multi_process.check_{}.log".format(trainer_id),
-                      "w") as f:
+            with open(
+                    "multi_process_{}.check_{}.log".format(prefix, trainer_id),
+                    "w") as f:
                 f.write(name)
             raise
     else:
@@ -60,12 +62,15 @@ def train_abort():
             .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
 
         print(name)
-        with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
+        with open("multi_process_{}.check_{}.log".format(prefix, trainer_id),
+                  "w") as f:
             f.write(name)
 
 
 if __name__ == '__main__':
-    if len(sys.argv) == 2 and sys.argv[1] == "abort":
-        train_abort()
+    if len(sys.argv) == 3 and sys.argv[2] == "abort":
+        prefix = sys.argv[1]
+        train_abort(prefix)
     else:
-        train()
+        prefix = sys.argv[1]
+        train(prefix)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
index 577f9f6504..5e5c4e17f5 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
@@ -4,7 +4,6 @@ set -e
 
 function test_launch_ps(){
     fleetrun --server_num=2 --worker_num=2 fleet_ps_training.py 2> ut.elog
-
     if grep -q "server are killed" ut.elog; then
         echo "test pserver launch succeed"
     else
@@ -20,7 +19,7 @@ fi
 
 test_launch_ps
 # use default values
-fleetrun multi_process.py
+fleetrun multi_process.py fleetrun
 
 # use paddlecloud
 echo "begin test use paddlecloud"
@@ -30,16 +29,16 @@ export POD_IP=127.0.0.1
 export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
 export PADDLE_TRAINER_ID=0
 
-export PADDLE_PORT=35019
+export PADDLE_PORT=35789
 export TRAINER_PORTS_NUM=2
 
 distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog"
-CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py
+CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun
 
-str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
-str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
-file_0="multi_process.check_0.log"
-file_1="multi_process.check_1.log"
+str1="selected_gpus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0"
+str2="selected_gpus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1"
+file_0="multi_process_fleetrun.check_0.log"
+file_1="multi_process_fleetrun.check_1.log"
 
 echo "paddlecloud params test"
 if grep -q "$str1" "$file_0"; then
@@ -70,7 +69,7 @@ unset TRAINER_PORTS_NUM
 
 echo ""
 echo "paddle.distributed.launch async poll process test"
-if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py abort; then
+if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun abort; then
     echo "train abort as planned"
 fi
 
diff --git a/python/paddle/fluid/tests/unittests/test_launch.sh b/python/paddle/fluid/tests/unittests/test_launch.sh
index f1bf6395f1..98c907a551 100644
--- a/python/paddle/fluid/tests/unittests/test_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_launch.sh
@@ -3,7 +3,7 @@ set -e
 # use default values
 # FIXME: random fails on Unknown command lines -c (or -m).
 launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
-python ${launch_py} multi_process.py
+python ${launch_py} multi_process.py launch
 
 # use paddlecloud
 echo "begin test use paddlecloud"
@@ -18,12 +18,12 @@ export PADDLE_PORT=35019
 export TRAINER_PORTS_NUM=2
 
 distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip} --selected_gpus=0,1 --log_dir=testlog"
-CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py
+CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py launch
 
 str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
 str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
-file_0="multi_process.check_0.log"
-file_1="multi_process.check_1.log"
+file_0="multi_process_launch.check_0.log"
+file_1="multi_process_launch.check_1.log"
 
 echo "paddlecloud params test"
 if grep -q "$str1" "$file_0"; then
@@ -54,7 +54,7 @@ unset TRAINER_PORTS_NUM
 
 echo ""
 echo "paddle.distributed.launch async poll process test"
-if ! CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py abort; then
+if ! CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py launch abort; then
     echo "train abort as planned"
 fi
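
Note (not part of the patch): the snippet below is a minimal sketch of the naming contract introduced by the new positional "prefix" argument to multi_process.py. Each test script passes its own prefix ("fleetrun" or "launch"), so the two suites write and grep distinct check logs instead of sharing multi_process.check_<id>.log. The file name naming_sketch.py is hypothetical and only used for illustration.

import sys


def check_log_name(prefix, trainer_id):
    # Mirrors the file-name pattern used by the patched multi_process.py.
    return "multi_process_{}.check_{}.log".format(prefix, trainer_id)


if __name__ == '__main__':
    # e.g. `python naming_sketch.py fleetrun` prints
    # multi_process_fleetrun.check_0.log
    prefix = sys.argv[1] if len(sys.argv) > 1 else "launch"
    print(check_log_name(prefix, 0))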