【paddle.fleet】add fleetrun command for distributed running (#25806)
* add fleetrun command for distributed running; test=develop
parent b717895f64
commit 3dd2e3801a
@@ -0,0 +1,85 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import paddle
from paddle.fleet.launch_utils import get_cluster, logger


def get_cloud_cluster(args_node_ips, selected_gpus, args_port=6170):
    """
    args_node_ips: comma-separated node IP string.
    """
    # IP info can be obtained automatically when using paddlecloud multi-node mode.
    node_ips = os.getenv("PADDLE_TRAINERS")
    assert node_ips is not None, "PADDLE_TRAINERS should not be None"

    node_ip = os.getenv("POD_IP")
    assert node_ip is not None, "POD_IP should not be None"

    node_rank = os.getenv("PADDLE_TRAINER_ID")
    assert node_rank is not None, "PADDLE_TRAINER_ID should not be None"

    node_ips = node_ips.split(",")
    num_nodes = len(node_ips)
    node_rank = int(node_rank)

    if args_node_ips != "127.0.0.1" and args_node_ips != ",".join(node_ips):
        logger.warning(
            "Please NOTE: When using paddlecloud, cluster_node_ips is "
            "automatically obtained from PADDLE_TRAINERS (multi-node) or "
            "POD_IP (single node). Your input cluster_node_ips: {} doesn't "
            "equal the IPs: {} from the paddlecloud environment.".format(
                args_node_ips, node_ips))

    started_port = args_port
    print("num_nodes:", num_nodes)
    if num_nodes > 1:
        try:
            paddle_port = int(os.getenv("PADDLE_PORT", ""))
            paddle_port_num = int(os.getenv("TRAINER_PORTS_NUM", ""))

            # Prefer the cloud-assigned port when the cloud provides enough
            # ports for the selected GPUs and it differs from the user's port.
            if paddle_port_num >= len(
                    selected_gpus) and paddle_port != args_port:
                logger.warning("Use Cloud specified port:{}.".format(
                    paddle_port))
                started_port = paddle_port

        except Exception as e:
            # PADDLE_PORT/TRAINER_PORTS_NUM missing or malformed: keep args_port.
            print(e)

    if started_port is None:
        started_port = 6170

    logger.debug("parsed from args: node_ips:{} "
                 "node_ip:{} node_rank:{} started_port:{}"
                 .format(node_ips, node_ip, node_rank, started_port))

    # One port per selected GPU on this node.
    ports = [x for x in range(started_port, started_port + len(selected_gpus))]
    cluster, pod = get_cluster(node_ips, node_ip, ports, selected_gpus)
    return cluster, cluster.pods[node_rank]


def use_paddlecloud():
    node_ips = os.getenv("PADDLE_TRAINERS")
    node_ip = os.getenv("POD_IP")
    node_rank = os.getenv("PADDLE_TRAINER_ID")
    if node_ips is None or node_ip is None or node_rank is None:
        return False
    else:
        return True


def get_trainers_num():
    return int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
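For context, here is a minimal sketch of one way a launcher could compose the helpers above. It is illustrative only and not part of this diff: the module path paddle.fleet.cloud_utils, the function launch_on_cloud() and its arguments are assumptions.

# Illustrative sketch only (not part of the diff). Assumes the file above is
# importable as paddle.fleet.cloud_utils; launch_on_cloud() is a hypothetical name.
from paddle.fleet import cloud_utils


def launch_on_cloud(node_ips="127.0.0.1", selected_gpus=("0", "1"), port=6170):
    if not cloud_utils.use_paddlecloud():
        # Outside paddlecloud, a real launcher would build the cluster from
        # user-supplied arguments instead of the environment.
        raise RuntimeError("paddlecloud environment variables are not set")

    # Builds the cluster description from PADDLE_TRAINERS/POD_IP/PADDLE_TRAINER_ID
    # and returns the pod that corresponds to this node's rank.
    cluster, pod = cloud_utils.get_cloud_cluster(node_ips, list(selected_gpus), port)
    print("trainers:", cloud_utils.get_trainers_num(), "pod rank:", pod.rank)
    return cluster, pod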
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,101 @@
#!/bin/bash
set -e


function test_launch_ps(){
    fleetrun --server_num=2 --worker_num=2 fleet_ps_training.py 2> ut.elog

    if grep -q "server are killed" ut.elog; then
        echo "test pserver launch succeeded"
    else
        echo "test pserver launch failed"
        exit -1
    fi
}

if [[ ${WITH_GPU} == "OFF" ]]; then
    test_launch_ps
    exit 0
fi

test_launch_ps
# use default values
fleetrun multi_process.py

# use paddlecloud
echo "begin paddlecloud test"
cluster_node_ips="127.0.0.1,127.0.0.2"
export PADDLE_TRAINERS_NUM=2
export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0

export PADDLE_PORT=35019
export TRAINER_PORTS_NUM=2

distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog"
CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py

str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
file_0="multi_process.check_0.log"
file_1="multi_process.check_1.log"

echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
    echo "found trainer 0"
else
    echo "trainer 0 not found"
    exit -1
fi

if grep -q "$str2" "$file_1"; then
    echo "found trainer 1"
else
    echo "trainer 1 not found"
    exit -1
fi

# test async poll process
if [ -f $file_0 ]; then
    rm $file_0
fi
if [ -f $file_1 ]; then
    rm $file_1
fi


unset PADDLE_PORT
unset TRAINER_PORTS_NUM

echo ""
echo "paddle.distributed.launch async poll process test"
if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py abort; then
    echo "train aborted as planned"
fi

abort_str1="abort>>> selected_gpus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0"

if grep -q "$abort_str1" "$file_0"; then
    echo "trainer 0 aborted as planned"
else
    echo "trainer 0 did not abort as planned"
    exit -1
fi

if [ ! -f $file_1 ]; then
    echo "trainer 1 terminated as planned"
else
    echo "trainer 1 did not terminate as planned"
    exit -1
fi

# test for random ports
file_0_0="test_launch_filelock_0_0.log"
file_1_0="test_launch_filelock_1_0.log"
rm -rf $file_0_0 $file_1_0

distributed_args="--gpus=0,1 --log_dir=testlog"
export PADDLE_LAUNCH_LOG="test_launch_filelock_0"
CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} find_ports.py
str_0="worker_endpoints:127.0.0.1:6070,127.0.0.1:6071"
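For reference, a hedged sketch of the kind of check program the grep assertions above target; the actual contents of multi_process.py are not shown in this diff, and the per-process environment variables read below (FLAGS_selected_gpus, PADDLE_TRAINER_ID, PADDLE_TRAINER_ENDPOINTS, PADDLE_CURRENT_ENDPOINT) are assumed to be exported by fleetrun, as the existing paddle.distributed.launch tooling does.

# Hypothetical stand-in for multi_process.py (not part of this diff): prints
# and logs a line in the format the shell test greps for.
import os
import sys


def train(prefix=""):
    selected_gpus = os.getenv("FLAGS_selected_gpus", "")
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "")
    current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "")
    # Derive the trainer count from the endpoint list (4 in the two-node,
    # two-GPU paddlecloud case exercised above).
    trainers_num = len(worker_endpoints.split(","))

    line = prefix + ("selected_gpus:{} worker_endpoints:{} trainers_num:{} "
                     "current_endpoint:{} trainer_id:{}".format(
                         selected_gpus, worker_endpoints, trainers_num,
                         current_endpoint, trainer_id))

    # The test greps multi_process.check_<rank>.log for this exact line.
    with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
        f.write(line)
    print(line)


if __name__ == "__main__":
    # Passing "abort" reproduces the "abort>>> " prefix checked by the
    # async-poll test; the real script additionally makes trainer 0 exit abnormally.
    abort = len(sys.argv) > 1 and sys.argv[1] == "abort"
    train("abort>>> " if abort else "")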