support parsing ascend rank table file (#31000)
	
		
	
				
					
				
			support parsing ascend rank table file - revert-31562-mean
							parent
							
								
									1201cd2ef2
								
							
						
					
					
						commit
						a6edbc478b
					
				@ -0,0 +1,122 @@
 | 
				
			||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 | 
				
			||||
#
 | 
				
			||||
# Licensed under the Apache License, Version 2.0 (the "License");
 | 
				
			||||
# you may not use this file except in compliance with the License.
 | 
				
			||||
# You may obtain a copy of the License at
 | 
				
			||||
#
 | 
				
			||||
#     http://www.apache.org/licenses/LICENSE-2.0
 | 
				
			||||
#
 | 
				
			||||
# Unless required by applicable law or agreed to in writing, software
 | 
				
			||||
# distributed under the License is distributed on an "AS IS" BASIS,
 | 
				
			||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | 
				
			||||
# See the License for the specific language governing permissions and
 | 
				
			||||
# limitations under the License.
 | 
				
			||||
 | 
				
			||||
import os
 | 
				
			||||
import json
 | 
				
			||||
import paddle
 | 
				
			||||
from paddle.distributed.fleet.launch_utils import get_cluster, logger, get_host_name_ip, DeviceMode
 | 
				
			||||
 | 
				
			||||
def _get_ascend_rankfile(rank_table_file_path):
    """
    Parse an ascend npu rank table file.

    Args:
        rank_table_file_path: path of the ascend npu rank file (json), whose
            content looks like:
            {
                "status": "completed",
                "version": "1.0",
                "server_count": "2",
                "server_list": [
                    {
                        "server_id": "192.168.24.217",
                        "device": [
                            {
                                "device_id": "0",
                                "device_ip": "192.1.184.23",
                                "rank_id": "0"
                            },
                            {
                                "device_id": "1",
                                "device_ip": "192.2.21.93",
                                "rank_id": "1"
                            }
                        ]
                    },
                    {
                        "server_id": "192.168.26.177",
                        "device": [
                            {
                                "device_id": "0",
                                "device_ip": "192.1.94.132",
                                "rank_id": "2"
                            },
                            {
                                "device_id": "1",
                                "device_ip": "192.2.94.30",
                                "rank_id": "3"
                            }
                        ]
                    }
                ]
            }

    Returns:
        node_ips: node ip list, i.e. the "server_id" of every entry of
            "server_list", in file order
        device_count: number of npu per machine; this is the length of the
            "device" list of the last server, or 0 when "server_list" is
            empty

    Raises:
        KeyError: if "server_list" is missing, or a server entry has no
            "server_id" / "device" key.
    """
    with open(rank_table_file_path) as json_file:
        server_list = json.load(json_file)['server_list']

    node_ips = []
    device_count = 0
    for server in server_list:
        node_ips.append(server['server_id'])
        # NOTE: overwritten on every iteration, so the last server decides the
        # returned value; the rank table is presumably homogeneous (same
        # number of devices on every machine) -- confirm with the producer
        # of the file.
        device_count = len(server['device'])

    return node_ips, device_count
 | 
				
			||||
 | 
				
			||||
def get_cloud_cluster(rank_table_file=None,
                      device_mode=DeviceMode.ASCEND_NPU,
                      devices_per_proc=None,
                      start_port=6070):
    """
    Build the cluster description for ascend npu trainers.

    Args:
        rank_table_file: string, ascend npu rank file path. When empty or
            None, a single local trainer on "127.0.0.1" with one device is
            described instead.
        device_mode: DeviceMode(Int)
        devices_per_proc: list of the devices used by each process. Defaults
            to ["0", ..., str(device_count - 1)]. Ignored (reset to None)
            when no rank_table_file is given.
        start_port: the start port of current runtime env; one consecutive
            port is used per entry of devices_per_proc.

    Returns:
        Whatever get_cluster() returns for the computed node ips, local node
        ip, trainer endpoints, device_mode and devices_per_proc.

    Raises:
        AssertionError: if the local node ip is not listed in the rank table.
        IndexError / ValueError: if the PADDLE_TRAINER_ID environment
            variable is out of range for the rank table or not an integer.
    """
    if rank_table_file:
        # multi trainers
        node_ips, device_count = _get_ascend_rankfile(rank_table_file)
        node_index = os.environ.get("PADDLE_TRAINER_ID")
        if node_index is None:
            # No explicit trainer id: fall back to the ip of this host.
            _, node_ip = get_host_name_ip()
        else:
            node_ip = node_ips[int(node_index)]

        assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \
            % (node_ip, node_ips)
    else:
        # single trainer (single ascend card)
        node_ips = ["127.0.0.1"]
        node_ip = node_ips[0]
        device_count = 1
        # A caller-supplied device list is discarded here so that exactly one
        # device ("0") is used below.
        devices_per_proc = None

    if devices_per_proc is None:
        devices_per_proc = [str(x) for x in range(device_count)]

    # One port per local process, starting at start_port.
    free_ports = list(range(start_port, start_port + len(devices_per_proc)))

    # Every node gets the same port list, paired with its own ip.
    trainer_endpoints = [
        ["%s:%d" % (ip, port) for port in free_ports] for ip in node_ips
    ]

    return get_cluster(node_ips, node_ip, trainer_endpoints, device_mode,
                       devices_per_proc)
 | 
				
			||||
@ -0,0 +1,103 @@
 | 
				
			||||
#!/bin/bash
 | 
				
			||||
 | 
				
			||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 | 
				
			||||
#
 | 
				
			||||
# Licensed under the Apache License, Version 2.0 (the "License");
 | 
				
			||||
# you may not use this file except in compliance with the License.
 | 
				
			||||
# You may obtain a copy of the License at
 | 
				
			||||
#
 | 
				
			||||
#     http://www.apache.org/licenses/LICENSE-2.0
 | 
				
			||||
#
 | 
				
			||||
# Unless required by applicable law or agreed to in writing, software
 | 
				
			||||
# distributed under the License is distributed on an "AS IS" BASIS,
 | 
				
			||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | 
				
			||||
# See the License for the specific language governing permissions and
 | 
				
			||||
# limitations under the License.
 | 
				
			||||
 | 
				
			||||
# Abort on the first failing command.
set -e

# Write a two-server / two-devices-per-server rank table into the current
# directory. The delimiter is quoted so the JSON is written verbatim.
RANK_TABLE_FILE_NAME="rank_table_file.json"
cat > "${RANK_TABLE_FILE_NAME}" <<'EOF'
{
    "status": "completed",
    "version": "1.0",
    "server_count": "2",
    "server_list": [
        {
            "server_id": "127.0.0.1",
            "device": [
                {
                    "device_id": "0",
                    "device_ip": "192.1.184.23",
                    "rank_id": "0"
                },
                {
                    "device_id": "1",
                    "device_ip": "192.2.21.93",
                    "rank_id": "1"
                }
            ]
        },
        {
            "server_id": "127.0.0.2",
            "device": [
                {
                    "device_id": "0",
                    "device_ip": "192.1.94.132",
                    "rank_id": "2"
                },
                {
                    "device_id": "1",
                    "device_ip": "192.2.94.30",
                    "rank_id": "3"
                }
            ]
        }
    ]
}
EOF

# set ascend rank table file env
export RANK_TABLE_FILE="${PWD}/${RANK_TABLE_FILE_NAME}"

# use paddlecloud
echo "begin test use paddlecloud"
cluster_node_ips="127.0.0.1,127.0.0.2"
export PADDLE_TRAINERS_NUM=2
export POD_IP=127.0.0.1
export PADDLE_TRAINERS="${cluster_node_ips}"
export PADDLE_TRAINER_ID=0

export PADDLE_PORT=35789
export TRAINER_PORTS_NUM=2

# Launch the trainers; the last argument is the log-file prefix checked below.
distributed_args=(--run_mode=collective --log_dir=testlog)
python -m paddle.distributed.fleet.launch "${distributed_args[@]}" ascend_multi_process_collective.py fleetlaunchascend

# Expected log line of each local trainer (trainer 0 and trainer 1 on 127.0.0.1).
str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1 device_id:0"
str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1 device_id:1"
file_0="multi_process_fleetlaunchascend.check_0.log"
file_1="multi_process_fleetlaunchascend.check_1.log"

echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
    echo "find trainer 0"
else
    echo "not find trainer 0"
    # Exit statuses must be in 0-255; "exit -1" is out of range.
    exit 1
fi

if grep -q "$str2" "$file_1"; then
    echo "find trainer 1"
else
    echo "not find trainer 1"
    exit 1
fi

# remove the log files that were checked above (no error if already gone)
rm -f "$file_0" "$file_1"
 | 
				
			||||
					Loading…
					
					
				
		Reference in new issue