support parsing ascend rank table file (#31000)
support parsing ascend rank table file
revert-31562-mean
parent
1201cd2ef2
commit
a6edbc478b
@ -0,0 +1,122 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import json
|
||||
import paddle
|
||||
from paddle.distributed.fleet.launch_utils import get_cluster, logger, get_host_name_ip, DeviceMode
|
||||
|
||||
def _get_ascend_rankfile(rank_table_file_path):
    """
    Read an Ascend NPU rank table file and summarise its servers.

    Args:
        rank_table_file_path: path of the ascend npu rank table json file,
            whose content looks like:
            {
                "status": "completed",
                "version": "1.0",
                "server_count": "2",
                "server_list": [
                    {
                        "server_id": "192.168.24.217",
                        "device": [
                            {
                                "device_id": "0",
                                "device_ip": "192.1.184.23",
                                "rank_id": "0"
                            },
                            {
                                "device_id": "1",
                                "device_ip": "192.2.21.93",
                                "rank_id": "1"
                            }
                        ]
                    },
                    {
                        "server_id": "192.168.26.177",
                        "device": [
                            {
                                "device_id": "0",
                                "device_ip": "192.1.94.132",
                                "rank_id": "2"
                            },
                            {
                                "device_id": "1",
                                "device_ip": "192.2.94.30",
                                "rank_id": "3"
                            }
                        ]
                    }
                ]
            }

    Returns:
        node_ips: list with the "server_id" of every entry in "server_list",
            in file order
        device_count: length of the "device" list of the LAST server entry
            (0 when "server_list" is empty); earlier servers are not
            compared against it
    """
    with open(rank_table_file_path) as table_fp:
        rank_table = json.load(table_fp)

    # One pass over the servers, reading "server_id" then "device" for each
    # entry, so a malformed entry fails at the same point as a plain loop.
    summaries = [(entry['server_id'], len(entry['device']))
                 for entry in rank_table['server_list']]

    node_ips = [server_id for server_id, _ in summaries]
    device_count = summaries[-1][1] if summaries else 0
    return node_ips, device_count
|
||||
|
||||
def get_cloud_cluster(rank_table_file=None,
                      device_mode=DeviceMode.ASCEND_NPU,
                      devices_per_proc=None,
                      start_port=6070):
    """
    Build the cluster description for an Ascend NPU job via get_cluster.

    Args:
        rank_table_file: string, ascend npu rank file path. When empty or
            None a single local trainer on 127.0.0.1 with one device is
            assumed and devices_per_proc is ignored.
        device_mode: DeviceMode(Int), forwarded to get_cluster
        devices_per_proc: list of devices used by the processes of a node;
            defaults to ["0", ..., str(device_count - 1)]
        start_port: the start port of current runtime env; each node gets
            len(devices_per_proc) consecutive ports starting here

    Returns:
        The result of get_cluster(node_ips, node_ip, trainer_endpoints,
        device_mode, devices_per_proc).
    """
    if not rank_table_file:
        # single trainer (single ascend card): the caller-supplied
        # devices_per_proc is discarded on purpose and rebuilt below.
        node_ips = ["127.0.0.1"]
        node_ip = node_ips[0]
        device_count = 1
        devices_per_proc = None
    else:
        # multi trainers
        node_ips, device_count = _get_ascend_rankfile(rank_table_file)

        trainer_id = os.environ.get("PADDLE_TRAINER_ID")
        if trainer_id is not None:
            # PADDLE_TRAINER_ID indexes into the rank table's server order.
            node_ip = node_ips[int(trainer_id)]
        else:
            # No trainer id exported: use the ip reported for this host.
            _, node_ip = get_host_name_ip()

        assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \
            % (node_ip, node_ips)

    if devices_per_proc is None:
        devices_per_proc = [str(idx) for idx in range(device_count)]

    # Every node exposes the same port window, one port per device entry.
    port_window = list(range(start_port, start_port + len(devices_per_proc)))
    trainer_endpoints = [["%s:%d" % (ip, port) for port in port_window]
                         for ip in node_ips]

    return get_cluster(node_ips, node_ip, trainer_endpoints, device_mode,
                       devices_per_proc)
|
||||
@ -0,0 +1,103 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Abort on the first failing command so a broken launch fails the test.
set -e

# Rank table fixture written into the current working directory.
RANK_TABLE_FILE_NAME="rank_table_file.json"

# Quoted delimiter: the JSON body is written literally, with no shell expansion.
cat > "${RANK_TABLE_FILE_NAME}" <<'EOF'
{
    "status": "completed",
    "version": "1.0",
    "server_count": "2",
    "server_list": [
        {
            "server_id": "127.0.0.1",
            "device": [
                {
                    "device_id": "0",
                    "device_ip": "192.1.184.23",
                    "rank_id": "0"
                },
                {
                    "device_id": "1",
                    "device_ip": "192.2.21.93",
                    "rank_id": "1"
                }
            ]
        },
        {
            "server_id": "127.0.0.2",
            "device": [
                {
                    "device_id": "0",
                    "device_ip": "192.1.94.132",
                    "rank_id": "2"
                },
                {
                    "device_id": "1",
                    "device_ip": "192.2.94.30",
                    "rank_id": "3"
                }
            ]
        }
    ]
}
EOF

# set ascend rank table file env
export RANK_TABLE_FILE="${PWD}/${RANK_TABLE_FILE_NAME}"

# use paddlecloud
echo "begin test use paddlecloud"
cluster_node_ips="127.0.0.1,127.0.0.2"
export PADDLE_TRAINERS_NUM=2
export POD_IP=127.0.0.1
export PADDLE_TRAINERS="${cluster_node_ips}"
export PADDLE_TRAINER_ID=0

export PADDLE_PORT=35789
export TRAINER_PORTS_NUM=2

# Kept as an array so each option stays a separate word without relying on
# unquoted word splitting.
distributed_args=(--run_mode=collective --log_dir=testlog)
python -m paddle.distributed.fleet.launch "${distributed_args[@]}" ascend_multi_process_collective.py fleetlaunchascend

# Lines each local trainer is expected to have written to its check log.
str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1 device_id:0"
str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1 device_id:1"
file_0="multi_process_fleetlaunchascend.check_0.log"
file_1="multi_process_fleetlaunchascend.check_1.log"

# check_trainer_log IDX EXPECTED LOG_FILE
# Succeeds when LOG_FILE contains EXPECTED as a literal substring; otherwise
# reports the missing trainer and terminates the script with status 1.
check_trainer_log() {
    local trainer_idx="$1"
    local expected="$2"
    local log_file="$3"

    # -F: the expected text contains dots (IP addresses) that must match
    # literally, not as regex wildcards.
    if grep -qF -- "${expected}" "${log_file}"; then
        echo "find trainer ${trainer_idx}"
    else
        echo "not find trainer ${trainer_idx}"
        # Exit statuses must be within 0-255; "exit -1" is out of range.
        exit 1
    fi
}

echo "paddlecloud params test"
check_trainer_log 0 "${str1}" "${file_0}"
check_trainer_log 1 "${str2}" "${file_1}"

# Remove the per-trainer check logs once both checks have passed.
for log_file in "${file_0}" "${file_1}"; do
    if [ -f "${log_file}" ]; then
        rm "${log_file}"
    fi
done
|
||||
Loading…
Reference in new issue