# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

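# Multi-process launcher: starts one copy of the training command per local
# GPU and sets the NCCL2 distributed-training environment variables each
# worker needs to discover its peers.
#
# Example (assuming this file is saved as multi_process.py):
#   python multi_process.py --gpus 8 --cmd "python train.py --lr 0.1"
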
from __future__ import print_function

import subprocess
import os
import sys
import time
import argparse

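# Defaults inherited by every worker. PADDLE_TRAINER_ENDPOINTS here covers the
# single-node, 8-GPU case (ports 6170-6177); start_procs overwrites it and
# PADDLE_TRAINERS_NUM below with the actual job size.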
default_envs = {
    "PADDLE_TRAINER_ENDPOINTS":
    "127.0.0.1:6170,127.0.0.1:6171,127.0.0.1:6172,127.0.0.1:6173,127.0.0.1:6174,127.0.0.1:6175,127.0.0.1:6176,127.0.0.1:6177",
    "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
    "PATH": os.getenv("PATH"),
    "LD_PRELOAD": os.getenv("LD_PRELOAD", ""),
    "PADDLE_TRAINERS_NUM": "8",
    "NCCL_DEBUG": "INFO",
    "GLOG_v": "0",
    "NCCL_SOCKET_IFNAME": "eth0",
    "NCCL_IB_GID_INDEX": "3",
    "NCCL_IB_RETRY_CNT": "0",
}

GPUS = 8

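# Start one worker process per local GPU, give each a unique global rank and
# endpoint, then wait for all of them to finish.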
def start_procs(gpus, cmd, log_dir):
    procs = []
    log_fns = []
    # Create the log directory if needed (portable replacement for
    # os.system("mkdir -p ...")).
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    # ======== update parent envs =======
    # Forward FLAGS_*, NCCL_* and GLOG_* settings from the parent environment
    # so they override the defaults above.
    for k, v in os.environ.items():
        if k.startswith("FLAGS_") or k.startswith("NCCL_") or \
                k.startswith("GLOG_"):
            default_envs[k] = v

    # ======== for dist training =======
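    # Derive this node's place in the job: PADDLE_TRAINER_ID gives the node
    # index and PADDLE_TRAINERS the comma-separated list of node IPs.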
    node_trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    current_ip = os.getenv("POD_IP", "127.0.0.1")
    trainer_ips = os.getenv("PADDLE_TRAINERS", current_ip).split(",")
    num_nodes = len(trainer_ips)
    # Build the comma-separated list of every worker endpoint in the job:
    # one "ip:617<gpu_index>" entry per GPU on every node.
    all_nodes_devices_endpoints = ""
    for n in trainer_ips:
        for i in range(gpus):
            if all_nodes_devices_endpoints:
                all_nodes_devices_endpoints += ","
            all_nodes_devices_endpoints += "%s:617%d" % (n, i)
    nranks = num_nodes * gpus
    # ======== for dist training =======

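    # Launch the workers. The "617%d" port scheme assumes at most ten GPUs
    # per node (ports 6170-6179).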
    for i in range(gpus):
        curr_env = {}
        curr_env.update(default_envs)
        curr_env.update({
            "FLAGS_selected_gpus": "%d" % i,
            # Global rank of this worker across all nodes.
            "PADDLE_TRAINER_ID": "%d" % (node_trainer_id * gpus + i),
            "PADDLE_CURRENT_ENDPOINT": "%s:617%d" % (current_ip, i),
            # Despite the name, this carries nranks, the total worker count.
            "PADDLE_TRAINERS_NUM": "%d" % nranks,
            "PADDLE_TRAINER_ENDPOINTS": all_nodes_devices_endpoints
        })

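        # Per-worker log file: stdout and stderr both go to workerlog.<i>.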
print("starting process ", i, cmd, curr_env)
|
||
|
fn = open("%s/workerlog.%d" % (log_dir, i), "w")
|
||
|
log_fns.append(fn)
|
||
|
procs.append(
|
||
|
subprocess.Popen(
|
||
|
cmd.strip().split(" "), stdout=fn, stderr=fn, env=curr_env))
|
||
|
|
||
|
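    # Wait for every worker to exit; swallow errors from workers that already
    # died so the remaining ones are still reaped.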
    for i in range(gpus):
        try:
            procs[i].communicate()
            procs[i].terminate()
            log_fns[i].close()
        except Exception:
            pass


def main():
    parser = argparse.ArgumentParser(
        description='''Start PaddlePaddle training in multi-process mode.
NOTE: your train program ***must*** run in distributed nccl2 mode,
see: http://www.paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
Your train program must also read the environment variables below so that
each process can initialize properly:
FLAGS_selected_gpus
PADDLE_TRAINER_ID
PADDLE_CURRENT_ENDPOINT
PADDLE_TRAINERS_NUM
PADDLE_TRAINER_ENDPOINTS
POD_IP (current node ip address, not needed for local training)
''')
    parser.add_argument(
        '--gpus',
        type=int,
        default=8,
        help='number of processes to start, one per GPU')
    parser.add_argument(
        '--cmd',
        type=str,
        default="",
        help='command to run for each process, e.g. python train.py --lr 0.1')
    parser.add_argument(
        '--log_dir',
        type=str,
        default="mylog",
        help='directory to put logs per process.')
    args = parser.parse_args()
    if args.cmd == "":
        parser.print_help()
        sys.exit(0)
    start_procs(args.gpus, args.cmd, args.log_dir)


if __name__ == "__main__":
    main()