Paddle/tools/run_mp.py

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import subprocess
import os
import sys
import time
import argparse

default_envs = {
    "PADDLE_TRAINER_ENDPOINTS":
    "127.0.0.1:6170,127.0.0.1:6171,127.0.0.1:6172,127.0.0.1:6173,127.0.0.1:6174,127.0.0.1:6175,127.0.0.1:6176,127.0.0.1:6177",
    "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
    "PATH": os.getenv("PATH"),
    "LD_PRELOAD": os.getenv("LD_PRELOAD", ""),
    "PADDLE_TRAINERS_NUM": "8",
    "NCCL_DEBUG": "INFO",
    "GLOG_v": "0",
    "NCCL_SOCKET_IFNAME": "eth0",
    "NCCL_IB_GID_INDEX": "3",
    "NCCL_IB_RETRY_CNT": "0",
}

GPUS = 8


def start_procs(gpus, cmd, log_dir):
    procs = []
    log_fns = []
    os.system("mkdir -p %s" % log_dir)
    # ======== update parent envs =======
    for k, v in os.environ.items():
        if k.startswith("FLAGS_") or k.startswith("NCCL_") or \
            k.startswith("GLOG_"):
            default_envs[k] = v

    # ======== for dist training =======
    node_trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    current_ip = os.getenv("POD_IP", "127.0.0.1")
    trainer_ips = os.getenv("PADDLE_TRAINERS", current_ip).split(",")
    num_nodes = len(trainer_ips)
    all_nodes_devices_endpoints = ""
    for n in trainer_ips:
        for i in range(gpus):
            if all_nodes_devices_endpoints:
                all_nodes_devices_endpoints += ","
            all_nodes_devices_endpoints += "%s:617%d" % (n, i)
    nranks = num_nodes * gpus
    # ======== for dist training =======

    for i in range(gpus):
        curr_env = {}
        curr_env.update(default_envs)
        curr_env.update({
            "FLAGS_selected_gpus": "%d" % i,
            "PADDLE_TRAINER_ID": "%d" % (node_trainer_id * gpus + i),
            "PADDLE_CURRENT_ENDPOINT": "%s:617%d" % (current_ip, i),
            # nranks
            "PADDLE_TRAINERS_NUM": "%d" % nranks,
            "PADDLE_TRAINER_ENDPOINTS": all_nodes_devices_endpoints
        })

        print("starting process ", i, cmd, curr_env)
        fn = open("%s/workerlog.%d" % (log_dir, i), "w")
        log_fns.append(fn)
        procs.append(
            subprocess.Popen(
                cmd.strip().split(" "), stdout=fn, stderr=fn, env=curr_env))

    for i in range(gpus):
        try:
            procs[i].communicate()
            procs[i].terminate()
            log_fns[i].close()
        except:
            pass


def main():
    parser = argparse.ArgumentParser(
        description='''start paddle training using multi-process mode.
NOTE: your train program ***must*** run as distributed nccl2 mode,
see: http://www.paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
And your train program must read environment variables below in order to let different
process init properly:
FLAGS_selected_gpus
PADDLE_TRAINER_ID
PADDLE_CURRENT_ENDPOINT
PADDLE_TRAINERS_NUM
PADDLE_TRAINER_ENDPOINTS
POD_IP (current node ip address, not needed for local training)
''')
    parser.add_argument(
        '--gpus',
        type=int,
        default=8,
        help='start number of processes for every gpu')
    parser.add_argument(
        '--cmd',
        type=str,
        default="",
        help='command to run for each process, e.g. python train.py --lr 0.1')
    parser.add_argument(
        '--log_dir',
        type=str,
        default="mylog",
        help='directory to put logs per process.')
    args = parser.parse_args()
    if args.cmd == "":
        parser.print_help()
        exit(0)
    start_procs(args.gpus, args.cmd, args.log_dir)


if __name__ == "__main__":
    main()
add multi process start script (#15381) * add multi process start script test=develop * refine tool test=develop 6 years ago			`# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`from __future__ import print_function`

			`import subprocess`
			`import os`
			`import sys`
			`import time`
			`import argparse`

			`default_envs = {`
			`"PADDLE_TRAINER_ENDPOINTS":`
			`"127.0.0.1:6170,127.0.0.1:6171,127.0.0.1:6172,127.0.0.1:6173,127.0.0.1:6174,127.0.0.1:6175,127.0.0.1:6176,127.0.0.1:6177",`
			`"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),`
			`"PATH": os.getenv("PATH"),`
			`"LD_PRELOAD": os.getenv("LD_PRELOAD", ""),`
			`"PADDLE_TRAINERS_NUM": "8",`
			`"NCCL_DEBUG": "INFO",`
			`"GLOG_v": "0",`
			`"NCCL_SOCKET_IFNAME": "eth0",`
			`"NCCL_IB_GID_INDEX": "3",`
			`"NCCL_IB_RETRY_CNT": "0",`
			`}`

			`GPUS = 8`


			`def start_procs(gpus, cmd, log_dir):`
			`procs = []`
			`log_fns = []`
			`os.system("mkdir -p %s" % log_dir)`
			`# ======== update parent envs =======`
			`for k, v in os.environ.items():`
			`if k.startswith("FLAGS_") or k.startswith("NCCL_") or \`
			`k.startswith("GLOG_"):`
			`default_envs[k] = v`

			`# ======== for dist training =======`
			`node_trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))`
			`current_ip = os.getenv("POD_IP", "127.0.0.1")`
			`trainer_ips = os.getenv("PADDLE_TRAINERS", current_ip).split(",")`
			`num_nodes = len(trainer_ips)`
			`all_nodes_devices_endpoints = ""`
			`for n in trainer_ips:`
			`for i in range(gpus):`
			`if all_nodes_devices_endpoints:`
			`all_nodes_devices_endpoints += ","`
			`all_nodes_devices_endpoints += "%s:617%d" % (n, i)`
			`nranks = num_nodes * gpus`
			`# ======== for dist training =======`

			`for i in range(gpus):`
			`curr_env = {}`
			`curr_env.update(default_envs)`
			`curr_env.update({`
			`"FLAGS_selected_gpus": "%d" % i,`
			`"PADDLE_TRAINER_ID": "%d" % (node_trainer_id * gpus + i),`
			`"PADDLE_CURRENT_ENDPOINT": "%s:617%d" % (current_ip, i),`
			`# nranks`
			`"PADDLE_TRAINERS_NUM": "%d" % nranks,`
			`"PADDLE_TRAINER_ENDPOINTS": all_nodes_devices_endpoints`
			`})`

			`print("starting process ", i, cmd, curr_env)`
			`fn = open("%s/workerlog.%d" % (log_dir, i), "w")`
			`log_fns.append(fn)`
			`procs.append(`
			`subprocess.Popen(`
			`cmd.strip().split(" "), stdout=fn, stderr=fn, env=curr_env))`

			`for i in range(gpus):`
			`try:`
			`procs[i].communicate()`
			`procs[i].terminate()`
			`log_fns[i].close()`
			`except:`
			`pass`


			`def main():`
			`parser = argparse.ArgumentParser(`
			`description='''start paddle training using multi-process mode.`
			`NOTE: your train program *must* run as distributed nccl2 mode,`
			`see: http://www.paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-`
			`And your train program must read environment variables below in order to let different`
			`process init properly:`
			`FLAGS_selected_gpus`
			`PADDLE_TRAINER_ID`
			`PADDLE_CURRENT_ENDPOINT`
			`PADDLE_TRAINERS_NUM`
			`PADDLE_TRAINER_ENDPOINTS`
			`POD_IP (current node ip address, not needed for local training)`
			`''')`
			`parser.add_argument(`
			`'--gpus',`
			`type=int,`
			`default=8,`
			`help='start number of processes for every gpu')`
			`parser.add_argument(`
			`'--cmd',`
			`type=str,`
			`default="",`
			`help='command to run for each process, e.g. python train.py --lr 0.1')`
			`parser.add_argument(`
			`'--log_dir',`
			`type=str,`
			`default="mylog",`
			`help='directory to put logs per process.')`
			`args = parser.parse_args()`
			`if args.cmd == "":`
			`parser.print_help()`
			`exit(0)`
			`start_procs(args.gpus, args.cmd, args.log_dir)`


			`if __name__ == "__main__":`
			`main()`