Paddle/python/paddle/distributed/launch.py

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
paddle.distributed.launch is a module that spawns multiple distributed
process on each trainning node for gpu trainning.
Usage:
In both of single node training or multiple node training, this module
launch a process on each of the given gpu card.
1. for single node trainning with all visible gpu cards:
python -m paddle.distributed.launch \
your_training_py (arg1 arg2 and all others)
2. for single node trainning with [0,4) cards
python -m paddle.distributed.launch --selected_gpus="0,1,2,3" \
your_training_py (arg1 arg2 and all others)
3. for mulitple node training such as two node:192.168.0.16, 192.168.0.17
on 192.168.0.16:
python -m paddle.distributed.launch --cluster_node_ips="192.168.0.16,192.168.0.17" \
--node_ip=192.168.0.16 \
your_training_py (arg1 arg2 and all others)
on 192.168.0.17:
python -m paddle.distributed.launch --cluster_node_ips="192.168.0.16,192.168.0.17" \
--node_ip=192.168.0.17 \
your_training_py (arg1 arg2 and all others)
"""
from __future__ import print_function
import sys
from sys import version
import subprocess
import os
import six
import copy
from argparse import ArgumentParser, REMAINDER
import paddle.fluid as fluid


def _print_arguments(args):
    print("----------- Configuration Arguments -----------")
    for arg, value in sorted(six.iteritems(vars(args))):
        print("%s: %s" % (arg, value))
    print("------------------------------------------------")


def _parse_args():
    """
    Helper function parsing the command line options
    @retval ArgumentParser
    """
    parser = ArgumentParser(
        description='''start paddle training using multi-process mode.
NOTE: your train program ***must*** run as distributed nccl2 mode,
see: http://www.paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
And your train program must read the environment variables below in order to let the different
processes init properly:
FLAGS_selected_gpus
PADDLE_TRAINER_ID
PADDLE_CURRENT_ENDPOINT
PADDLE_TRAINERS_NUM
PADDLE_TRAINER_ENDPOINTS
POD_IP (current node ip address, not needed for local training)
''')

    # Optional arguments for the launch helper
    parser.add_argument(
        "--cluster_node_ips",
        type=str,
        default="127.0.0.1",
        help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..")
    parser.add_argument(
        "--node_ip",
        type=str,
        default="127.0.0.1",
        help="The current node ip. ")
    parser.add_argument(
        "--started_port",
        type=int,
        default=6170,
        help="The trainer's started port on a single node")
    parser.add_argument(
        "--print_config",
        type=bool,
        default=True,
        help="Print the config or not")
    parser.add_argument(
        "--selected_gpus",
        type=str,
        default=None,
        help="The gpus used for training; each training process is bound to a single GPU. "
        "If it is not set, this module will use all the gpu cards on the node for training."
    )
    parser.add_argument(
        "--log_dir",
        type=str,
        help="The path for each process's log. If it is not set, the log will be printed to the default pipe."
    )

    # positional
    parser.add_argument(
        "training_script",
        type=str,
        help="The full path to the single GPU training "
        "program/script to be launched in parallel, "
        "followed by all the arguments for the "
        "training script")

    # rest from the training program
    parser.add_argument('training_script_args', nargs=REMAINDER)
    return parser.parse_args()


def start_procs(args):
    """
    Spawn one training process per selected GPU on the current node and
    wait for all of them to finish.
    """
    procs = []
    log_fns = []
    default_env = os.environ.copy()

    current_node_ip = args.node_ip
    node_ips = [x.strip() for x in args.cluster_node_ips.split(',')]
    node_id = node_ips.index(current_node_ip)
    num_nodes = len(node_ips)

    if args.selected_gpus is None:
        gpus_num = fluid.core.get_cuda_device_count()
        selected_gpus = [str(x) for x in range(0, gpus_num)]
    else:
        selected_gpus = [x.strip() for x in args.selected_gpus.split(',')]
    selected_gpus_num = len(selected_gpus)
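
    # Build the comma-separated list of trainer endpoints across all nodes:
    # every node contributes one "ip:port" entry per selected GPU, in rank order.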
    trainers_endpoints = ""
    for ip in node_ips:
        for i in range(selected_gpus_num):
            if trainers_endpoints != "":
                trainers_endpoints += ","
            trainers_endpoints += "%s:%d" % (ip, args.started_port + i)

    nranks = num_nodes * selected_gpus_num

    if args.print_config:
        print("trainers_endpoints:", trainers_endpoints, ", node_id:", node_id,
              ", current_node_ip:", current_node_ip, ", num_nodes:", num_nodes,
              ", node_ips:", node_ips, ", nranks:", nranks)
    current_env = copy.copy(default_env)
    # paddle broadcasts ncclUniqueId over sockets, and a proxy may make the
    # trainers unreachable, so remove the proxy settings. Setting them to ""
    # makes grpc log a "bad uri" error, so delete them entirely instead.
    current_env.pop("http_proxy", None)
    current_env.pop("https_proxy", None)

    procs = []
    cmds = []
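
    # Launch one worker per selected GPU. Each process learns its rank and the
    # cluster layout through the PADDLE_* environment variables set below.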
    for i in range(0, selected_gpus_num):
        current_env.update({
            "FLAGS_selected_gpus": "%s" % selected_gpus[i],
            "PADDLE_TRAINER_ID": "%d" % (node_id * selected_gpus_num + i),
            "PADDLE_CURRENT_ENDPOINT":
            "%s:%d" % (current_node_ip, args.started_port + i),
            "PADDLE_TRAINERS_NUM": "%d" % nranks,
            "PADDLE_TRAINER_ENDPOINTS": trainers_endpoints
        })

        cmd = [sys.executable, "-u", args.training_script
               ] + args.training_script_args
        cmds.append(cmd)

        if args.log_dir is not None:
            os.system("mkdir -p {}".format(args.log_dir))
            fn = open("%s/workerlog.%d" % (args.log_dir, i), "w")
            log_fns.append(fn)
            proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
        else:
            proc = subprocess.Popen(cmd, env=current_env)

        procs.append(proc)
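
    # Wait for every worker to exit; close its log file (if any) and surface a
    # non-zero exit code to the caller as a CalledProcessError.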
    for i in range(0, len(procs)):
        proc = procs[i]

        proc.wait()
        if len(log_fns) > 0:
            log_fns[i].close()

        if proc.returncode != 0:
            raise subprocess.CalledProcessError(
                returncode=procs[i].returncode, cmd=cmds[i])


def launch():
    args = _parse_args()
    if args.print_config:
        _print_arguments(args)
    start_procs(args)


if __name__ == "__main__":
    launch()