Paddle/python/paddle/fluid/incubate/fleet/collective/__init__.py


# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import paddle.fluid as fluid
import paddle.fluid.io as io
import paddle.fluid.transpiler.distribute_transpiler as dist_transpiler
from paddle.fluid.executor import Executor
from paddle.fluid.parallel_executor import ParallelExecutor
from paddle.fluid.compiler import CompiledProgram
from paddle.fluid.framework import Program
from paddle.fluid.incubate.fleet.base.fleet_base import Fleet
from paddle.fluid.incubate.fleet.base.fleet_base import Mode
from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
from paddle.fluid import compiler
from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel, CheckpointSaver
import paddle
import os
import sys
import six
import json
import re
import shutil
class LambConfig(object):
def __init__(self):
pass
class DistFCConfig(object):
def __init__(self):
pass
class Collective(Fleet):
def __init__(self):
super(Collective, self).__init__(Mode.COLLECTIVE)
self._local_ip = 0
self.startup_program = None
self._origin_program = None
self._transpiled_program = None
self.main_program = None
self._checkpoint_prefix = "__paddle_fleet_checkpoint__"
self._param_file_name = "_paddle_fleet_param__"
def init_worker(self):
logging.warn(
"You should not call the 'init_worker' method in collective mode.")
def run_worker(self, main_programs=None, scopes=None):
logging.warn(
"You should not call the 'run_worker' method in collective mode.")
def init_server(self, model_dir=None):
logging.warn(
"You should not call the 'init_server' method in collective mode.")
def run_server(self):
logging.warn(
"You should not call the 'run_server' method in collective mode.")
def stop_worker(self):
logging.warn(
"You should not call the 'stop_worker' method in collective mode.")
def distributed_optimizer(self, optimizer, strategy=None):
self._optimizer = \
CollectiveOptimizer(optimizer, strategy)
return self._optimizer
def save_inference_model(self,
executor,
dirname,
feeded_var_names=None,
target_vars=None,
main_program=None,
export_for_deployment=True):
"""
Prune the given `main_program` to build a new program especially for
inference, and then save it and all related parameters to the given
`dirname` using the `executor`.
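Examples:
    A minimal usage sketch; the feed name "image" and the fetch
    variable `prediction` are placeholders for whatever the user's
    network actually defines:

    .. code-block:: python

        exe = fluid.Executor(fluid.CUDAPlace(0))
        fleet.save_inference_model(
            exe,
            dirname="./inference_model",
            feeded_var_names=["image"],
            target_vars=[prediction])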
"""
assert isinstance(executor, Executor), \
"In fleet.save_inference_model() function, executor must be " \
"an Executor instance."
if main_program is None:
main_program = self._origin_program
assert isinstance(main_program, Program), \
"In fleet.save_inference_model() function, main_program " \
"must be a Program instance."
io.save_inference_model(dirname, feeded_var_names, target_vars,
executor, main_program, None, None,
export_for_deployment)
def save_persistables(self,
executor,
dirname,
main_program=None,
filename=None):
"""
This function filters out all variables with `persistable==True` from
the given `main_program` and then saves these variables to the folder
`dirname` or the file `filename`.
The `dirname` is used to specify the folder where persistable variables
are going to be saved. If you would like to save variables in separate
files, set `filename` to None; if you would like to save all variables in
a single file, use `filename` to specify the file name.
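Examples:
    A minimal usage sketch, assuming the distributed optimizer's
    minimize() has already been called so that `fleet._origin_program`
    is set:

    .. code-block:: python

        exe = fluid.Executor(fluid.CUDAPlace(0))
        # one file per persistable variable
        fleet.save_persistables(exe, "./checkpoint_dir")
        # or everything packed into a single file
        fleet.save_persistables(exe, "./checkpoint_dir", filename="params")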
"""
assert isinstance(executor, Executor), \
"In fleet.save_persistables() function, executor must be " \
"an Executor instance."
if main_program is None:
main_program = self._origin_program
assert isinstance(main_program, Program), \
"In fleet.save_persistables() function, main_program " \
"must be a Program instance."
io.save_persistables(executor, dirname, main_program, filename=filename)
def save_checkpoint(self,
executor,
path,
trainer_id,
train_status,
fs,
main_program=None,
local_cache_path=".cache",
remain_all_checkpoint=True):
"""
This function saves the persistable variables and the current training
status (e.g. the epoch number) to `path`.
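Examples:
    A schematic sketch only; `local_fs` stands for a filesystem wrapper
    accepted by CheckpointSaver and `status` for the caller's
    train-status object, both of which come from the user's own setup:

    .. code-block:: python

        real_path, ckpt_no = fleet.save_checkpoint(
            exe, "./ckpt",
            trainer_id=fleet.worker_index(),
            train_status=status,
            fs=local_fs)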
"""
if main_program is None:
main_program = self._transpiled_program
m = PaddleModel(executor, main_program)
t = train_status
c = CheckpointSaver(fs)
real_path, checkpoint_no = c.save_checkpoint(
path=path,
slists=[m, t],
trainer_id=trainer_id,
local_cache_path=local_cache_path)
if not remain_all_checkpoint:
c.clean_redundant_checkpoints(path)
return real_path, checkpoint_no
def load_checkpoint(self,
executor,
path,
trainer_id,
train_status,
fs,
main_program=None,
local_cache_path=".cache",
ignore_empty=True):
"""
This function loads the persistable variables and the current training status from `path`.
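Examples:
    A schematic sketch mirroring save_checkpoint(); `local_fs` and
    `status` are assumed to come from the user's own setup:

    .. code-block:: python

        fleet.load_checkpoint(
            exe, "./ckpt",
            trainer_id=fleet.worker_index(),
            train_status=status,
            fs=local_fs)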
"""
if main_program is None:
main_program = self._transpiled_program
m = PaddleModel(executor, main_program)
c = CheckpointSaver(fs)
return c.load_checkpoint(
path, [m, train_status],
trainer_id=trainer_id,
ignore_empty=ignore_empty,
local_cache_path=local_cache_path)
fleet = Collective()
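# The module-level `fleet` singleton above is what user code imports.
# A minimal sketch (assuming PaddleCloudRoleMaker is available in the
# user's environment):
#
#   from paddle.fluid.incubate.fleet.collective import fleet
#   from paddle.fluid.incubate.fleet.base import role_maker
#
#   fleet.init(role_maker.PaddleCloudRoleMaker())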
class DistributedStrategy(fluid.BuildStrategy):
"""
The distributed training strategy for collective mode, built on top of
fluid.BuildStrategy. All options are plain attributes set in __init__.
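Examples:
    A minimal configuration sketch using only the attributes defined
    in __init__ below (`checkpoint_vars` is a placeholder for a
    user-chosen list of recompute checkpoint variables):

    .. code-block:: python

        strategy = DistributedStrategy()
        strategy.nccl_comm_num = 2
        strategy.forward_recompute = True
        strategy.recompute_checkpoints = checkpoint_vars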
"""
def __init__(self):
super(DistributedStrategy, self).__init__()
self.use_local_sgd = False
self.use_dist_fc = False
self.dist_fc_config = None # DistFCConfig
self.mode = "nccl2" # or collective
self.collective_mode = None # local_sgd or grad_allreduce
self.nccl_comm_num = 1
self.forward_recompute = False # use RecomputeOptimizer
self.recompute_checkpoints = []
self.use_amp = False # use mixed precision optimizer
self.amp_loss_scaling = 2**15
self.exec_strategy = fluid.ExecutionStrategy()
# configurations below are used for unit test
self._ut4grad_allreduce = False
class CollectiveOpBasedOptimizer(DistributedOptimizer):
"""
Base class for collective-op based distributed optimizers.
This class is not visible to users.
"""
def __init__(self, optimizer, strategy=None):
assert isinstance(
strategy,
DistributedStrategy), "strategy must be DistributedStrategy"
super(CollectiveOpBasedOptimizer, self).__init__(optimizer, strategy)
def backward(self,
loss,
startup_program=None,
parameter_list=None,
no_grad_set=None,
callbacks=None):
return self._optimizer.backward(loss, startup_program, parameter_list,
no_grad_set, callbacks)
def apply_gradients(self, params_grads):
return self._optimizer.apply_gradients(params_grads)
class CollectiveOptimizer(DistributedOptimizer):
"""
CollectiveOptimizer is a wrapper for a paddle.fluid.optimizer.
A user should pass a paddle.fluid.optimizer instance to it; only the
minimize() interface is implemented.
CollectiveOptimizer is the starting point for a user who wants to
run distributed training. The optimization information is stored in
the Fleet() instance, which holds the global information about the
current distributed training job.
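Examples:
    A minimal sketch, assuming the fleet role maker has already been
    initialized and `avg_cost` is the loss variable of the user's
    network:

    .. code-block:: python

        optimizer = fluid.optimizer.SGD(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(
            optimizer, strategy=DistributedStrategy())
        optimizer.minimize(avg_cost)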
"""
def __init__(self, optimizer, strategy=None):
if strategy is None:
strategy = DistributedStrategy()
super(CollectiveOptimizer, self).__init__(optimizer, strategy)
self._forward_recompute = strategy.forward_recompute
if (not isinstance(strategy.recompute_checkpoints, list)):
raise ValueError("DistStrategy.recompute_checkpoints should"
"be a List")
self._recompute_checkpoints = strategy.recompute_checkpoints
self._use_amp = strategy.use_amp
self._amp_loss_scaling = strategy.amp_loss_scaling
self.print_config = False
def backward(self,
loss,
startup_program=None,
parameter_list=None,
no_grad_set=None,
callbacks=None):
return self._optimizer.backward(loss, startup_program, parameter_list,
no_grad_set, callbacks)
def apply_gradients(self, params_grads):
return self._optimizer.apply_gradients(params_grads)
def _check_condition(self, name, **kwargs):
for k, v in six.iteritems(kwargs):
if v is True:
assert False, "you can't use %s and %s together" % (name, k)
def _check_collective_mode(self, main_program, optimizer, strategy):
"""
Check the conflict conditions.
"""
if strategy.use_local_sgd:
strategy.mode = "collective"
strategy.collective_mode = "local_sgd"
self._check_condition(
"use_local_sgd",
use_dgc=main_program._enable_dgc,
use_dist_fc=strategy.use_dist_fc,
use_lamb=main_program._use_lamb)
if strategy.use_dist_fc:
self._check_condition(
"use_dist_fc",
use_dgc=main_program._enable_dgc,
use_local_sgd=strategy.use_local_sgd,
use_lamb=main_program._use_lamb)
assert strategy.dist_fc_config is not None, "DistributedStrategy.dist_fc_config should be set"
if strategy._ut4grad_allreduce:
strategy.mode = "collective"
strategy.collective_mode = "grad_allreduce"
self._check_condition(
"_ut4grad_allreduce",
use_dgc=main_program._enable_dgc,
use_lamb=main_program._use_lamb)
if self._strategy.collective_mode == "local_sgd" \
or self._strategy.collective_mode == "grad_allreduce":
assert self._strategy.mode == "collective", \
"local_sgd and grad_allreduce can only be used under collective mode"
def _transpile(self, startup_program, main_program):
"""
Transpile the programs into distributed programs and add the required variables.
"""
worker_endpoints = fleet.worker_endpoints()
trainer_id = fleet.worker_index()
current_endpoint = fleet.worker_endpoints()[trainer_id]
worker_endpoints_env = ','.join(worker_endpoints)
trainers_num = fleet.worker_num()
if self.print_config:
print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
trainer_id:{}".format(worker_endpoints, trainers_num,
current_endpoint, trainer_id))
# call transpiler
config = dist_transpiler.DistributeTranspilerConfig()
config.mode = self._strategy.mode
config.collective_mode = self._strategy.collective_mode
config.nccl_comm_num = self._strategy.nccl_comm_num
config.use_hierarchical_allreduce = self._strategy.use_hierarchical_allreduce
config.hierarchical_allreduce_inter_nranks = self._strategy.hierarchical_allreduce_inter_nranks
t = dist_transpiler.DistributeTranspiler(config=config)
t.transpile(
trainer_id=trainer_id,
trainers=worker_endpoints_env,
startup_program=startup_program,
program=main_program,
current_endpoint=current_endpoint)
def _get_node_ips_from_endpoints(self, endpoints):
ss = set()
ips = []
for ep in endpoints:
ip = ep.split(":")[0].strip()
if ip not in ss:
ss.add(ip)
ips.append(ip)
return ips
def _node_num(self):
worker_endpoints = fleet.worker_endpoints()
node_ips = self._get_node_ips_from_endpoints(worker_endpoints)
return len(node_ips)
def _try_to_compile(self, startup_program, main_program):
node_num = self._node_num()
assert node_num >= 1, "nccl2 node_num must >= 1, now:{}".format(node_num)
exec_strategy = self._strategy.exec_strategy
if node_num <= 1:
if self._strategy.nccl_comm_num > 1:
logging.warn("set nccl_comm_num=1 since you only have 1 node.")
self._strategy.nccl_comm_num = 1
if self._strategy.use_hierarchical_allreduce:
logging.warn(
"set use_hierarchical_allreduce=False since you only have 1 node."
)
self._strategy.use_hierarchical_allreduce = False
sync_allreduce = os.getenv("FLAGS_sync_nccl_allreduce")
if sync_allreduce is None or sync_allreduce == "1":
exec_strategy.num_threads = self._strategy.nccl_comm_num + 1
if self._strategy.use_hierarchical_allreduce:
exec_strategy.num_threads = 2 * self._strategy.nccl_comm_num + 1
if exec_strategy.num_threads > 4:
logging.warn(
"if you use use_hierarchical_allreduce or "
"with multi nccl comm, please export FLAGS_sync_nccl_allreduce = 0"
)
# NOTE: sync_batch_norm will hang when num_threads > 1
sync_batch_norm = self._strategy.sync_batch_norm
if sync_batch_norm is not None and sync_batch_norm is True:
self._strategy.nccl_comm_num = 1
self._strategy.use_hierarchical_allreduce = False
exec_strategy.num_threads = 1
logging.warn(
"use sync_batch_norm will hang when set num_threads > 1, so "
"set num_threads=1, nccl_comm_num=1, use_hierarchical_allreduce=False."
)
if self.print_config:
print("node_num:", node_num, "num_threads:",
exec_strategy.num_threads, "use_hierarchical_allreduce:",
self._strategy.use_hierarchical_allreduce, "nccl_comm_num:",
self._strategy.nccl_comm_num, "FLAGS_sync_nccl_allreduce:",
sync_allreduce)
self._transpile(startup_program, main_program)
if self._strategy.mode == "collective":
return main_program
self._strategy.num_trainers = fleet.worker_num()
self._strategy.trainer_id = fleet.worker_index()
self._strategy.trainers_endpoints = fleet.worker_endpoints()
self._strategy.enable_backward_optimizer_op_deps = True
self._compiled_program = compiler.CompiledProgram(main_program)
self._compiled_program.with_data_parallel(
loss_name=self._loss.name,
build_strategy=self._strategy,
exec_strategy=self._strategy.exec_strategy,
share_vars_from=None)
return self._compiled_program
def raiseOptimizeError(self, strategy_name, optimize_name):
raise ValueError("can not use {0} when you set DistStrategy.{1} "
"as True".format(optimize_name, strategy_name))
def minimize(self,
loss,
startup_program=None,
parameter_list=None,
no_grad_set=None):
"""
Minimize a program through the loss.
Args:
loss (Variable|Variable List): loss variable or loss variable list to run optimization.
startup_program (Program): startup_program for initializing parameters
in `parameter_list`.
parameter_list (list): list of Variables to update.
no_grad_set (set|None): set of Variables that should be ignored.
Returns:
tuple: (optimize_ops, params_grads), which are the list of operators appended
and the list of (param, grad) Variable pairs for optimization.
Note that in parameter server mode, a worker will not get anything about optimize_ops
because the optimizer algorithms run on the pserver side. We will make this usable in the
pserver process, but currently the optimization part is written into Fleet(). A user does
not need to care about how to start up a pserver node.
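Examples:
    A minimal sketch, assuming `avg_cost` is the loss of the user's
    network and `dist_optimizer` is the CollectiveOptimizer returned
    by fleet.distributed_optimizer():

    .. code-block:: python

        optimize_ops, params_grads = dist_optimizer.minimize(avg_cost)
        exe = fluid.Executor(fluid.CUDAPlace(0))
        exe.run(fleet.startup_program)
        # training then runs on fleet.main_program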
"""
# check optimizer conflicts
if self._forward_recompute:
if self._recompute_checkpoints == []:
raise ValueError("please set strategy.recompute_checkpoints"
"when set strategy.forward_recompute as True")
if self._optimizer.__class__.__name__ in [
"RecomputeOptimizer", "OptimizerWithMixedPrecision"
]:
self.raiseOptimizeError("forward_recompute",
self._optimizer.__class__.__name__)
self._optimizer = \
fluid.optimizer.RecomputeOptimizer(self._optimizer)
self._optimizer._set_checkpoints(self._recompute_checkpoints)
if self._use_amp:
if self._optimizer.__class__.__name__ in [
"OptimizerWithMixedPrecision", "DGCMomentumOptimizer"
]:
self.raiseOptimizeError("mixed_precision",
self._optimizer.__class__.__name__)
self._optimizer = fluid.contrib.mixed_precision.decorate(
self._optimizer,
init_loss_scaling=self._amp_loss_scaling,
use_dynamic_loss_scaling=True)
main_program = loss.block.program
if startup_program is None:
startup_program = fluid.default_startup_program()
fleet.startup_program = startup_program
self._loss = loss
self._check_collective_mode(main_program, self._optimizer,
self._strategy)
optimize_ops, param_grads = self._optimizer.minimize(
loss, startup_program, parameter_list, no_grad_set=no_grad_set)
fleet._origin_program = main_program.clone(for_test=False)
fleet._transpiled_program = main_program
fleet.main_program = self._try_to_compile(startup_program, main_program)
return optimize_ops, param_grads