@@ -18,39 +18,6 @@ from .meta_optimizer_base import MetaOptimizerBase
 from ..base.private_helper_function import wait_server_ready
 
 
-def get_build_strategy(dist_strategy):
-    build_strategy = paddle.BuildStrategy()
-    build_strategy.enable_sequential_execution = \
-        dist_strategy.sequential_execution
-    build_strategy.remove_unnecessary_lock = True
-    build_strategy.fuse_elewise_add_act_ops = \
-        dist_strategy.fuse_elewise_add_act_ops
-    build_strategy.fuse_bn_act_ops = \
-        dist_strategy.fuse_bn_act_ops
-    build_strategy.enable_auto_fusion = \
-        dist_strategy.enable_auto_fusion
-    build_strategy.fuse_relu_depthwise_conv = \
-        dist_strategy.fuse_relu_depthwise_conv
-    build_strategy.fuse_broadcast_ops = \
-        dist_strategy.fuse_broadcast_ops
-    build_strategy.sync_batch_norm = \
-        dist_strategy.sync_batch_norm
-    return build_strategy
-
-
-def get_execution_strategy(dist_strategy):
-    execution_strategy = paddle.ExecutionStrategy()
-    execution_strategy.num_threads = \
-        dist_strategy.num_threads
-    execution_strategy.num_iteration_per_drop_scope = \
-        dist_strategy.num_iteration_per_drop_scope
-    execution_strategy.num_iteration_per_run = \
-        dist_strategy.num_iteration_per_run
-    execution_strategy.use_thread_barrier = \
-        dist_strategy.use_thread_barrier
-    return execution_strategy
-
-
 class GraphExecutionOptimizer(MetaOptimizerBase):
     def __init__(self, optimizer):
         super(GraphExecutionOptimizer, self).__init__(optimizer)
@@ -76,7 +43,7 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
         pass
 
     # should fix the variable
-    def _setup_nccl_op(self, startup_program, main_program):
+    def _setup_nccl_op(self, startup_program, main_program, build_strategy):
         trainer_endpoints = self.role_maker.get_trainer_endpoints()
         trainer_id = self.role_maker.worker_index()
         current_endpoint = self.role_maker.get_trainer_endpoints()[trainer_id]
@@ -88,14 +55,14 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
             wait_server_ready(other_trainer_endpoints)
 
         nccl_id_var = startup_program.global_block().create_var(
             name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
-        for i in range(1, self.user_defined_strategy.nccl_comm_num):
+        for i in range(1, build_strategy.nccl_comm_num):
             startup_program.global_block().create_var(
                 name="NCCLID_{}".format(i),
                 persistable=True,
                 type=core.VarDesc.VarType.RAW)
-        if self.user_defined_strategy.hierachical_allreduce:
-            for i in range(0, self.user_defined_strategy.nccl_comm_num):
+        if build_strategy.use_hierarchical_allreduce:
+            for i in range(0, build_strategy.nccl_comm_num):
                 startup_program.global_block().create_var(
                     name="Hierarchical_inter_NCCLID_{}".format(i),
                     persistable=True,
@@ -112,48 +79,80 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
             attrs={
                 "trainers": trainer_endpoints,
                 "trainer_id": trainer_id,
-                "nccl_comm_num": self.user_defined_strategy.nccl_comm_num,
+                "nccl_comm_num": build_strategy.nccl_comm_num,
                 "use_hierarchical_allreduce":
-                self.user_defined_strategy.hierachical_allreduce,
+                build_strategy.use_hierarchical_allreduce,
                 "hierarchical_allreduce_inter_ranks":
-                self.user_defined_strategy.hierachical_allreduce_inter_ranks
+                build_strategy.hierarchical_allreduce_inter_nranks
             })
 
     def _try_to_compile(self, startup_program, main_program, loss):
-        build_strategy = get_build_strategy(self.user_defined_strategy)
-        exe_strategy = get_execution_strategy(self.user_defined_strategy)
+        import copy
+        dist_strategy = self.user_defined_strategy
+        local_build_strategy = paddle.fluid.BuildStrategy()
+        local_build_strategy.enable_sequential_execution = \
+            dist_strategy.build_strategy.enable_sequential_execution
+        local_build_strategy.fuse_elewise_add_act_ops = \
+            dist_strategy.build_strategy.fuse_elewise_add_act_ops
+        local_build_strategy.fuse_bn_act_ops = \
+            dist_strategy.build_strategy.fuse_bn_act_ops
+        local_build_strategy.enable_auto_fusion = \
+            dist_strategy.build_strategy.enable_auto_fusion
+        local_build_strategy.fuse_relu_depthwise_conv = \
+            dist_strategy.build_strategy.fuse_relu_depthwise_conv
+        local_build_strategy.fuse_broadcast_ops = \
+            dist_strategy.build_strategy.fuse_broadcast_ops
+        local_build_strategy.fuse_all_optimizer_ops = \
+            dist_strategy.build_strategy.fuse_all_optimizer_ops
+        local_build_strategy.enable_inplace = \
+            dist_strategy.build_strategy.enable_inplace
+        local_build_strategy.use_hierarchical_allreduce = \
+            dist_strategy.use_hierarchical_allreduce
+        local_build_strategy.hierarchical_allreduce_inter_nranks = \
+            dist_strategy.hierarchical_allreduce_inter_nranks
+        local_build_strategy.sync_batch_norm = \
+            dist_strategy.sync_batch_norm
+        local_build_strategy.fuse_all_reduce_ops = \
+            dist_strategy.fuse_all_reduce_ops
+        local_build_strategy.nccl_comm_num = \
+            dist_strategy.nccl_comm_num
+
+        exe_strategy = self.user_defined_strategy.execution_strategy
         node_num = self.role_maker.worker_num()
         if self.role_maker._is_collective:
             assert node_num >= 1, "nccl2 node_num must >= 1, now: {}" % node_num
 
         if node_num <= 1:
             # local mode
-            if self.user_defined_strategy.nccl_comm_num > 1:
+            if local_build_strategy.nccl_comm_num > 1:
                 logging.warn("set nccl_comm_num=1 since you only have 1 node.")
-                self.user_defined_strategy.nccl_comm_num = 1
+                local_build_strategy.nccl_comm_num = 1
 
-            if self.user_defined_strategy.hierachical_allreduce:
+            if local_build_strategy.use_hierarchical_allreduce:
                 logging.warn(
                     "set hierachical_allreduce=False since you only have 1 node."
                 )
-                self.user_defined_strategy.hierachical_allreduce = False
+                local_build_strategy.use_hierarchical_allreduce = False
 
-        sync_allreduce = self.user_defined_strategy.sync_nccl_allreduce
+        sync_allreduce = dist_strategy.sync_nccl_allreduce
         if sync_allreduce:
-            exe_strategy.num_threads = self.user_defined_strategy.nccl_comm_num + 1
-            if self.user_defined_strategy.hierachical_allreduce:
-                exe_strategy.num_threads = 2 * self.user_defined_strategy.nccl_comm_num + 1
+            paddle.fluid.framework.set_flags({
+                "FLAGS_sync_nccl_allreduce": True
+            })
+            exe_strategy.num_threads = local_build_strategy.nccl_comm_num + 1
+            if local_build_strategy.use_hierarchical_allreduce:
+                exe_strategy.num_threads = 2 * local_build_strategy.nccl_comm_num + 1
             if exe_strategy.num_threads > 4:
                 logging.warn(
                     "if you use hierachical_allreduce or "
-                    "with multi nccl comm, please export FLAGS_sync_nccl_allreduce = 0"
+                    "with multi nccl comm, please set distributed_strategy.sync_nccl_allreduce=False"
                 )
 
         # TODO(guru4elephant): should be an independent optimizer
-        sync_batch_norm = self.user_defined_strategy.sync_batch_norm
+        sync_batch_norm = local_build_strategy.sync_batch_norm
         if sync_batch_norm:
-            self.user_defined_strategy.nccl_comm_num = 1
-            self.user_defined_strategy.hierachical_allreduce = False
+            local_build_strategy.nccl_comm_num = 1
+            local_build_strategy.use_hierarchical_allreduce = False
             exe_strategy.num_threads = 1
             logging.warn(
                 "use sync_batch_norm will hang when set num_threads > 1, so "
@@ -161,19 +160,19 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
             )
 
         # TODO(guru4elephant): should be an independent optimizer
-        self._setup_nccl_op(startup_program, main_program)
+        self._setup_nccl_op(startup_program, main_program, local_build_strategy)
 
-        build_strategy.num_trainers = self.role_maker.worker_num()
-        build_strategy.trainer_id = self.role_maker.worker_index()
-        build_strategy.trainers_endpoints = self.role_maker.get_trainer_endpoints(
+        local_build_strategy.num_trainers = self.role_maker.worker_num()
+        local_build_strategy.trainer_id = self.role_maker.worker_index()
+        local_build_strategy.trainers_endpoints = self.role_maker.get_trainer_endpoints(
         )
-        build_strategy.enable_backward_optimizer_op_deps = True
+        local_build_strategy.enable_backward_optimizer_op_deps = True
 
         self._compiled_program = compiler.CompiledProgram(main_program)
         self._compiled_program.with_data_parallel(
             loss_name=loss.name,
-            build_strategy=build_strategy,
+            build_strategy=local_build_strategy,
             exec_strategy=exe_strategy,
             share_vars_from=None)
@@ -188,7 +187,7 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
             startup_program = paddle.default_startup_program()
         compiled_program = self._try_to_compile(startup_program,
                                                 loss.block.program, loss)
-        loss.block.program.graph = compiled_program
+        loss.block.program._graph = compiled_program
 
         # just return self.optimizer_ops and self.param_grads
         return None, None
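
For reference, below is a minimal usage sketch of the strategy fields this optimizer now consumes. It is not part of the patch: it assumes the Paddle 2.x collective fleet API in static-graph mode, and the exact property names should be checked against your Paddle version.

import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init(is_collective=True)

strategy = fleet.DistributedStrategy()
# Top-level fields read by GraphExecutionOptimizer in the diff above.
strategy.nccl_comm_num = 2
strategy.sync_nccl_allreduce = True
strategy.fuse_all_reduce_ops = True

# Pass-level knobs now travel through DistributedStrategy.build_strategy and
# DistributedStrategy.execution_strategy instead of flat strategy fields.
build_strategy = paddle.static.BuildStrategy()
build_strategy.fuse_elewise_add_act_ops = True
strategy.build_strategy = build_strategy

exe_strategy = paddle.static.ExecutionStrategy()
exe_strategy.num_iteration_per_drop_scope = 10
strategy.execution_strategy = exe_strategy

# A toy static-graph model; under a collective launch, minimize() may be
# routed through GraphExecutionOptimizer._try_to_compile() shown above.
x = paddle.static.data(name="x", shape=[None, 32], dtype="float32")
y = paddle.static.data(name="y", shape=[None, 1], dtype="float32")
pred = paddle.static.nn.fc(x, size=1)
loss = paddle.mean(paddle.nn.functional.square_error_cost(pred, y))

optimizer = fleet.distributed_optimizer(
    paddle.optimizer.SGD(learning_rate=0.01), strategy=strategy)
optimizer.minimize(loss)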