diff --git a/mindspore/parallel/_auto_parallel_context.py b/mindspore/parallel/_auto_parallel_context.py index 7fd366c2f0..e2369c4aa6 100644 --- a/mindspore/parallel/_auto_parallel_context.py +++ b/mindspore/parallel/_auto_parallel_context.py @@ -275,7 +275,7 @@ class _AutoParallelContext: Args: indices (list): Indices list. - group (str): The hccl communication group. + group (str): The communication group of hccl/nccl. Raises: TypeError: If type of indices item is not int. @@ -311,7 +311,7 @@ class _AutoParallelContext: Get allreduce fusion split indices. Args: - group (str): The hccl communication group. + group (str): The communication group of hccl/nccl. Returns: - Return split sizes list according to the group. + Return split indices list according to the group. @@ -340,7 +340,7 @@ class _AutoParallelContext: Args: sizes (list): Sizes list. - group (str): The hccl communication group. + group (str): The communication group of hccl/nccl. Raises: TypeError: If type of sizes item is not int. @@ -376,7 +376,7 @@ class _AutoParallelContext: Get allreduce fusion split sizes. Args: - group (str): The hccl communication group. + group (str): The communication group of hccl/nccl. Returns: Return split sizes list according to the group. 
diff --git a/model_zoo/official/cv/resnet/README.md b/model_zoo/official/cv/resnet/README.md index 2d64530d98..fcb2ce1be4 100644 --- a/model_zoo/official/cv/resnet/README.md +++ b/model_zoo/official/cv/resnet/README.md @@ -44,7 +44,7 @@ ImageNet2012 ├── run_distribute_train.sh # launch distributed training(8 pcs) ├── run_parameter_server_train.sh # launch Ascend parameter server training(8 pcs) ├── run_eval.sh # launch evaluation - └── run_standalone_train.sh # launch standalone training(1 pcs) + ├── run_standalone_train.sh # launch standalone training(1 pcs) ├── run_distribute_train_gpu.sh # launch gpu distributed training(8 pcs) ├── run_parameter_server_train_gpu.sh # launch gpu parameter server training(8 pcs) ├── run_eval_gpu.sh # launch gpu evaluation diff --git a/model_zoo/official/cv/resnet/train.py b/model_zoo/official/cv/resnet/train.py index 3d65d10392..8e77dc7833 100755 --- a/model_zoo/official/cv/resnet/train.py +++ b/model_zoo/official/cv/resnet/train.py @@ -81,9 +81,11 @@ if __name__ == '__main__': init() # GPU target else: init("nccl") context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True) + if args_opt.net == "resnet50": + auto_parallel_context().set_all_reduce_fusion_split_indices([85, 160]) ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/" # create dataset