!3670 remove old MINDSPORE_HCCL_CONFIG_PATH in model zoo

Merge pull request !3670 from panbingao/master
pull/3670/MERGE
mindspore-ci-bot 5 years ago committed by Gitee
commit 6eddd65cf1
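In short, this merge replaces the deprecated MINDSPORE_HCCL_CONFIG_PATH environment variable with RANK_TABLE_FILE across the model zoo READMEs and launch scripts; where a script previously exported both names, only RANK_TABLE_FILE is kept. A minimal sketch of the pattern, distilled from the hunks below (illustrative excerpt, not any single changed file):

```bash
# Before this change: the HCCL rank table path was exported under both names.
# export MINDSPORE_HCCL_CONFIG_PATH=$1
# export RANK_TABLE_FILE=$1

# After this change: only RANK_TABLE_FILE is exported.
export RANK_TABLE_FILE=$1   # path to the HCCL rank table JSON, e.g. /data/hccl.json
export RANK_SIZE=8
export DEVICE_NUM=8
```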

@@ -24,7 +24,7 @@ This is an example of training DeepLabV3 with PASCAL VOC 2012 dataset in MindSpo
 ```
 - Run `run_distribute_train.sh` for distributed training.
 ``` bash
-sh scripts/run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH
+sh scripts/run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
 ```
 ### Evaluation
 Set options in evaluation_config.py. Make sure the 'data_file' and 'finetune_ckpt' are set to your own path.

@@ -16,14 +16,13 @@
 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "bash run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH"
-echo "for example: bash run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH [PRETRAINED_CKPT_PATH](option)"
+echo "bash run_distribute_train.sh RANK_TABLE_FILE DATA_PATH"
+echo "for example: bash run_distribute_train.sh RANK_TABLE_FILE DATA_PATH [PRETRAINED_CKPT_PATH](option)"
 echo "It is better to use absolute path."
 echo "=============================================================================================================="
 DATA_DIR=$2
-export MINDSPORE_HCCL_CONFIG_PATH=$1
 export RANK_TABLE_FILE=$1
 export RANK_SIZE=8
 export DEVICE_NUM=8

@@ -87,13 +87,13 @@ FasterRcnn is a two-stage target detection network,This network uses a region pr
 ```
 # distributed training
-sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_MODEL]
+sh run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL]
 # standalone training
 sh run_standalone_train.sh [PRETRAINED_MODEL]
 ```
-> Rank_table.json which is specified by MINDSPORE_HCCL_CONFIG_PATH is needed when you are running a distribute task. You can generate it by using the [hccl_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
+> Rank_table.json which is specified by RANK_TABLE_FILE is needed when you are running a distribute task. You can generate it by using the [hccl_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
 > As for PRETRAINED_MODEL, if not set, the model will be trained from the very beginning. Ready-made pretrained_models are not available now. Stay tuned.
 #### Result
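For reference, the rank table mentioned in this README hunk is a JSON file describing the Ascend devices on the host. A hedged sketch of generating it with the linked hccl_tools (assuming the utility is named hccl_tools.py and takes a --device_num range, as in its documented usage):

```bash
# Sketch only: script name, flag, and paths assumed from the hccl_tools utility linked above.
cd model_zoo/utils/hccl_tools
python hccl_tools.py --device_num "[0,8)"   # devices 0..7 on the local host
# The generated JSON (file name varies with the device list and host IP) is then
# passed to the launch script as RANK_TABLE_FILE, for example:
# sh run_distribute_train.sh /path/to/hccl_8p.json /path/to/pretrained.ckpt
```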

@@ -16,7 +16,7 @@
 if [ $# -lt 1 ] || [ $# -gt 2 ]
 then
-echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_PATH]"
+echo "Usage: sh run_train.sh [RANK_TABLE_FILE] [PRETRAINED_PATH]"
 exit 1
 fi
@@ -33,7 +33,7 @@ echo $PATH1
 if [ ! -f $PATH1 ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
+echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
 exit 1
 fi
@@ -51,7 +51,6 @@ fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1
 for((i=0; i<${DEVICE_NUM}; i++))
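The per-device loop that these scripts open (truncated in the hunk above) typically exports DEVICE_ID and RANK_ID from the loop index and starts one background training process per device. A generic sketch of that pattern (directory names and train.py flags are illustrative, not taken from this diff):

```bash
for ((i = 0; i < ${DEVICE_NUM}; i++)); do
    export DEVICE_ID=$i
    export RANK_ID=$i
    rm -rf ./train_parallel$i && mkdir ./train_parallel$i   # fresh working dir per device
    cp ./*.py ./train_parallel$i
    cd ./train_parallel$i || exit
    echo "start training for rank $RANK_ID, device $DEVICE_ID"
    python train.py --device_id=$i > log.txt 2>&1 &          # one process per device
    cd ..
done
```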

@@ -16,22 +16,22 @@
 if [ $# != 1 ]
 then
-echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH]"
+echo "Usage: sh run_train.sh [RANK_TABLE_FILE]"
 exit 1
 fi
 if [ ! -f $1 ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
+echo "error: RANK_TABLE_FILE=$1 is not a file"
 exit 1
 fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-MINDSPORE_HCCL_CONFIG_PATH=$(realpath $1)
-export MINDSPORE_HCCL_CONFIG_PATH
-echo "MINDSPORE_HCCL_CONFIG_PATH=${MINDSPORE_HCCL_CONFIG_PATH}"
+RANK_TABLE_FILE=$(realpath $1)
+export RANK_TABLE_FILE
+echo "RANK_TABLE_FILE=${RANK_TABLE_FILE}"
 export SERVER_ID=0
 rank_start=$((DEVICE_NUM * SERVER_ID))

@@ -88,7 +88,7 @@ MaskRcnn is a two-stage target detection network,This network uses a region prop
 ```
 # distributed training
-sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_MODEL]
+sh run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL]
 # standalone training
 sh run_standalone_train.sh [PRETRAINED_MODEL]

@@ -16,7 +16,7 @@
 if [ $# != 2 ]
 then
-echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_PATH]"
+echo "Usage: sh run_train.sh [RANK_TABLE_FILE] [PRETRAINED_PATH]"
 exit 1
 fi
@@ -35,7 +35,7 @@ echo $PATH2
 if [ ! -f $PATH1 ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
+echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
 exit 1
 fi
@@ -48,7 +48,6 @@ fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1
 echo 3 > /proc/sys/vm/drop_caches

@@ -60,7 +60,7 @@ Dataset used: [imagenet](http://www.image-net.org/)
 ### Usage
-- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]
+- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH]
 - GPU: sh run_trian.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]
 ### Launch
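Filled in with example values, the Ascend usage line above would look like the following (the argument order comes from the usage string; all paths are placeholders):

```bash
# 8-device distributed training on Ascend; paths are examples only.
sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 /path/to/hccl_8p.json /path/to/imagenet/train /path/to/pretrained.ckpt
```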

@@ -30,7 +30,6 @@ run_ascend()
 BASEPATH=$(cd "`dirname $0`" || exit; pwd)
 export PYTHONPATH=${BASEPATH}:$PYTHONPATH
-export MINDSPORE_HCCL_CONFIG_PATH=$4
 export RANK_TABLE_FILE=$4
 if [ -d "../train" ];
 then
@@ -81,7 +80,7 @@ run_gpu()
 if [ $# -gt 6 ] || [ $# -lt 4 ]
 then
 echo "Usage:\n \
-Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]\n \
+Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH]\n \
 GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]\n \
 "
 exit 1

@@ -141,7 +141,6 @@ def main():
         env['RANK_ID'] = str(rank_id)
         env['DEVICE_ID'] = str(device_id)
         if args.nproc_per_node > 1:
-            env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
             env['RANK_TABLE_FILE'] = table_fn
         if os.path.exists(device_dir):
             shutil.rmtree(device_dir)

@@ -138,7 +138,7 @@ def main():
         env['RANK_ID'] = str(rank_id)
         env['DEVICE_ID'] = str(device_id)
         if args.nproc_per_node > 1:
-            env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
+            env['RANK_TABLE_FILE'] = table_fn
             env['RANK_TABLE_FILE'] = table_fn
         if os.path.exists(device_dir):
             shutil.rmtree(device_dir)

@@ -132,7 +132,7 @@ Parameters for both training and evaluation can be set in config.py.
 ```
 # distributed training
-Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]
+Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH]
 [PRETRAINED_CKPT_PATH](optional)
 # standalone training

@@ -16,7 +16,7 @@
 if [ $# != 4 ] && [ $# != 5 ]
 then
-echo "Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
+echo "Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
 exit 1
 fi
@@ -57,7 +57,7 @@ fi
 if [ ! -f $PATH1 ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
+echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
 exit 1
 fi
@@ -76,7 +76,6 @@ fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1
 export SERVER_ID=0

@@ -140,7 +140,7 @@ def main():
         env['RANK_ID'] = str(rank_id)
         env['DEVICE_ID'] = str(device_id)
         if args.nproc_per_node > 1:
-            env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
+            env['RANK_TABLE_FILE'] = table_fn
             env['RANK_TABLE_FILE'] = table_fn
         if os.path.exists(device_dir):
             shutil.rmtree(device_dir)

@@ -74,7 +74,7 @@ Parameters for both training and inference can be set in config.py.
 ```
 # distributed training
-Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [DEVICE_NUM]
+Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [DEVICE_NUM]
 ```

@@ -16,13 +16,13 @@
 if [ $# != 3 ]
 then
-echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [DEVICE_NUM]"
+echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [DEVICE_NUM]"
 exit 1
 fi
 if [ ! -f $1 ]
 then
-echo "error: DMINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
+echo "error: RANK_TABLE_FILE=$1 is not a file"
 exit 1
 fi
@@ -38,7 +38,7 @@ cd $BASE_PATH/../ || exit
 ulimit -u unlimited
 export DEVICE_NUM=$3
 export RANK_SIZE=$3
-export MINDSPORE_HCCL_CONFIG_PATH=$1
+export RANK_TABLE_FILE=$1
 for((i=0; i<${DEVICE_NUM}; i++))
 do

@@ -82,7 +82,7 @@ Parameters for both training and evaluating can be set in config.py
 ```
 # distribute training example(8p)
-sh run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH
+sh run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
 # standalone training
 sh run_standalone_train.sh DEVICE_ID DATA_PATH
 ```
@@ -91,7 +91,7 @@ sh run_standalone_train.sh DEVICE_ID DATA_PATH
 ```bash
 # distributed training example(8p) for Ascend
-sh scripts/run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH /dataset/train
+sh scripts/run_distribute_train.sh RANK_TABLE_FILE /dataset/train
 # standalone training example for Ascend
 sh scripts/run_standalone_train.sh 0 /dataset/train

@@ -16,7 +16,7 @@
 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
+echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET RANK_TABLE_FILE PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
 echo "for example: sh run_distribute_train.sh 8 500 0.2 coco /data/hccl.json /opt/ssd-300.ckpt(optional) 200(optional)"
 echo "It is better to use absolute path."
 echo "================================================================================================================="
@@ -24,7 +24,7 @@ echo "==========================================================================
 if [ $# != 5 ] && [ $# != 7 ]
 then
 echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] \
-[MINDSPORE_HCCL_CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
+[RANK_TABLE_FILE] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
 exit 1
 fi
@@ -41,7 +41,7 @@ LR=$3
 DATASET=$4
 PRE_TRAINED=$6
 PRE_TRAINED_EPOCH_SIZE=$7
-export MINDSPORE_HCCL_CONFIG_PATH=$5
+export RANK_TABLE_FILE=$5
 for((i=0;i<RANK_SIZE;i++))
 do

@@ -209,10 +209,10 @@ parameters/options:
 - Train on Ascend.
 ```
-Usage: sh script/run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH]
+Usage: sh script/run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH]
 parameters/options:
-MINDSPORE_HCCL_CONFIG_PATH HCCL configuration file path.
+RANK_TABLE_FILE HCCL configuration file path.
 DATA_PATH the storage path of dataset.
 ```
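With example values filled in, the usage above becomes something like the following (paths are placeholders):

```bash
# RANK_TABLE_FILE is the HCCL rank table JSON; DATA_PATH is the dataset root.
sh script/run_distribute_train.sh /path/to/hccl_8p.json /path/to/dataset
```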

@@ -16,13 +16,13 @@
 if [ $# != 2 ]
 then
-echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH]"
+echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH]"
 exit 1
 fi
 if [ ! -f $1 ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
+echo "error: RANK_TABLE_FILE=$1 is not a file"
 exit 1
 fi
@@ -34,7 +34,7 @@ fi
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$1
+export RANK_TABLE_FILE=$1
 for((i=0;i<RANK_SIZE;i++))
 do

@@ -77,7 +77,7 @@ Parameters for both training and evaluation can be set in config.py.
 ```
 # distributed training in Ascend
-Usage: bash run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]
+Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH]
 # distributed training in GPU
 Usage: bash run_distribute_train_for_gpu.sh [RANK_SIZE] [DATASET_PATH]

@@ -15,7 +15,7 @@
 # ============================================================================
 if [ $# != 2 ]; then
-echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]"
+echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH]"
 exit 1
 fi
@@ -31,7 +31,7 @@ PATH1=$(get_real_path $1)
 PATH2=$(get_real_path $2)
 if [ ! -f $PATH1 ]; then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
+echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
 exit 1
 fi
@@ -43,7 +43,6 @@ fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1
 for ((i = 0; i < ${DEVICE_NUM}; i++)); do
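The get_real_path helper visible in the hunk headers above normalizes a possibly relative argument into an absolute path before the file checks run. A sketch of how such a helper is commonly written in these scripts (assumed, since its definition lies outside the shown hunks):

```bash
get_real_path() {
    # Return the argument unchanged if it is already absolute,
    # otherwise resolve it against the current working directory.
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        echo "$(realpath -m "$PWD/$1")"
    fi
}
```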

@@ -55,7 +55,7 @@ This is an example of training YOLOV3-DarkNet53 with COCO2014 dataset in MindSpo
 ```
 # distributed training
-sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [MINDSPORE_HCCL_CONFIG_PATH]
+sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [RANK_TABLE_FILE]
 # standalone training
 sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE]

@@ -16,7 +16,7 @@
 if [ $# != 3 ]
 then
-echo "Usage: sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [MINDSPORE_HCCL_CONFIG_PATH]"
+echo "Usage: sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [RANK_TABLE_FILE]"
 exit 1
 fi
@@ -30,10 +30,10 @@ get_real_path(){
 DATASET_PATH=$(get_real_path $1)
 PRETRAINED_BACKBONE=$(get_real_path $2)
-MINDSPORE_HCCL_CONFIG_PATH=$(get_real_path $3)
+RANK_TABLE_FILE=$(get_real_path $3)
 echo $DATASET_PATH
 echo $PRETRAINED_BACKBONE
-echo $MINDSPORE_HCCL_CONFIG_PATH
+echo $RANK_TABLE_FILE
 if [ ! -d $DATASET_PATH ]
 then
@@ -47,15 +47,15 @@ then
 exit 1
 fi
-if [ ! -f $MINDSPORE_HCCL_CONFIG_PATH ]
+if [ ! -f $RANK_TABLE_FILE ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$MINDSPORE_HCCL_CONFIG_PATH is not a file"
+echo "error: RANK_TABLE_FILE=$RANK_TABLE_FILE is not a file"
 exit 1
 fi
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$MINDSPORE_HCCL_CONFIG_PATH
+export RANK_TABLE_FILE=$RANK_TABLE_FILE
 for((i=0; i<${DEVICE_NUM}; i++))
 do

@@ -16,7 +16,7 @@
 echo "======================================================================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
+echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH RANK_TABLE_FILE PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
 echo "For example: sh run_distribute_train.sh 8 150 /data/Mindrecord_train /data /data/train.txt /data/hccl.json /opt/yolov3-150.ckpt(optional) 100(optional)"
 echo "It is better to use absolute path."
 echo "The learning rate is 0.005 as default, if you want other lr, please change the value in this script."
@@ -24,7 +24,7 @@ echo "==========================================================================
 if [ $# != 6 ] && [ $# != 8 ]
 then
-echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] [MINDSPORE_HCCL_CONFIG_PATH] \
+echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] [RANK_TABLE_FILE] \
 [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
 exit 1
 fi
@@ -42,7 +42,7 @@ python train.py --only_create_dataset=1 --mindrecord_dir=$MINDRECORD_DIR --image
 echo "After running the scipt, the network runs in the background. The log will be generated in LOGx/log.txt"
-export MINDSPORE_HCCL_CONFIG_PATH=$6
+export RANK_TABLE_FILE=$6
 export RANK_SIZE=$1
 BASE_PATH=$(cd "`dirname $0`" || exit; pwd)

Some files were not shown because too many files have changed in this diff.
