From 121943bdb5971ee2924aec86c0b708c95b4ad2f2 Mon Sep 17 00:00:00 2001 From: panfengfeng Date: Mon, 31 Aug 2020 16:41:05 +0800 Subject: [PATCH] update shufflenetv2 scripts --- model_zoo/official/cv/shufflenetv2/Readme.md | 20 ++++----- model_zoo/official/cv/shufflenetv2/eval.py | 3 +- .../scripts/run_distribute_train_for_gpu.sh | 44 ++++++++++++++++++- ...l_for_multi_gpu.sh => run_eval_for_gpu.sh} | 35 +++++++++++++-- .../scripts/run_standalone_train_for_gpu.sh | 28 ++++++++++-- .../official/cv/shufflenetv2/src/dataset.py | 2 - model_zoo/official/cv/shufflenetv2/train.py | 8 ++-- 7 files changed, 114 insertions(+), 26 deletions(-) rename model_zoo/official/cv/shufflenetv2/scripts/{run_eval_for_multi_gpu.sh => run_eval_for_gpu.sh} (53%) diff --git a/model_zoo/official/cv/shufflenetv2/Readme.md b/model_zoo/official/cv/shufflenetv2/Readme.md index 23291073d9..ff83b656d4 100644 --- a/model_zoo/official/cv/shufflenetv2/Readme.md +++ b/model_zoo/official/cv/shufflenetv2/Readme.md @@ -55,7 +55,7 @@ Dataset used: [imagenet](http://www.image-net.org/) +-- Readme.md # descriptions about ShuffleNetV2 +-- scripts ¦ +--run_distribute_train_for_gpu.sh # shell script for distributed training - ¦ +--run_eval_for_multi_gpu.sh # shell script for evaluation + ¦ +--run_eval_for_gpu.sh # shell script for evaluation ¦ +--run_standalone_train_for_gpu.sh # shell script for standalone training +-- src ¦ +--config.py # parameter configuration @@ -75,23 +75,23 @@ Dataset used: [imagenet](http://www.image-net.org/) You can start training using python or shell scripts. 
The usage of shell scripts as follows: -- Ditributed training on GPU: sh run_distribute_train_for_gpu.sh [DATA_DIR] -- Standalone training on GPU: sh run_standalone_train_for_gpu.sh [DEVICE_ID] [DATA_DIR] +- Distributed training on GPU: sh run_distribute_train_for_gpu.sh [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] +- Standalone training on GPU: sh run_standalone_train_for_gpu.sh [DATASET_PATH] ### Launch ``` # training example python: - GPU: mpirun --allow-run-as-root -n 8 python train.py --is_distributed --platform 'GPU' --dataset_path '~/imagenet/train/' > train.log 2>&1 & + GPU: mpirun --allow-run-as-root -n 8 python train.py --is_distributed=True --platform='GPU' --dataset_path='~/imagenet/train/' > train.log 2>&1 & shell: - GPU: sh run_distribute_train_for_gpu.sh ~/imagenet/train/ + GPU: cd scripts && sh run_distribute_train_for_gpu.sh 8 0,1,2,3,4,5,6,7 ~/imagenet/train/ ``` ### Result -Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log will be redirected to `./train/train.log`. +Training result will be stored in the example path. Checkpoints will be stored at `./checkpoint` by default, and training log will be redirected to `./train/train.log`. ## [Eval process](#contents) @@ -99,21 +99,21 @@ Training result will be stored in the example path. Checkpoints will be stored a You can start evaluation using python or shell scripts. 
The usage of shell scripts as follows: -- GPU: sh run_eval_for_multi_gpu.sh [DEVICE_ID] [EPOCH] +- GPU: sh run_eval_for_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] ### Launch ``` # infer example python: - GPU: CUDA_VISIBLE_DEVICES=0 python eval.py --platform 'GPU' --dataset_path '~/imagenet/val/' --epoch 250 > eval.log 2>&1 & + GPU: CUDA_VISIBLE_DEVICES=0 python eval.py --platform='GPU' --dataset_path='~/imagenet/val/' > eval.log 2>&1 & shell: - GPU: sh run_eval_for_multi_gpu.sh 0 250 + GPU: cd scripts && sh run_eval_for_gpu.sh '~/imagenet/val/' 'checkpoint_file' ``` > checkpoint can be produced in training process. ### Result -Inference result will be stored in the example path, you can find result in `val.log`. +Inference result will be stored in the example path, you can find result in `eval.log`. diff --git a/model_zoo/official/cv/shufflenetv2/eval.py b/model_zoo/official/cv/shufflenetv2/eval.py index 51a4ceea8a..fdbddcf376 100644 --- a/model_zoo/official/cv/shufflenetv2/eval.py +++ b/model_zoo/official/cv/shufflenetv2/eval.py @@ -31,7 +31,6 @@ if __name__ == '__main__': parser.add_argument('--checkpoint', type=str, default='', help='checkpoint of ShuffleNetV2 (Default: None)') parser.add_argument('--dataset_path', type=str, default='', help='Dataset path') parser.add_argument('--platform', type=str, default='GPU', choices=('Ascend', 'GPU'), help='run platform') - parser.add_argument('--epoch', type=str, default='') args_opt = parser.parse_args() if args_opt.platform == 'Ascend': @@ -43,7 +42,7 @@ if __name__ == '__main__': ckpt = load_checkpoint(args_opt.checkpoint) load_param_into_net(net, ckpt) net.set_train(False) - dataset = create_dataset(args_opt.dataset_path, cfg, False) + dataset = create_dataset(args_opt.dataset_path, False, 0, 1) loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', is_grad=False, smooth_factor=0.1, num_classes=cfg.num_classes) eval_metrics = {'Loss': nn.Loss(), diff --git 
a/model_zoo/official/cv/shufflenetv2/scripts/run_distribute_train_for_gpu.sh b/model_zoo/official/cv/shufflenetv2/scripts/run_distribute_train_for_gpu.sh index 305f1dcfff..c3bfedeaf8 100644 --- a/model_zoo/official/cv/shufflenetv2/scripts/run_distribute_train_for_gpu.sh +++ b/model_zoo/official/cv/shufflenetv2/scripts/run_distribute_train_for_gpu.sh @@ -13,5 +13,45 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -DATA_DIR=$1 -mpirun --allow-run-as-root -n 8 python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 & +if [ $# -lt 3 ] then + echo "Usage: \ + sh run_distribute_train_for_gpu.sh [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] \ + " +exit 1 +fi + +if [ $1 -lt 1 ] || [ $1 -gt 8 ] +then + echo "error: DEVICE_NUM=$1 is not in (1-8)" +exit 1 +fi + +# check dataset file +if [ ! -d $3 ] +then + echo "error: DATASET_PATH=$3 is not a directory" +exit 1 +fi + +export DEVICE_NUM=$1 +export RANK_SIZE=$1 + +BASEPATH=$(cd "`dirname $0`" || exit; pwd) +export PYTHONPATH=${BASEPATH}:$PYTHONPATH +if [ -d "../train" ]; +then + rm -rf ../train +fi +mkdir ../train +cd ../train || exit + +export CUDA_VISIBLE_DEVICES="$2" + +if [ $1 -gt 1 ] +then + mpirun -n $1 --allow-run-as-root \ + python ${BASEPATH}/../train.py --platform='GPU' --is_distributed=True --dataset_path=$3 > train.log 2>&1 & +else + python ${BASEPATH}/../train.py --platform='GPU' --dataset_path=$3 > train.log 2>&1 & +fi diff --git a/model_zoo/official/cv/shufflenetv2/scripts/run_eval_for_multi_gpu.sh b/model_zoo/official/cv/shufflenetv2/scripts/run_eval_for_gpu.sh similarity index 53% rename from model_zoo/official/cv/shufflenetv2/scripts/run_eval_for_multi_gpu.sh rename to model_zoo/official/cv/shufflenetv2/scripts/run_eval_for_gpu.sh index 3d5c42a72a..af6492886a 100644 --- 
a/model_zoo/official/cv/shufflenetv2/scripts/run_eval_for_multi_gpu.sh +++ b/model_zoo/official/cv/shufflenetv2/scripts/run_eval_for_gpu.sh @@ -13,6 +13,35 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -DEVICE_ID=$1 -EPOCH=$2 -CUDA_VISIBLE_DEVICES=$DEVICE_ID python ./eval.py --platform 'GPU' --dataset_path '/home/data/ImageNet_Original/val/' --epoch $EPOCH > eval.log 2>&1 & +if [ $# != 2 ] +then + echo "GPU: sh run_eval_for_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]" +exit 1 +fi + +# check dataset file +if [ ! -d $1 ] +then + echo "error: DATASET_PATH=$1 is not a directory" +exit 1 +fi + +# check checkpoint file +if [ ! -f $2 ] +then + echo "error: CHECKPOINT_PATH=$2 is not a file" +exit 1 +fi + +BASEPATH=$(cd "`dirname $0`" || exit; pwd) +export PYTHONPATH=${BASEPATH}:$PYTHONPATH +export DEVICE_ID=0 + +if [ -d "../eval" ]; +then + rm -rf ../eval +fi +mkdir ../eval +cd ../eval || exit + +python ${BASEPATH}/../eval.py --dataset_path=$1 --checkpoint=$2 > ./eval.log 2>&1 & diff --git a/model_zoo/official/cv/shufflenetv2/scripts/run_standalone_train_for_gpu.sh b/model_zoo/official/cv/shufflenetv2/scripts/run_standalone_train_for_gpu.sh index a007a96cb0..02da407d1c 100644 --- a/model_zoo/official/cv/shufflenetv2/scripts/run_standalone_train_for_gpu.sh +++ b/model_zoo/official/cv/shufflenetv2/scripts/run_standalone_train_for_gpu.sh @@ -13,6 +13,28 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -DEVICE_ID=$1 -DATA_DIR=$2 -CUDA_VISIBLE_DEVICES=$DEVICE_ID python ./train.py --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 & +if [ $# -lt 1 ] +then + echo "Usage: \ + sh run_standalone_train_for_gpu.sh [DATASET_PATH] \ + " +exit 1 +fi + +# check dataset file +if [ ! 
-d $1 ] +then + echo "error: DATASET_PATH=$1 is not a directory" +exit 1 +fi + +BASEPATH=$(cd "`dirname $0`" || exit; pwd) +export PYTHONPATH=${BASEPATH}:$PYTHONPATH +if [ -d "../train" ]; +then + rm -rf ../train +fi +mkdir ../train +cd ../train || exit + +python ${BASEPATH}/../train.py --platform='GPU' --dataset_path=$1 > train.log 2>&1 & diff --git a/model_zoo/official/cv/shufflenetv2/src/dataset.py b/model_zoo/official/cv/shufflenetv2/src/dataset.py index 26b37d78d5..f67c37f0cc 100644 --- a/model_zoo/official/cv/shufflenetv2/src/dataset.py +++ b/model_zoo/official/cv/shufflenetv2/src/dataset.py @@ -75,7 +75,5 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1): ds = ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=cfg.work_nums) # apply batch operations ds = ds.batch(cfg.batch_size, drop_remainder=True) - # apply dataset repeat operation - ds = ds.repeat(repeat_num) return ds diff --git a/model_zoo/official/cv/shufflenetv2/train.py b/model_zoo/official/cv/shufflenetv2/train.py index ac97fe5a3d..0d6560bcb0 100644 --- a/model_zoo/official/cv/shufflenetv2/train.py +++ b/model_zoo/official/cv/shufflenetv2/train.py @@ -14,6 +14,7 @@ # ============================================================================ """train_imagenet.""" import argparse +import ast import os import random import numpy as np @@ -23,7 +24,7 @@ from network import ShuffleNetV2 import mindspore.nn as nn from mindspore import context from mindspore import dataset as de -from mindspore import ParallelMode +from mindspore.context import ParallelMode from mindspore import Tensor from mindspore.communication.management import init, get_rank, get_group_size from mindspore.nn.optim.momentum import Momentum @@ -42,10 +43,9 @@ de.config.set_seed(cfg.random_seed) if __name__ == '__main__': parser = argparse.ArgumentParser(description='image classification training') - parser.add_argument('--dataset_path', type=str, 
default='/home/data/imagenet_jpeg/train/', help='Dataset path') + parser.add_argument('--dataset_path', type=str, default='', help='Dataset path') parser.add_argument('--resume', type=str, default='', help='resume training with existed checkpoint') - parser.add_argument('--is_distributed', action='store_true', default=False, - help='distributed training') + parser.add_argument('--is_distributed', type=ast.literal_eval, default=False, help='distributed training') parser.add_argument('--platform', type=str, default='GPU', choices=('Ascend', 'GPU'), help='run platform') parser.add_argument('--model_size', type=str, default='1.0x', help='ShuffleNetV2 model size parameter') args_opt = parser.parse_args()