From 5714e0796862642b6dc239111336ba70668b0d02 Mon Sep 17 00:00:00 2001
From: panfengfeng
Date: Tue, 4 Aug 2020 15:09:09 +0800
Subject: [PATCH] update resnet101 scripts

---
 mindspore/train/dataset_helper.py             |  2 +-
 model_zoo/official/cv/resnet/README.md        |  9 +-
 .../cv/resnet/scripts/run_eval_gpu.sh         | 84 +++++++++++++++++++
 .../scripts/run_standalone_train_gpu.sh       |  2 +-
 model_zoo/official/cv/resnet/train.py         | 19 +++--
 5 files changed, 104 insertions(+), 12 deletions(-)
 create mode 100755 model_zoo/official/cv/resnet/scripts/run_eval_gpu.sh

diff --git a/mindspore/train/dataset_helper.py b/mindspore/train/dataset_helper.py
index 00d0267011..ccb3b937b9 100644
--- a/mindspore/train/dataset_helper.py
+++ b/mindspore/train/dataset_helper.py
@@ -52,7 +52,7 @@ class DatasetHelper:
         sink_size (int): Control the amount of data each sink.
                              If sink_size=-1, sink the complete dataset each epoch.
                              If sink_size>0, sink sink_size data each epoch. Default: -1.
-        epoch_num (int): Control the number of epoch data to send.
+        epoch_num (int): Control the number of epoch data to send. Default: 1.
 
     Examples:
         >>> dataset_helper = DatasetHelper(dataset)
diff --git a/model_zoo/official/cv/resnet/README.md b/model_zoo/official/cv/resnet/README.md
index f976b5ffd1..a22df320e7 100644
--- a/model_zoo/official/cv/resnet/README.md
+++ b/model_zoo/official/cv/resnet/README.md
@@ -44,6 +44,9 @@ ImageNet2012
     ├── run_distribute_train.sh            # launch distributed training(8 pcs)
     ├── run_eval.sh                        # launch evaluation
     └── run_standalone_train.sh            # launch standalone training(1 pcs)
+    ├── run_distribute_train_gpu.sh        # launch gpu distributed training(8 pcs)
+    ├── run_eval_gpu.sh                    # launch gpu evaluation
+    └── run_standalone_train_gpu.sh        # launch gpu standalone training(1 pcs)
   ├── src
     ├── config.py                          # parameter configuration
     ├── dataset.py                         # data preprocessing
@@ -241,11 +244,11 @@ result: {'top_5_accuracy': 0.9429417413572343, 'top_1_accuracy': 0.7853513124199
 ### Running on GPU
 ```
 # distributed training example
-mpirun -n 8 python train.py --net=resnet50 --dataset=cifar10 --dataset_path=~/cifar-10-batches-bin --device_target="GPU" --run_distribute=True
+sh run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 
 # standalone training example
-python train.py --net=resnet50 --dataset=cifar10 --dataset_path=~/cifar-10-batches-bin --device_target="GPU"
+sh run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 
 # infer example
-python eval.py --net=resnet50 --dataset=cifar10 --dataset_path=~/cifar10-10-verify-bin --device_target="GPU" --checkpoint_path=resnet-90_195.ckpt
+sh run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
 ```
diff --git a/model_zoo/official/cv/resnet/scripts/run_eval_gpu.sh b/model_zoo/official/cv/resnet/scripts/run_eval_gpu.sh
new file mode 100755
index 0000000000..fc93602f5a
--- /dev/null
+++ b/model_zoo/official/cv/resnet/scripts/run_eval_gpu.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# != 4 ]
+then
+    echo "Usage: sh run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]"
+exit 1
+fi
+
+if [ "$1" != "resnet50" ] && [ "$1" != "resnet101" ]
+then
+    echo "error: the selected net is neither resnet50 nor resnet101"
+exit 1
+fi
+
+if [ "$2" != "cifar10" ] && [ "$2" != "imagenet2012" ]
+then
+    echo "error: the selected dataset is neither cifar10 nor imagenet2012"
+exit 1
+fi
+
+if [ "$1" = "resnet101" ] && [ "$2" = "cifar10" ]
+then
+    echo "error: evaluating resnet101 with cifar10 dataset is unsupported now!"
+exit 1
+fi
+
+# Print $1 as an absolute path; a relative path is resolved against $PWD.
+get_real_path(){
+  if [ "${1#/}" != "$1" ]; then  # stripping a leading "/" changed it => already absolute
+    echo "$1"
+  else
+    realpath -m "$PWD/$1"
+  fi
+}
+
+PATH1=$(get_real_path "$3")
+PATH2=$(get_real_path "$4")
+
+# Validate the resolved paths before creating the work directory.
+if [ ! -d "$PATH1" ]
+then
+    echo "error: DATASET_PATH=$PATH1 is not a directory"
+exit 1
+fi
+
+if [ ! -f "$PATH2" ]
+then
+    echo "error: CHECKPOINT_PATH=$PATH2 is not a file"
+exit 1
+fi
+
+ulimit -u unlimited
+export DEVICE_NUM=1
+export DEVICE_ID=0
+export RANK_SIZE=$DEVICE_NUM
+export RANK_ID=0
+
+if [ -d "eval" ];
+then
+    rm -rf ./eval
+fi
+mkdir ./eval
+cp ../*.py ./eval
+cp ./*.sh ./eval
+cp -r ../src ./eval
+cd ./eval || exit 1
+env > env.log
+echo "start evaluation for device $DEVICE_ID"
+python eval.py --net="$1" --dataset="$2" --dataset_path="$PATH1" --checkpoint_path="$PATH2" --device_target="GPU" > log 2>&1 &
+cd ..
diff --git a/model_zoo/official/cv/resnet/scripts/run_standalone_train_gpu.sh b/model_zoo/official/cv/resnet/scripts/run_standalone_train_gpu.sh
index 0be444d738..076bd4b332 100755
--- a/model_zoo/official/cv/resnet/scripts/run_standalone_train_gpu.sh
+++ b/model_zoo/official/cv/resnet/scripts/run_standalone_train_gpu.sh
@@ -16,7 +16,7 @@
 
 if [ $# != 3 ] && [ $# != 4 ]
 then
-    echo "Usage: sh run_standalone_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
+    echo "Usage: sh run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
 exit 1
 fi
 
diff --git a/model_zoo/official/cv/resnet/train.py b/model_zoo/official/cv/resnet/train.py
index 7f753a300f..414fa4c7de 100755
--- a/model_zoo/official/cv/resnet/train.py
+++ b/model_zoo/official/cv/resnet/train.py
@@ -157,13 +157,18 @@ if __name__ == '__main__':
         else:
             loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean", is_grad=False,
                                                  num_classes=config.class_num)
-        ## fp32 training
-        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, config.weight_decay)
-        model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})
-        # # Mixed precision
-        # loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
-        # opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, config.weight_decay, config.loss_scale)
-        # model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, amp_level="O2")
+
+        if args_opt.net == "resnet101":
+            opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, config.weight_decay,
+                           config.loss_scale)
+            loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
+            # Mixed precision
+            model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'},
+                          amp_level="O2", keep_batchnorm_fp32=True)
+        else:
+            ## fp32 training
+            opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, config.weight_decay)
+            model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})
 
     # define callbacks
     time_cb = TimeMonitor(data_size=step_size)