From 5714e0796862642b6dc239111336ba70668b0d02 Mon Sep 17 00:00:00 2001
From: panfengfeng <panfengfeng@huawei.com>
Date: Tue, 4 Aug 2020 15:09:09 +0800
Subject: [PATCH] update resnet101 scripts

---
 mindspore/train/dataset_helper.py             |  2 +-
 model_zoo/official/cv/resnet/README.md        |  9 +-
 .../cv/resnet/scripts/run_eval_gpu.sh         | 84 +++++++++++++++++++
 .../scripts/run_standalone_train_gpu.sh       |  2 +-
 model_zoo/official/cv/resnet/train.py         | 19 +++--
 5 files changed, 104 insertions(+), 12 deletions(-)
 create mode 100755 model_zoo/official/cv/resnet/scripts/run_eval_gpu.sh

diff --git a/mindspore/train/dataset_helper.py b/mindspore/train/dataset_helper.py
index 00d0267011..ccb3b937b9 100644
--- a/mindspore/train/dataset_helper.py
+++ b/mindspore/train/dataset_helper.py
@@ -52,7 +52,7 @@ class DatasetHelper:
         sink_size (int): Control the amount of data each sink.
                              If sink_size=-1, sink the complete dataset each epoch.
                              If sink_size>0, sink sink_size data each epoch. Default: -1.
-        epoch_num (int): Control the number of epoch data to send.
+        epoch_num (int): Control the number of epoch data to send. Default: 1.
 
     Examples:
         >>> dataset_helper = DatasetHelper(dataset)
diff --git a/model_zoo/official/cv/resnet/README.md b/model_zoo/official/cv/resnet/README.md
index f976b5ffd1..a22df320e7 100644
--- a/model_zoo/official/cv/resnet/README.md
+++ b/model_zoo/official/cv/resnet/README.md
@@ -44,6 +44,9 @@ ImageNet2012
     ├── run_distribute_train.sh         # launch distributed training(8 pcs)
     ├── run_eval.sh                     # launch evaluation
     └── run_standalone_train.sh         # launch standalone training(1 pcs)
+    ├── run_distribute_train_gpu.sh     # launch gpu distributed training(8 pcs)
+    ├── run_eval_gpu.sh                 # launch gpu evaluation
+    └── run_standalone_train_gpu.sh     # launch gpu standalone training(1 pcs)
   ├── src
     ├── config.py                       # parameter configuration
     ├── dataset.py                      # data preprocessing
@@ -241,11 +244,11 @@ result: {'top_5_accuracy': 0.9429417413572343, 'top_1_accuracy': 0.7853513124199
 ### Running on GPU
 ```
 # distributed training example
-mpirun -n 8 python train.py --net=resnet50 --dataset=cifar10 --dataset_path=~/cifar-10-batches-bin --device_target="GPU" --run_distribute=True
+sh run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012]  [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 
 # standalone training example
-python train.py --net=resnet50 --dataset=cifar10 --dataset_path=~/cifar-10-batches-bin --device_target="GPU"
+sh run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 
 # infer example
-python eval.py --net=resnet50 --dataset=cifar10 --dataset_path=~/cifar10-10-verify-bin --device_target="GPU" --checkpoint_path=resnet-90_195.ckpt
+sh run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
 ```
diff --git a/model_zoo/official/cv/resnet/scripts/run_eval_gpu.sh b/model_zoo/official/cv/resnet/scripts/run_eval_gpu.sh
new file mode 100755
index 0000000000..fc93602f5a
--- /dev/null
+++ b/model_zoo/official/cv/resnet/scripts/run_eval_gpu.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# != 4 ]
+then 
+    echo "Usage: sh run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]"
+exit 1
+fi
+
+if [ $1 != "resnet50" ] && [ $1 != "resnet101" ]
+then 
+    echo "error: the selected net is neither resnet50 nor resnet101"
+exit 1
+fi
+
+if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
+then 
+    echo "error: the selected dataset is neither cifar10 nor imagenet2012"
+exit 1
+fi
+
+if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
+then
+    echo "error: evaluating resnet101 with cifar10 dataset is unsupported now!"
+exit 1
+fi
+
+
+get_real_path(){
+  if [ "${1:0:1}" == "/" ]; then
+    echo "$1"
+  else
+    echo "$(realpath -m $PWD/$1)"
+  fi
+}
+
+PATH1=$(get_real_path $3)
+PATH2=$(get_real_path $4)
+
+
+if [ ! -d $PATH1 ]
+then 
+    echo "error: DATASET_PATH=$PATH1 is not a directory"
+exit 1
+fi 
+
+if [ ! -f $PATH2 ]
+then 
+    echo "error: CHECKPOINT_PATH=$PATH2 is not a file"
+exit 1
+fi 
+
+ulimit -u unlimited
+export DEVICE_NUM=1
+export DEVICE_ID=0
+export RANK_SIZE=$DEVICE_NUM
+export RANK_ID=0
+
+if [ -d "eval" ];
+then
+    rm -rf ./eval
+fi
+mkdir ./eval
+cp ../*.py ./eval
+cp *.sh ./eval
+cp -r ../src ./eval
+cd ./eval || exit
+env > env.log
+echo "start evaluation for device $DEVICE_ID"
+python eval.py --net=$1 --dataset=$2 --dataset_path=$PATH1 --checkpoint_path=$PATH2 --device_target="GPU" &> log &
+cd ..
diff --git a/model_zoo/official/cv/resnet/scripts/run_standalone_train_gpu.sh b/model_zoo/official/cv/resnet/scripts/run_standalone_train_gpu.sh
index 0be444d738..076bd4b332 100755
--- a/model_zoo/official/cv/resnet/scripts/run_standalone_train_gpu.sh
+++ b/model_zoo/official/cv/resnet/scripts/run_standalone_train_gpu.sh
@@ -16,7 +16,7 @@
 
 if [ $# != 3 ] && [ $# != 4 ]
 then 
-    echo "Usage: sh run_standalone_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
+    echo "Usage: sh run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
 exit 1
 fi
 
diff --git a/model_zoo/official/cv/resnet/train.py b/model_zoo/official/cv/resnet/train.py
index 7f753a300f..414fa4c7de 100755
--- a/model_zoo/official/cv/resnet/train.py
+++ b/model_zoo/official/cv/resnet/train.py
@@ -157,13 +157,18 @@ if __name__ == '__main__':
         else:
             loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean", is_grad=False,
                                                  num_classes=config.class_num)
-        ## fp32 training
-        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, config.weight_decay)
-        model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})
-        # # Mixed precision
-        # loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
-        # opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, config.weight_decay, config.loss_scale)
-        # model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, amp_level="O2")
+
+        if args_opt.net == "resnet101":
+            opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, config.weight_decay,
+                           config.loss_scale)
+            loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
+            # Mixed precision
+            model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'},
+                          amp_level="O2", keep_batchnorm_fp32=True)
+        else:
+            ## fp32 training
+            opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, config.weight_decay)
+            model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})
 
     # define callbacks
     time_cb = TimeMonitor(data_size=step_size)