add training scripts and modify readme of mobilenetv2_quant and

resnet50_quant modify readme
5 years ago · aaa0436882
parent a50bc2182e
commit aaa0436882
5 changed files with 598 additions and 146 deletions
--- a/model_zoo/official/cv/mobilenetv2_quant/Readme.md
+++ b/model_zoo/official/cv/mobilenetv2_quant/Readme.md
@@ -70,7 +70,7 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil
 ├── mobileNetv2_quant
  ├── Readme.md     # descriptions about MobileNetV2-Quant
  ├── scripts
-  │   ├──run_train_quant.sh   # shell script for train on Ascend
+  │   ├──run_train.sh   # shell script for train on Ascend and GPU
  │   ├──run_infer_quant.sh    # shell script for evaluation on Ascend
  ├── src
  │   ├──config.py      # parameter configuration
@@ -91,19 +91,22 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil

 You can start training using python or shell scripts. The usage of shell scripts as follows:

- Ascend: sh run_train_quant.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]
+- bash run_train.sh [Ascend] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\(optional)
+- bash run_train.sh [GPU] [DEVICE_ID_LIST] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\(optional)
+

 ### Launch

-```
-# training example
-  shell:
-      Ascend: sh run_train_quant.sh Ascend 8 10.222.223.224 0,1,2,3,4,5,6,7 ~/imagenet/train/ mobilenet_199.ckpt
+``` bash
+  # training example
+  >>> bash run_train.sh Ascend ~/hccl_4p_0123_x.x.x.x.json ~/imagenet/train/ ~/mobilenet.ckpt
+  >>> bash run_train.sh GPU 1,2 ~/imagenet/train/ ~/mobilenet.ckpt
 ```

 ### Result

-Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log  will be redirected to `./train/train.log` like followings.
+Training results will be stored in the example path. Checkpoints trained on `Ascend` will be stored at `./train/device$i/checkpoint` by default, and the training log will be redirected to `./train/device$i/train.log`. Checkpoints trained on `GPU` will be stored in `./train/checkpoint/ckpt_$i` by default, and the training log will be redirected to `./train/train.log`.  
+`train.log` is as follows:

 ```
 epoch: [  0/200], step:[  624/  625], loss:[5.258/5.258], time:[140412.236], lr:[0.100]
--- a/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train.sh
+++ b/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train.sh
--- a/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train_quant.sh
+++ b/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train_quant.sh
@@ -1,96 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-run_ascend()
-{
-    if [ $2 -lt 1 ] && [ $2 -gt 8 ]
-    then
-        echo "error: DEVICE_NUM=$2 is not in (1-9)"
-    exit 1
-    fi
-
-    if [ ! -d $5 ] && [ ! -f $5 ]
-    then
-        echo "error: DATASET_PATH=$5 is not a directory or file"
-    exit 1
-    fi
-
-    BASEPATH=$(cd "`dirname $0`" || exit; pwd)
-    export PYTHONPATH=${BASEPATH}:$PYTHONPATH
-    if [ -d "../train" ];
-    then
-        rm -rf ../train
-    fi
-    mkdir ../train
-    cd ../train || exit
-    python ${BASEPATH}/../src/launch.py \
-            --nproc_per_node=$2 \
-            --visible_devices=$4 \
-            --server_id=$3 \
-            --training_script=${BASEPATH}/../train.py \
-            --dataset_path=$5 \
-            --pre_trained=$6 \
-            --device_target=$1 &> train.log &  # dataset train folder
-}
-
-run_gpu()
-{
-    if [ $2 -lt 1 ] && [ $2 -gt 8 ]
-    then
-        echo "error: DEVICE_NUM=$2 is not in (1-8)"
-    exit 1
-    fi
-
-    if [ ! -d $4 ]
-    then
-        echo "error: DATASET_PATH=$4 is not a directory"
-    exit 1
-    fi
-
-    BASEPATH=$(cd "`dirname $0`" || exit; pwd)
-    export PYTHONPATH=${BASEPATH}:$PYTHONPATH
-    if [ -d "../train" ];
-    then
-        rm -rf ../train
-    fi
-    mkdir ../train
-    cd ../train || exit
-
-    export CUDA_VISIBLE_DEVICES="$3"
-    mpirun -n $2 --allow-run-as-root \
-    python ${BASEPATH}/../train.py \
-        --dataset_path=$4 \
-        --device_target=$1 \
-        --pre_trained=$5  &> ../train.log &  # dataset train folder
-}
-
-if [ $# -gt 6 ] || [ $# -lt 5 ]
-then
-    echo "Usage:\n \
-          Ascend: sh run_train_quant.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]\n \
-          GPU: sh run_train_quant.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]\n \
-          "
-exit 1
-fi
-
-if [ $1 = "Ascend" ] ; then
-    run_ascend "$@"
-elif [ $1 = "GPU" ] ; then
-    run_gpu "$@"
-else
-    echo "Unsupported device target."
-fi;
-
--- a/model_zoo/official/cv/resnet50_quant/Readme.md
+++ b/model_zoo/official/cv/resnet50_quant/Readme.md
@ -1,4 +1,43 @@
 # Contents
+# ResNet-50_quant Example
+
+## Description
+
+This is an example of training ResNet-50_quant with ImageNet2012 dataset in MindSpore.
+
+## Requirements
+
+- Install [MindSpore](https://www.mindspore.cn/install/en).
+
+- Download the dataset ImageNet2012 
+
+> Unzip the ImageNet2012 dataset to any path you like; the resulting folder structure should contain the training and evaluation datasets as follows:
+> ```
+> .  
+> ├── ilsvrc                  # train dataset
+> └── ilsvrc_eval             # infer dataset: images must first be sorted into 1000 class directories, just like the training images
+> ```
+
+
+## Example structure
+
+```shell
+resnet50_quant/
+  ├── eval.py
+  ├── models
+  │   └── resnet_quant.py
+  ├── Readme.md
+  ├── scripts
+  │   ├── run_infer.sh
+  │   └── run_train.sh
+  ├── src
+  │   ├── config.py
+  │   ├── crossentropy.py
+  │   ├── dataset.py
+  │   ├── launch.py
+  │   └── lr_generator.py
+  └── train.py
+```

 - [resnet50 Description](#resnet50-description)
 - [Model Architecture](#model-architecture)
@@ -88,21 +127,17 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil

 ### Usage

-
-You can start training using python or shell scripts. The usage of shell scripts as follows:
-
- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH][CKPT_PATH]
+- Ascend: bash run_train.sh Ascend [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\(optional)
 ### Launch

 ```
-# training example
-  shell:
-      Ascend: sh run_train.sh Ascend 8 10.222.223.224 0,1,2,3,4,5,6,7 ~/resnet/train/ Resnet50-90_5004.ckpt
+  # training example
+  Ascend: bash run_train.sh Ascend ~/hccl_4p_0123_x.x.x.x.json ~/imagenet/train/ 
 ```

 ### Result

-Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log  will be redirected to `./train/train.log` like followings.
+Training results will be stored in the example path. Checkpoints will be stored at `./train/device$i/` by default, and the training log will be redirected to `./train/device$i/train.log`, as follows.

 ```
 epoch: 1 step: 5004, loss is 4.8995576
--- a/model_zoo/official/cv/resnet50_quant/scripts/run_train.sh
+++ b/model_zoo/official/cv/resnet50_quant/scripts/run_train.sh