optimize the vgg script

5 years ago · abbd7b50db
parent 12a150bb5d
commit abbd7b50db
7 changed files with 237 additions and 54 deletions
--- a/model_zoo/official/cv/vgg16/README.md
+++ b/model_zoo/official/cv/vgg16/README.md
@ -8,7 +8,9 @@ This example is for VGG16 model training and evaluation.

 - Install [MindSpore](https://www.mindspore.cn/install/en).

- Download the CIFAR-10 binary version dataset.
+- Download the dataset CIFAR-10 or ImageNet2012.
+
+CIFAR-10

 > Unzip the CIFAR-10 dataset to any path you want and the folder structure should be as follows:
 > ```
@ -17,16 +19,89 @@ This example is for VGG16 model training and evaluation.
 > └── cifar-10-verify-bin   # infer dataset
 > ```

+ImageNet2012
+
+> Unzip the ImageNet2012 dataset to any path you want and the folder should include train and eval dataset as follows:
+>
+> ```
+> .
+> └─dataset
+>   ├─ilsvrc                # train dataset
+>   └─validation_preprocess # evaluate dataset
+> ```
+
+## Parameter configuration
+
+Parameters for both training and evaluation can be set in config.py.
+
+- config for vgg16, CIFAR-10 dataset
+
+```
+"num_classes": 10,                   # dataset class num
+"lr": 0.01,                          # learning rate
+"lr_init": 0.01,                     # initial learning rate
+"lr_max": 0.1,                       # max learning rate
+"lr_epochs": '30,60,90,120',         # lr changing based epochs
+"lr_scheduler": "step",              # learning rate mode
+"warmup_epochs": 5,                  # number of warmup epoch
+"batch_size": 64,                    # batch size of input tensor
+"max_epoch": 70,                     # only valid for training, which is always 1 for inference
+"momentum": 0.9,                     # momentum
+"weight_decay": 5e-4,                # weight decay
+"loss_scale": 1.0,                   # loss scale
+"label_smooth": 0,                   # label smooth
+"label_smooth_factor": 0,            # label smooth factor
+"buffer_size": 10,                   # shuffle buffer size
+"image_size": '224,224',             # image size
+"pad_mode": 'same',                  # pad mode for conv2d
+"padding": 0,                        # padding value for conv2d
+"has_bias": False,                   # whether has bias in conv2d
+"batch_norm": True,                  # whether has batch_norm in conv2d
+"keep_checkpoint_max": 10,           # only keep the last keep_checkpoint_max checkpoint
+"initialize_mode": "XavierUniform",  # conv2d init mode
+"has_dropout": False                 # whether using Dropout layer
+```
+
+- config for vgg16, ImageNet2012 dataset
+
+```
+"num_classes": 1000,                 # dataset class num
+"lr": 0.01,                          # learning rate
+"lr_init": 0.01,                     # initial learning rate
+"lr_max": 0.1,                       # max learning rate
+"lr_epochs": '30,60,90,120',         # lr changing based epochs
+"lr_scheduler": "cosine_annealing",  # learning rate mode
+"warmup_epochs": 0,                  # number of warmup epoch
+"batch_size": 32,                    # batch size of input tensor
+"max_epoch": 150,                    # only valid for training, which is always 1 for inference
+"momentum": 0.9,                     # momentum
+"weight_decay": 1e-4,                # weight decay
+"loss_scale": 1024,                  # loss scale
+"label_smooth": 1,                   # label smooth
+"label_smooth_factor": 0.1,          # label smooth factor
+"buffer_size": 10,                   # shuffle buffer size
+"image_size": '224,224',             # image size
+"pad_mode": 'pad',                   # pad mode for conv2d
+"padding": 1,                        # padding value for conv2d
+"has_bias": True,                    # whether has bias in conv2d
+"batch_norm": False,                 # whether has batch_norm in conv2d
+"keep_checkpoint_max": 10,           # only keep the last keep_checkpoint_max checkpoint
+"initialize_mode": "KaimingNormal",  # conv2d init mode
+"has_dropout": True                  # whether using Dropout layer
+```
+
 ## Running the Example

 ### Training
+**Run vgg16, using CIFAR-10 dataset**

+- Training using single device(1p)
 ```
 python train.py --data_path=your_data_path --device_id=6 > out.train.log 2>&1 & 
 ```
 The python command above will run in the background, you can view the results through the file `out.train.log`.

-After training, you'll get some checkpoint files under the script folder by default.
+After training, you'll get some checkpoint files in the specified ckpt_path (by default, the ./output directory).

 You will get the loss value as following:
 ```
@ -36,22 +111,7 @@ epcoh: 2 step: 781, loss is 1.827582
 ...
 ```

-### Evaluation
-
- Do eval as follows, need to specify dataset type as "cifar10" or "imagenet2012"
-```
-python eval.py --data_path=your_data_path --dataset="cifar10" --pre_trained=./train_vgg_cifar10-70-781.ckpt > out.eval.log 2>&1 & 
-```
- If the using dataset is
-The above python command will run in the background, you can view the results through the file `out.eval.log`.
-
-You will get the accuracy as following:
-```
-# grep "result: " out.eval.log
-result: {'acc': 0.92}
-```
-
-### Distribute Training
+- Distribute Training
 ```
 sh run_distribute_train.sh rank_table.json your_data_path
 ```
@ -70,18 +130,63 @@ train_parallel1/log:epcoh: 2 step: 97, loss is 1.7133579
 ```
 > About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).

+
+**Run vgg16, using imagenet2012 dataset**
+
+- Training using single device(1p)
+```
+python train.py  --device_target="GPU" --dataset="imagenet2012" --is_distributed=0 --data_path=$DATA_PATH  > output.train.log 2>&1 &
+```
+
+- Distribute Training
+```
+# distributed training(8p)
+bash scripts/run_distribute_train_gpu.sh /path/ImageNet2012/train
+```
+
+
+### Evaluation
+
+- Do eval as follows, need to specify dataset type as "cifar10" or "imagenet2012"
+```
+# when using cifar10 dataset
+python eval.py --data_path=your_data_path --dataset="cifar10" --device_target="Ascend" --pre_trained=./*-70-781.ckpt > out.eval.log 2>&1 & 
+
+# when using imagenet2012 dataset
+python eval.py --data_path=your_data_path --dataset="imagenet2012" --device_target="GPU" --pre_trained=./*-150-5004.ckpt > out.eval.log 2>&1 & 
+```
+
+The above python commands will run in the background, you can view the results through the file `out.eval.log`.
+
+You will get the accuracy as following:
+```
+# when using cifar10 dataset
+# grep "result: " out.eval.log
+result: {'acc': 0.92}
+
+# when using the imagenet2012 dataset
+after allreduce eval: top1_correct=36636, tot=50000, acc=73.27%
+after allreduce eval: top5_correct=45582, tot=50000, acc=91.16%
+```
+
 ## Usage:

 ### Training
 ```
 usage: train.py [--device_target TARGET][--data_path DATA_PATH]
+                [--dataset  DATASET_TYPE][--is_distributed VALUE]
                [--device_id DEVICE_ID][--pre_trained PRE_TRAINED]
+                [--ckpt_path CHECKPOINT_PATH][--ckpt_interval INTERVAL_STEP]

 parameters/options:
-  --device_target       the training backend type, default is Ascend.
+  --device_target       the training backend type, Ascend or GPU, default is Ascend.
+  --dataset             the dataset type, cifar10 or imagenet2012.
+  --is_distributed      whether to run distributed training, value can be 0 or 1.
  --data_path           the storage path of dataset
  --device_id           the device which used to train model.
  --pre_trained         the pretrained checkpoint file path.
+  --ckpt_path           the path to save checkpoint.
+  --ckpt_interval       the epoch interval for saving checkpoint.

 ```

@ -89,16 +194,19 @@ parameters/options:

 ```
 usage: eval.py [--device_target TARGET][--data_path DATA_PATH]
-                [--device_id DEVICE_ID][--checkpoint_path CKPT_PATH]
+               [--dataset  DATASET_TYPE][--pre_trained PRE_TRAINED]
+               [--device_id DEVICE_ID]

 parameters/options:
-  --device_target       the evaluation backend type, default is Ascend.
-  --data_path           the storage path of datasetd 
+  --device_target       the evaluation backend type, Ascend or GPU, default is Ascend.
+  --dataset             the dataset type, cifar10 or imagenet2012.
+  --data_path           the storage path of dataset.
  --device_id           the device which used to evaluate model.
-  --pre_trained     the checkpoint file path used to evaluate model.
+  --pre_trained         the checkpoint file path used to evaluate model.
 ```

 ### Distribute Training
+- Train on Ascend.

 ```
 Usage: sh script/run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH]
@ -107,3 +215,11 @@ parameters/options:
  MINDSPORE_HCCL_CONFIG_PATH   HCCL configuration file path.
  DATA_PATH                    the storage path of dataset.
 ```
+
+- Train on GPU.
+```
+Usage: bash run_distribute_train_gpu.sh [DATA_PATH]
+
+parameters/options:
+  DATA_PATH                    the storage path of dataset.
+```
--- a/model_zoo/official/cv/vgg16/eval.py
+++ b/model_zoo/official/cv/vgg16/eval.py
@ -86,6 +86,8 @@ def parse_args(cloud_args=None):
    args_opt.padding = cfg.padding
    args_opt.has_bias = cfg.has_bias
    args_opt.batch_norm = cfg.batch_norm
+    args_opt.initialize_mode = cfg.initialize_mode
+    args_opt.has_dropout = cfg.has_dropout

    args_opt.image_size = list(map(int, args_opt.image_size.split(',')))

--- a/model_zoo/official/cv/vgg16/scripts/run_distribute_train_gpu.sh
+++ b/model_zoo/official/cv/vgg16/scripts/run_distribute_train_gpu.sh
@ -0,0 +1,29 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+echo "=============================================================================================================="
+echo "Please run the scipt as: "
+echo "bash run_distribute_train_gpu.sh DATA_PATH"
+echo "for example: bash run_distribute_train_gpu.sh /path/ImageNet2012/train"
+echo "=============================================================================================================="
+
+DATA_PATH=$1
+
+mpirun -n 8 python train.py  \
+    --device_target="GPU" \
+    --dataset="imagenet2012" \
+    --is_distributed=1 \
+    --data_path=$DATA_PATH  > output.train.log 2>&1 &
--- a/model_zoo/official/cv/vgg16/src/config.py
+++ b/model_zoo/official/cv/vgg16/src/config.py
@ -19,50 +19,54 @@ from easydict import EasyDict as edict

 # config for vgg16, cifar10
 cifar_cfg = edict({
-    'num_classes': 10,
+    "num_classes": 10,
    "lr": 0.01,
-    'lr_init': 0.01,
-    'lr_max': 0.1,
+    "lr_init": 0.01,
+    "lr_max": 0.1,
    "lr_epochs": '30,60,90,120',
    "lr_scheduler": "step",
-    'warmup_epochs': 5,
-    'batch_size': 64,
-    'max_epoch': 70,
-    'momentum': 0.9,
-    'weight_decay': 5e-4,
+    "warmup_epochs": 5,
+    "batch_size": 64,
+    "max_epoch": 70,
+    "momentum": 0.9,
+    "weight_decay": 5e-4,
    "loss_scale": 1.0,
    "label_smooth": 0,
    "label_smooth_factor": 0,
-    'buffer_size': 10,
+    "buffer_size": 10,
    "image_size": '224,224',
-    'pad_mode': 'same',
-    'padding': 0,
-    'has_bias': False,
+    "pad_mode": 'same',
+    "padding": 0,
+    "has_bias": False,
    "batch_norm": True,
-    'keep_checkpoint_max': 10
+    "keep_checkpoint_max": 10,
+    "initialize_mode": "XavierUniform",
+    "has_dropout": False
 })

 # config for vgg16, imagenet2012
 imagenet_cfg = edict({
-    'num_classes': 1000,
+    "num_classes": 1000,
    "lr": 0.01,
-    'lr_init': 0.01,
-    'lr_max': 0.1,
+    "lr_init": 0.01,
+    "lr_max": 0.1,
    "lr_epochs": '30,60,90,120',
    "lr_scheduler": 'cosine_annealing',
-    'warmup_epochs': 0,
-    'batch_size': 32,
-    'max_epoch': 150,
-    'momentum': 0.9,
-    'weight_decay': 1e-4,
+    "warmup_epochs": 0,
+    "batch_size": 32,
+    "max_epoch": 150,
+    "momentum": 0.9,
+    "weight_decay": 1e-4,
    "loss_scale": 1024,
    "label_smooth": 1,
    "label_smooth_factor": 0.1,
-    'buffer_size': 10,
+    "buffer_size": 10,
    "image_size": '224,224',
-    'pad_mode': 'pad',
-    'padding': 1,
-    'has_bias': True,
+    "pad_mode": 'pad',
+    "padding": 1,
+    "has_bias": True,
    "batch_norm": False,
-    'keep_checkpoint_max': 10
+    "keep_checkpoint_max": 10,
+    "initialize_mode": "KaimingNormal",
+    "has_dropout": True
 })
--- a/model_zoo/official/cv/vgg16/src/vgg.py
+++ b/model_zoo/official/cv/vgg16/src/vgg.py
@ -33,7 +33,7 @@ def _make_layer(base, args, batch_norm):
        else:
            weight_shape = (v, in_channels, 3, 3)
            weight = initializer('XavierUniform', shape=weight_shape, dtype=mstype.float32).to_tensor()
-            if args.dataset == "imagenet2012":
+            if args.initialize_mode == "KaimingNormal":
                weight = 'normal'
            conv2d = nn.Conv2d(in_channels=in_channels,
                               out_channels=v,
@ -74,7 +74,7 @@ class Vgg(nn.Cell):
        self.layers = _make_layer(base, args, batch_norm=batch_norm)
        self.flatten = nn.Flatten()
        dropout_ratio = 0.5
-        if args.dataset == "cifar10" or phase == "test":
+        if not args.has_dropout or phase == "test":
            dropout_ratio = 1.0
        self.classifier = nn.SequentialCell([
            nn.Dense(512 * 7 * 7, 4096),
@ -84,7 +84,7 @@ class Vgg(nn.Cell):
            nn.ReLU(),
            nn.Dropout(dropout_ratio),
            nn.Dense(4096, num_classes)])
-        if args.dataset == "imagenet2012":
+        if args.initialize_mode == "KaimingNormal":
            default_recurisive_init(self)
            self.custom_init_weight()

@ -128,14 +128,14 @@ def vgg16(num_classes=1000, args=None, phase="train"):

    Args:
        num_classes (int): Class numbers. Default: 1000.
-        args(dict): param for net init.
+        args(namespace): param for net init.
        phase(str): train or test mode.

    Returns:
        Cell, cell instance of Vgg16 neural network with batch normalization.

    Examples:
-        >>> vgg16(num_classes=1000)
+        >>> vgg16(num_classes=1000, args=args)
    """

    net = Vgg(cfg['16'], num_classes=num_classes, args=args, batch_norm=args.batch_norm, phase=phase)
--- a/model_zoo/official/cv/vgg16/train.py
+++ b/model_zoo/official/cv/vgg16/train.py
@ -161,6 +161,8 @@ def parse_args(cloud_args=None):
    args_opt.padding = cfg.padding
    args_opt.has_bias = cfg.has_bias
    args_opt.batch_norm = cfg.batch_norm
+    args_opt.initialize_mode = cfg.initialize_mode
+    args_opt.has_dropout = cfg.has_dropout

    args_opt.lr_epochs = list(map(int, cfg.lr_epochs.split(',')))
    args_opt.image_size = list(map(int, cfg.image_size.split(',')))
--- a/tests/ut/python/model/test_vgg.py
+++ b/tests/ut/python/model/test_vgg.py
@ -0,0 +1,30 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""test_vgg"""
+import numpy as np
+import pytest
+
+from mindspore import Tensor
+from model_zoo.official.cv.vgg16.src.vgg import vgg16
+from model_zoo.official.cv.vgg16.src.config import cifar_cfg as cfg
+from ..ut_filter import non_graph_engine
+
+
+@non_graph_engine
+def test_vgg16():
+    inputs = Tensor(np.random.rand(1, 3, 112, 112).astype(np.float32))
+    net = vgg16(args=cfg)
+    with pytest.raises(ValueError):
+        print(net.construct(inputs))