From cd767f0f292fcd2df53d963c97dc1b66c8a23578 Mon Sep 17 00:00:00 2001 From: Payne Date: Tue, 15 Sep 2020 10:02:08 +0800 Subject: [PATCH] modify readme and some confusion code, fix some bug --- model_zoo/official/cv/mobilenetv2/README.md | 118 ++++++++---------- model_zoo/official/cv/mobilenetv2/eval.py | 2 +- .../scripts/{run_infer.sh => run_eval.sh} | 29 ++--- .../cv/mobilenetv2/scripts/run_train.sh | 9 +- model_zoo/official/cv/mobilenetv2/src/args.py | 19 ++- .../official/cv/mobilenetv2/src/dataset.py | 18 +-- .../official/cv/mobilenetv2/src/launch.py | 12 +- .../official/cv/mobilenetv2/src/utils.py | 10 +- model_zoo/official/cv/mobilenetv2/train.py | 8 +- 9 files changed, 97 insertions(+), 128 deletions(-) rename model_zoo/official/cv/mobilenetv2/scripts/{run_infer.sh => run_eval.sh} (80%) diff --git a/model_zoo/official/cv/mobilenetv2/README.md b/model_zoo/official/cv/mobilenetv2/README.md index f36bed45a3..eef7e7296a 100644 --- a/model_zoo/official/cv/mobilenetv2/README.md +++ b/model_zoo/official/cv/mobilenetv2/README.md @@ -4,17 +4,16 @@ - [Model Architecture](#model-architecture) - [Dataset](#dataset) - [Features](#features) - - [Mixed Precision](#mixed-precision) + - [Mixed Precision](#mixed-precision(ascend)) - [Environment Requirements](#environment-requirements) - [Script Description](#script-description) - [Script and Sample Code](#script-and-sample-code) - [Training Process](#training-process) - - [Evaluation Process](#evaluation-process) - - [Evaluation](#evaluation) + - [Evaluation Process](#eval-process) - [Model Description](#model-description) - [Performance](#performance) - - [Training Performance](#evaluation-performance) - - [Inference Performance](#evaluation-performance) + - [Training Performance](#training-performance) + - [Evaluation Performance](#evaluation-performance) - [Description of Random Situation](#description-of-random-situation) - [ModelZoo Homepage](#modelzoo-homepage) @@ -38,7 +37,7 @@ Dataset used: 
[imagenet](http://www.image-net.org/) - Train: 120G, 1.2W images - Test: 5G, 50000 images - Data format: RGB images. - - Note: Data will be processed in src/dataset.py + - Note: Data will be processed in src/dataset.py # [Features](#contents) @@ -92,84 +91,84 @@ You can start training using python or shell scripts. The usage of shell scripts ### Launch -``` +```shell # training example python: - Ascend: python train.py --dataset_path ~/imagenet/train/ --platform Ascend --train_method train - GPU: python train.py --dataset_path ~/imagenet/train/ --platform GPU --train_method train - CPU: python train.py --dataset_path ~/imagenet/train/ --platform CPU --train_method train + Ascend: python train.py --platform Ascend --dataset_path [TRAIN_DATASET_PATH] --train_method train + GPU: python train.py --platform GPU --dataset_path [TRAIN_DATASET_PATH] --train_method train + CPU: python train.py --platform CPU --dataset_path [TRAIN_DATASET_PATH] --train_method train shell: - Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json ~/imagenet/train/ train - GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 ~/imagenet/train/ train - CPU: sh run_train.sh CPU ~/imagenet/train/ train + Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json [TRAIN_DATASET_PATH] train + GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] train + CPU: sh run_train.sh CPU [TRAIN_DATASET_PATH] train # fine tune example python: - Ascend: python train.py --dataset_path ~/imagenet/train/ --platform Ascend --train_method fine_tune ./pretrain_checkpoint/mobilenetv2_199.ckpt - GPU: python train.py --dataset_path ~/imagenet/train/ --platform GPU --train_method fine_tune ./pretrain_checkpoint/mobilenetv2_199.ckpt - CPU: python train.py --dataset_path ~/imagenet/train/ --platform CPU --train_method fine_tune ./pretrain_checkpoint/mobilenetv2_199.ckpt + Ascend: python train.py --platform Ascend --dataset_path [TRAIN_DATASET_PATH] --train_method fine_tune 
./pretrain_checkpoint/mobilenetv2.ckpt + GPU: python train.py --platform GPU --dataset_path [TRAIN_DATASET_PATH] --train_method fine_tune ./pretrain_checkpoint/mobilenetv2.ckpt + CPU: python train.py --platform CPU --dataset_path [TRAIN_DATASET_PATH] --train_method fine_tune ./pretrain_checkpoint/mobilenetv2.ckpt shell: - Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json ~/imagenet/train/ fine_tune ./pretrain_checkpoint/mobilenetv2_199.ckpt - GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 ~/imagenet/train/ fine_tune ./pretrain_checkpoint/mobilenetv2_199.ckpt - CPU: sh run_train.sh CPU ~/imagenet/train/ fine_tune ./pretrain_checkpoint/mobilenetv2_199.ckpt + Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json [TRAIN_DATASET_PATH] fine_tune ./pretrain_checkpoint/mobilenetv2.ckpt + GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] fine_tune ./pretrain_checkpoint/mobilenetv2.ckpt + CPU: sh run_train.sh CPU [TRAIN_DATASET_PATH] fine_tune ./pretrain_checkpoint/mobilenetv2.ckpt # incremental learn example python: - Ascend: python train.py --dataset_path ~/imagenet/train/ --platform Ascend --train_method incremental_learn ./pretrain_checkpoint/mobilenetv2_199.ckpt - GPU: python train.py --dataset_path ~/imagenet/train/ --platform GPU --train_method incremental_learn ./pretrain_checkpoint/mobilenetv2_199.ckpt - CPU: python train.py --dataset_path ~/imagenet/train/ --platform CPU --train_method incremental_learn ./pretrain_checkpoint/mobilenetv2_199.ckpt + Ascend: python train.py --platform Ascend --dataset_path [TRAIN_DATASET_PATH] --train_method incremental_learn ./pretrain_checkpoint/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt + GPU: python train.py --platform GPU --dataset_path [TRAIN_DATASET_PATH] --train_method incremental_learn ./pretrain_checkpoint/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt + CPU: python train.py --platform CPU --dataset_path [TRAIN_DATASET_PATH] --train_method incremental_learn
./pretrain_checkpoint/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt shell: - Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json ~/imagenet/train/ incremental_learn ./pretrain_checkpoint/mobilenetv2_199.ckpt - GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 ~/imagenet/train/ incremental_learn ./pretrain_checkpoint/mobilenetv2_199.ckpt - CPU: sh run_train.sh CPU ~/imagenet/train/ incremental_learn ./pretrain_checkpoint/mobilenetv2_199.ckpt + Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json [TRAIN_DATASET_PATH] incremental_learn ./pretrain_checkpoint/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt + GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] incremental_learn ./pretrain_checkpoint/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt + CPU: sh run_train.sh CPU [TRAIN_DATASET_PATH] incremental_learn ./pretrain_checkpoint/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt ``` ### Result -Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log will be redirected to `./train/train.log` like followings. +Training results will be stored in the example path. Checkpoints will be stored at `./checkpoint` by default. The training log will be redirected to `./train.log` on the CPU and GPU platforms, and will be written to `./train/rank*/log*.log` on the Ascend platform; it looks like the following. -``` +```shell epoch: [ 0/200], step:[ 624/ 625], loss:[5.258/5.258], time:[140412.236], lr:[0.100] epoch time: 140522.500, per step time: 224.836, avg loss: 5.258 epoch: [ 1/200], step:[ 624/ 625], loss:[3.917/3.917], time:[138221.250], lr:[0.200] epoch time: 138331.250, per step time: 221.330, avg loss: 3.917 ``` -## [Eval process](#contents) +## [Evaluation process](#contents) ### Usage -You can start training using python or shell scripts.
The usage of shell scripts as follows: +You can start evaluation using python or shell scripts. If the train method is train or fine tune, do not input the `[CHECKPOINT_PATH]`. The usage of shell scripts is as follows: -- Ascend: sh run_infer.sh Ascend [DATASET_PATH] [CHECKPOINT_PATH] [HEAD_CKPT_PATH] -- GPU: sh run_infer.sh GPU [DATASET_PATH] [CHECKPOINT_PATH] [HEAD_CKPT_PATH] -- CPU: sh run_infer.sh CPU [DATASET_PATH] [BACKBONE_CKPT_PATH] [HEAD_CKPT_PATH] +- Ascend: sh run_eval.sh Ascend [DATASET_PATH] [CHECKPOINT_PATH] [HEAD_CKPT_PATH] +- GPU: sh run_eval.sh GPU [DATASET_PATH] [CHECKPOINT_PATH] [HEAD_CKPT_PATH] +- CPU: sh run_eval.sh CPU [DATASET_PATH] [BACKBONE_CKPT_PATH] [HEAD_CKPT_PATH] ### Launch -``` -# infer example +```shell +# eval example python: - Ascend: python eval.py --dataset_path ~/imagenet/val/ --pretrain_ckpt ~/train/mobilenet-200_625.ckpt --platform Ascend --head_ckpt ./checkpoint/mobilenetv2_199.ckpt - GPU: python eval.py --dataset_path ~/imagenet/val/ --pretrain_ckpt ~/train/mobilenet-200_625.ckpt --platform GPU --head_ckpt ./checkpoint/mobilenetv2_199.ckpt - CPU: python eval.py --dataset_path ~/imagenet/val/ --pretrain_ckpt ~/train/mobilenet-200_625.ckpt --platform CPU --head_ckpt ./checkpoint/mobilenetv2_199.ckpt + Ascend: python eval.py --platform Ascend --dataset_path [VAL_DATASET_PATH] --pretrain_ckpt ./pretrain_ckpt/mobilenetv2.ckpt --head_ckpt ./checkpoint/mobilenetv2_head_15.ckpt + GPU: python eval.py --platform GPU --dataset_path [VAL_DATASET_PATH] --pretrain_ckpt ./pretrain_ckpt/mobilenetv2.ckpt --head_ckpt ./checkpoint/mobilenetv2_head_15.ckpt + CPU: python eval.py --platform CPU --dataset_path [VAL_DATASET_PATH] --pretrain_ckpt ./pretrain_ckpt/mobilenetv2.ckpt --head_ckpt ./checkpoint/mobilenetv2_head_15.ckpt shell: - Ascend: sh run_infer.sh Ascend ~/imagenet/val/ ~/train/mobilenet-200_625.ckpt ./checkpoint/mobilenetv2_199.ckpt - GPU: sh run_infer.sh GPU ~/imagenet/val/ ~/train/mobilenet-200_625.ckpt ./checkpoint/mobilenetv2_199.ckpt
- CPU: sh run_infer.sh GPU ~/imagenet/val/ ~/train/mobilenet-200_625.ckpt ./checkpoint/mobilenetv2_199.ckpt + Ascend: sh run_eval.sh Ascend [VAL_DATASET_PATH] ./pretrain_ckpt/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt + GPU: sh run_eval.sh GPU [VAL_DATASET_PATH] ./pretrain_ckpt/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt + CPU: sh run_eval.sh CPU [VAL_DATASET_PATH] ./pretrain_ckpt/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt ``` > checkpoint can be produced in training process. ### Result -Inference result will be stored in the example path, you can find result like the followings in `val.log`. +Inference result will be stored in the example path, you can find result like the followings in `eval.log`. -``` +```shell result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625.ckpt ``` @@ -181,7 +180,7 @@ result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625. | Parameters | MobilenetV2 | | | -------------------------- | ---------------------------------------------------------- | ------------------------- | -| Model Version | | large | +| Model Version | V1 | V1 | | Resource | Ascend 910, cpu:2.60GHz 56cores, memory:314G | NV SMX2 V100-32G | | uploaded Date | 05/06/2020 | 05/06/2020 | | MindSpore Version | 0.3.0 | 0.3.0 | @@ -189,33 +188,18 @@ result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625. 
| Training Parameters | src/config.py | src/config.py | | Optimizer | Momentum | Momentum | | Loss Function | SoftmaxCrossEntropy | SoftmaxCrossEntropy | -| outputs | | | -| Loss | | 1.913 | -| Accuracy | | ACC1[77.09%] ACC5[92.57%] | -| Total time | | | -| Params (M) | | | -| Checkpoint for Fine tuning | | | -| Model for inference | | | - -#### Inference Performance - -| Parameters | | | | -| -------------------------- | ----------------------------- | ------------------------- | -------------------- | -| Model Version | V1 | | | -| Resource | Ascend 910 | NV SMX2 V100-32G | Ascend 310 | -| uploaded Date | 05/06/2020 | 05/22/2020 | | -| MindSpore Version | 0.2.0 | 0.2.0 | 0.2.0 | -| Dataset | ImageNet, 1.2W | ImageNet, 1.2W | ImageNet, 1.2W | -| batch_size | | 130(8P) | | -| outputs | | | | -| Accuracy | | ACC1[72.07%] ACC5[90.90%] | | -| Speed | | | | -| Total time | | | | -| Model for inference | | | | +| outputs | probability | probability | +| Loss | 1.908 | 1.913 | +| Accuracy | ACC1[71.78%] | ACC1[71.08%] | +| Total time | 753 min | 845 min | +| Params (M) | 3.3 M | 3.3 M | +| Checkpoint for Fine tuning | 27.3 M | 27.3 M | +| Scripts | [Link](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/mobilenetv2)| # [Description of Random Situation](#contents) -In dataset.py, we set the seed inside “create_dataset" function. We also use random seed in train.py. + +In train.py, we set the seed which is used by numpy.random, mindspore.common.Initializer, mindspore.ops.composite.random_ops and mindspore.nn.probability.distribution. 
# [ModelZoo Homepage](#contents) diff --git a/model_zoo/official/cv/mobilenetv2/eval.py b/model_zoo/official/cv/mobilenetv2/eval.py index c50947d1ab..4fce332d10 100644 --- a/model_zoo/official/cv/mobilenetv2/eval.py +++ b/model_zoo/official/cv/mobilenetv2/eval.py @@ -51,7 +51,7 @@ if __name__ == '__main__': loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') model = Model(net, loss_fn=loss, metrics={'acc'}) - res = model.eval(dataset, dataset_sink_mode=False) + res = model.eval(dataset) print(f"result:{res}\npretrain_ckpt={args_opt.pretrain_ckpt}") if args_opt.head_ckpt: print(f"head_ckpt={args_opt.head_ckpt}") diff --git a/model_zoo/official/cv/mobilenetv2/scripts/run_infer.sh b/model_zoo/official/cv/mobilenetv2/scripts/run_eval.sh similarity index 80% rename from model_zoo/official/cv/mobilenetv2/scripts/run_infer.sh rename to model_zoo/official/cv/mobilenetv2/scripts/run_eval.sh index 5f175578f4..efa3f2a378 100644 --- a/model_zoo/official/cv/mobilenetv2/scripts/run_infer.sh +++ b/model_zoo/official/cv/mobilenetv2/scripts/run_eval.sh @@ -18,10 +18,10 @@ run_ascend() { - # check checkpoint file + # check pretrain_ckpt file if [ ! -f $3 ] then - echo "error: CHECKPOINT_PATH=$3 is not a file" + echo "error: PRETRAIN_CKPT=$3 is not a file" exit 1 fi @@ -44,15 +44,15 @@ run_ascend() --dataset_path=$2 \ --pretrain_ckpt=$3 \ --head_ckpt=$4 \ - &> ../infer.log & # dataset val folder path + &> ../eval.log & # dataset val folder path } run_gpu() { - # check checkpoint file + # check pretrain_ckpt file if [ ! -f $3 ] then - echo "error: CHECKPOINT_PATH=$3 is not a file" + echo "error: PRETRAIN_CKPT=$3 is not a file" exit 1 fi @@ -70,26 +70,18 @@ run_gpu() --dataset_path=$2 \ --pretrain_ckpt=$3 \ --head_ckpt=$4 \ - &> ../infer.log & # dataset train folder + &> ../eval.log & # dataset train folder } run_cpu() { - # check checkpoint file + # check pretrain_ckpt file if [ ! 
-f $3 ] then - echo "error: BACKBONE_CKPT=$3 is not a file" + echo "error: PRETRAIN_CKPT=$3 is not a file" exit 1 fi - # check checkpoint file - if [ ! -f $4 ] - then - echo "error: HEAD_CKPT=$4 is not a file" - exit 1 - fi - - BASEPATH=$(cd "`dirname $0`" || exit; pwd) export PYTHONPATH=${BASEPATH}:$PYTHONPATH if [ -d "../eval" ]; @@ -104,13 +96,14 @@ run_cpu() --dataset_path=$2 \ --pretrain_ckpt=$3 \ --head_ckpt=$4 \ - &> ../infer.log & # dataset train folder + &> ../eval.log & # dataset train folder } if [ $# -gt 4 ] || [ $# -lt 3 ] then - echo "Ascend: sh run_infer.sh [PLATFORM] [DATASET_PATH] [PRETRAIN_CKPT] \ + echo "Usage: + Ascend: sh run_infer.sh [PLATFORM] [DATASET_PATH] [PRETRAIN_CKPT] GPU: sh run_infer.sh [PLATFORM] [DATASET_PATH] [PRETRAIN_CKPT] CPU: sh run_infer.sh [PLATFORM] [DATASET_PATH] [BACKBONE_CKPT] [HEAD_CKPT]" exit 1 diff --git a/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh b/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh index 0680c3d6ca..7bccd26916 100644 --- a/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh +++ b/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh @@ -109,11 +109,10 @@ run_cpu() if [ $# -gt 7 ] || [ $# -lt 4 ] then - echo "Usage:\n \ - Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [TRAIN_METHOD] [CKPT_PATH] \n \ - GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [TRAIN_METHOD] [CKPT_PATH]\n \ - CPU: sh run_train.sh CPU [DATASET_PATH] [TRAIN_METHOD] [CKPT_PATH]\n \ - " + echo "Usage: + Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [TRAIN_METHOD] [CKPT_PATH] + GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [TRAIN_METHOD] [CKPT_PATH] + CPU: sh run_train.sh CPU [DATASET_PATH] [TRAIN_METHOD] [CKPT_PATH]" exit 1 fi diff --git a/model_zoo/official/cv/mobilenetv2/src/args.py 
b/model_zoo/official/cv/mobilenetv2/src/args.py index c92959a359..78de5f2e67 100644 --- a/model_zoo/official/cv/mobilenetv2/src/args.py +++ b/model_zoo/official/cv/mobilenetv2/src/args.py @@ -38,31 +38,28 @@ def launch_parse_args(): def train_parse_args(): train_parser = argparse.ArgumentParser(description='Image classification trian') - train_parser.add_argument('--dataset_path', type=str, required=True, help='Dataset path') train_parser.add_argument('--platform', type=str, default="Ascend", choices=("CPU", "GPU", "Ascend"), \ help='run platform, only support CPU, GPU and Ascend') - train_parser.add_argument('--pretrain_ckpt', type=str, default=None, help='Pretrained checkpoint path \ - for fine tune or incremental learning') - train_parser.add_argument('--run_distribute', type=ast.literal_eval, default=True, help='Run distribute') + train_parser.add_argument('--dataset_path', type=str, required=True, help='Dataset path') train_parser.add_argument('--train_method', type=str, choices=("train", "fine_tune", "incremental_learn"), \ help="\"fine_tune\"or \"incremental_learn\" if to fine tune the net after loading the ckpt, \"train\" to \ train from initialization model") - + train_parser.add_argument('--pretrain_ckpt', type=str, default=None, help='Pretrained checkpoint path \ + for fine tune or incremental learning') + train_parser.add_argument('--run_distribute', type=ast.literal_eval, default=True, help='Run distribute') train_args = train_parser.parse_args() return train_args def eval_parse_args(): eval_parser = argparse.ArgumentParser(description='Image classification eval') - eval_parser.add_argument('--dataset_path', type=str, required=True, help='Dataset path') eval_parser.add_argument('--platform', type=str, default="Ascend", choices=("Ascend", "GPU", "CPU"), \ help='run platform, only support GPU, CPU and Ascend') - eval_parser.add_argument('--pretrain_ckpt', type=str, default=None, help='Pretrained checkpoint path \ + 
eval_parser.add_argument('--dataset_path', type=str, required=True, help='Dataset path') + eval_parser.add_argument('--pretrain_ckpt', type=str, required=True, help='Pretrained checkpoint path \ for fine tune or incremental learning') eval_parser.add_argument('--head_ckpt', type=str, default=None, help='Pretrained checkpoint path \ - for fine tune or incremental learning') - eval_parser.add_argument('--run_distribute', type=ast.literal_eval, default=False, help='Run distribute') - + for incremental learning') + eval_parser.add_argument('--run_distribute', type=ast.literal_eval, default=False, help='If run distribute in GPU.') eval_args = eval_parser.parse_args() - return eval_args \ No newline at end of file diff --git a/model_zoo/official/cv/mobilenetv2/src/dataset.py b/model_zoo/official/cv/mobilenetv2/src/dataset.py index db0f8c65ba..2473095600 100644 --- a/model_zoo/official/cv/mobilenetv2/src/dataset.py +++ b/model_zoo/official/cv/mobilenetv2/src/dataset.py @@ -16,7 +16,6 @@ create train or eval dataset. """ import os -from tqdm import tqdm import numpy as np from mindspore import Tensor @@ -97,7 +96,11 @@ def create_dataset(dataset_path, do_train, config, repeat_num=1): # apply dataset repeat operation ds = ds.repeat(repeat_num) - return ds + step_size = ds.get_dataset_size() + if step_size == 0: + raise ValueError("The step_size of dataset is zero. 
Check if the images of train dataset is more than batch_\ + size in config.py") + return ds, step_size def extract_features(net, dataset_path, config): @@ -109,19 +112,16 @@ def extract_features(net, dataset_path, config): config=config, repeat_num=1) step_size = dataset.get_dataset_size() - pbar = tqdm(list(dataset.create_dict_iterator(output_numpy=True))) model = Model(net) - i = 0 - for data in pbar: + + for i, data in enumerate(dataset.create_dict_iterator(output_numpy=True)): features_path = os.path.join(features_folder, f"feature_{i}.npy") label_path = os.path.join(features_folder, f"label_{i}.npy") - if not (os.path.exists(features_path) and os.path.exists(label_path)): + if not os.path.exists(features_path or not os.path.exists(label_path)): image = data["image"] label = data["label"] features = model.predict(Tensor(image)) np.save(features_path, features.asnumpy()) np.save(label_path, label) - pbar.set_description("Process dataset batch: %d" % (i + 1)) - i += 1 - + print(f"Complete the batch {i}/{step_size}") return step_size diff --git a/model_zoo/official/cv/mobilenetv2/src/launch.py b/model_zoo/official/cv/mobilenetv2/src/launch.py index 8785186dcf..793dfe1d9e 100644 --- a/model_zoo/official/cv/mobilenetv2/src/launch.py +++ b/model_zoo/official/cv/mobilenetv2/src/launch.py @@ -38,17 +38,17 @@ def main(): for rank_id in range(0, args.nproc_per_node): os.chdir(cur_path) device_id = visible_devices[rank_id] - device_dir = os.path.join(cur_path, 'device{}'.format(rank_id)) + rank_dir = os.path.join(cur_path, 'rank{}'.format(rank_id)) env['RANK_ID'] = str(rank_id) env['DEVICE_ID'] = str(device_id) - if os.path.exists(device_dir): - shutil.rmtree(device_dir) - os.mkdir(device_dir) - os.chdir(device_dir) + if os.path.exists(rank_dir): + shutil.rmtree(rank_dir) + os.mkdir(rank_dir) + os.chdir(rank_dir) cmd = [sys.executable, '-u'] cmd.append(args.training_script) cmd.extend(args.training_script_args) - log_file = open('{dir}/log{id}.log'.format(dir=device_dir, 
id=rank_id), 'w') + log_file = open(f'{rank_dir}/log{rank_id}.log', 'w') process = subprocess.Popen(cmd, stdout=log_file, stderr=log_file, env=env) processes.append(process) cmds.append(cmd) diff --git a/model_zoo/official/cv/mobilenetv2/src/utils.py b/model_zoo/official/cv/mobilenetv2/src/utils.py index ba4745a525..b97a48775d 100644 --- a/model_zoo/official/cv/mobilenetv2/src/utils.py +++ b/model_zoo/official/cv/mobilenetv2/src/utils.py @@ -76,14 +76,12 @@ def config_ckpoint(config, lr, step_size): if config.save_checkpoint: config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size, keep_checkpoint_max=config.keep_checkpoint_max) - ckpt_save_dir = config.save_checkpoint_path - if config.platform == "GPU": - if config.run_distribute: - ckpt_save_dir += "ckpt_" + str(get_rank()) + "/" - else: - ckpt_save_dir += "ckpt_" + "/" + rank = 0 + if config.run_distribute: + rank = get_rank() + ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(rank) + "/" ckpt_cb = ModelCheckpoint(prefix="mobilenetV2", directory=ckpt_save_dir, config=config_ck) cb += [ckpt_cb] return cb diff --git a/model_zoo/official/cv/mobilenetv2/train.py b/model_zoo/official/cv/mobilenetv2/train.py index 151ac441bf..8555294e53 100644 --- a/model_zoo/official/cv/mobilenetv2/train.py +++ b/model_zoo/official/cv/mobilenetv2/train.py @@ -60,8 +60,7 @@ if __name__ == '__main__': elif args_opt.train_method in ("train", "fine_tune"): if args_opt.platform == "CPU": raise ValueError("Currently, CPU only support \"incremental_learn\", not \"fine_tune\" or \"train\".") - dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, config=config) - step_size = dataset.get_dataset_size() + dataset, step_size = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, config=config) # Currently, only Ascend support switch precision. 
switch_precision(net, mstype.float16, config) @@ -108,9 +107,8 @@ if __name__ == '__main__': losses.append(network(feature, label).asnumpy()) epoch_mseconds = (time.time()-epoch_start) * 1000 per_step_mseconds = epoch_mseconds / step_size - print("\r epoch[{}], iter[{}] cost: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.3f}"\ - .format(epoch + 1, step_size, epoch_mseconds, per_step_mseconds, np.mean(np.array(losses))), \ - end="") + print("epoch[{}], iter[{}] cost: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.3f}"\ + .format(epoch + 1, step_size, epoch_mseconds, per_step_mseconds, np.mean(np.array(losses)))) if (epoch + 1) % config.save_checkpoint_epochs == 0: save_checkpoint(network, os.path.join(config.save_checkpoint_path, \ f"mobilenetv2_head_{epoch+1}.ckpt"))