From 9d06442c0d12a6a1d60c5f07a2f8e194a149bb0d Mon Sep 17 00:00:00 2001 From: Payne Date: Wed, 23 Sep 2020 19:40:08 +0800 Subject: [PATCH] mobilenetv2 debug and modify README.md --- model_zoo/official/cv/mobilenetv2/README.md | 67 ++++++++++--------- .../cv/mobilenetv2/scripts/run_eval.sh | 2 +- .../official/cv/mobilenetv2/src/dataset.py | 5 +- .../official/cv/mobilenetv2/src/utils.py | 3 +- model_zoo/official/cv/mobilenetv2/train.py | 2 +- 5 files changed, 39 insertions(+), 40 deletions(-) diff --git a/model_zoo/official/cv/mobilenetv2/README.md b/model_zoo/official/cv/mobilenetv2/README.md index 12f1d12d0e..ff0f950655 100644 --- a/model_zoo/official/cv/mobilenetv2/README.md +++ b/model_zoo/official/cv/mobilenetv2/README.md @@ -77,6 +77,7 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil │ ├──utils.py # utils to load ckpt_file for fine tune or incremental learn ├── train.py # training script ├── eval.py # evaluation script + ├── mindspore_hub_conf.py # mindspore hub interface ``` ## [Training process](#contents) @@ -85,45 +86,45 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil You can start training using python or shell scripts. 
The usage of shell scripts as follows: -- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [TRAIN_METHOD] [CKPT_PATH] -- GPU: sh run_trian.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [TRAIN_METHOD] [CKPT_PATH] -- CPU: sh run_trian.sh CPU [DATASET_PATH] [TRAIN_METHOD] [CKPT_PATH] +- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] +- GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] +- CPU: sh run_train.sh CPU [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] ### Launch ```shell # training example python: - Ascend: python train.py --platform Ascend --dataset_path [TRAIN_DATASET_PATH] --train_method train - GPU: python train.py --platform GPU --dataset_path [TRAIN_DATASET_PATH] --train_method train - CPU: python train.py --platform CPU --dataset_path [TRAIN_DATASET_PATH] --train_method train + Ascend: python train.py --platform Ascend --dataset_path [TRAIN_DATASET_PATH] + GPU: python train.py --platform GPU --dataset_path [TRAIN_DATASET_PATH] + CPU: python train.py --platform CPU --dataset_path [TRAIN_DATASET_PATH] shell: - Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json [TRAIN_DATASET_PATH] train - GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] train - CPU: sh run_train.sh CPU [TRAIN_DATASET_PATH] train + Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json [TRAIN_DATASET_PATH] + GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] + CPU: sh run_train.sh CPU [TRAIN_DATASET_PATH] -# fine tune example +# fine tune whole network example python: - Ascend: python train.py --platform Ascend --dataset_path [TRAIN_DATASET_PATH] --train_method fine_tune ./pretrain_checkpoint/mobilenetv2.ckpt - GPU: python train.py --platform GPU --dataset_path [TRAIN_DATASET_PATH] --train_method 
fine_tune ./pretrain_checkpoint/mobilenetv2.ckpt - CPU: python train.py --platform CPU --dataset_path [TRAIN_DATASET_PATH] --train_method fine_tune ./pretrain_checkpoint/mobilenetv2.ckpt + Ascend: python train.py --platform Ascend --dataset_path [TRAIN_DATASET_PATH] --pretrain_ckpt [CKPT_PATH] --freeze_layer none + GPU: python train.py --platform GPU --dataset_path [TRAIN_DATASET_PATH] --pretrain_ckpt [CKPT_PATH] --freeze_layer none + CPU: python train.py --platform CPU --dataset_path [TRAIN_DATASET_PATH] --pretrain_ckpt [CKPT_PATH] --freeze_layer none shell: - Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json [TRAIN_DATASET_PATH] fine_tune ./pretrain_checkpoint/mobilenetv2.ckpt - GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] fine_tune ./pretrain_checkpoint/mobilenetv2.ckpt - CPU: sh run_train.sh CPU [TRAIN_DATASET_PATH] fine_tune ./pretrain_checkpoint/mobilenetv2.ckpt + Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json [TRAIN_DATASET_PATH] [CKPT_PATH] none + GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] [CKPT_PATH] none + CPU: sh run_train.sh CPU [TRAIN_DATASET_PATH] [CKPT_PATH] none -# incremental learn example +# fine tune fully connected layers example python: - Ascend: python --platform Ascend train.py --dataset_path [TRAIN_DATASET_PATH] --train_method incremental_learn ./pretrain_checkpoint/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt - GPU: python --platform GPU train.py --dataset_path [TRAIN_DATASET_PATH] --train_method incremental_learn ./pretrain_checkpoint/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt - CPU: python --platform CPU train.py --dataset_path [TRAIN_DATASET_PATH] --train_method incremental_learn ./pretrain_checkpoint/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt + Ascend: python train.py --platform Ascend --dataset_path [TRAIN_DATASET_PATH] --pretrain_ckpt [CKPT_PATH] --freeze_layer backbone + GPU: python train.py --platform GPU --dataset_path 
[TRAIN_DATASET_PATH] --pretrain_ckpt [CKPT_PATH] --freeze_layer backbone + CPU: python train.py --platform CPU --dataset_path [TRAIN_DATASET_PATH] --pretrain_ckpt [CKPT_PATH] --freeze_layer backbone shell: - Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json [TRAIN_DATASET_PATH] incremental_learn ./pretrain_checkpoint/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt - GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] incremental_learn ./pretrain_checkpoint/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt - CPU: sh run_train.sh CPU [TRAIN_DATASET_PATH] incremental_learn ./pretrain_checkpoint/mobilenetv2_199.ckpt ./checkpoint/mobilenetv2_head_15.ckpt + Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json [TRAIN_DATASET_PATH] [CKPT_PATH] backbone + GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] [CKPT_PATH] backbone + CPU: sh run_train.sh CPU [TRAIN_DATASET_PATH] [CKPT_PATH] backbone ``` ### Result @@ -143,23 +144,23 @@ epoch time: 138331.250, per step time: 221.330, avg loss: 3.917 You can start training using python or shell scripts.If the train method is train or fine tune, should not input the `[CHECKPOINT_PATH]` The usage of shell scripts as follows: -- Ascend: sh run_eval.sh Ascend [DATASET_PATH] [CHECKPOINT_PATH] [HEAD_CKPT_PATH] -- GPU: sh run_eval.sh GPU [DATASET_PATH] [CHECKPOINT_PATH] [HEAD_CKPT_PATH] -- CPU: sh run_eval.sh CPU [DATASET_PATH] [BACKBONE_CKPT_PATH] [HEAD_CKPT_PATH] +- Ascend: sh run_eval.sh Ascend [DATASET_PATH] [CHECKPOINT_PATH] +- GPU: sh run_eval.sh GPU [DATASET_PATH] [CHECKPOINT_PATH] +- CPU: sh run_eval.sh CPU [DATASET_PATH] [CHECKPOINT_PATH] ### Launch ```shell # eval example python: - Ascend: python eval.py --platform Ascend --dataset_path [VAL_DATASET_PATH] --pretrain_ckpt ./pretrain_ckpt/mobilenetv2.ckpt --head_ckpt ./checkpoint/mobilenetv2_head_15.ckpt - GPU: python eval.py --platform GPU --dataset_path [VAL_DATASET_PATH] --pretrain_ckpt 
./pretrain_ckpt/mobilenetv2.ckpt --head_ckpt ./checkpoint/mobilenetv2_head_15.ckpt - CPU: python eval.py --platform CPU --dataset_path [VAL_DATASET_PATH] --pretrain_ckpt ./pretrain_ckpt/mobilenetv2.ckpt --head_ckpt ./checkpoint/mobilenetv2_head_15.ckpt + Ascend: python eval.py --platform Ascend --dataset_path [VAL_DATASET_PATH] --pretrain_ckpt ./ckpt_0/mobilenetv2_15.ckpt + GPU: python eval.py --platform GPU --dataset_path [VAL_DATASET_PATH] --pretrain_ckpt ./ckpt_0/mobilenetv2_15.ckpt + CPU: python eval.py --platform CPU --dataset_path [VAL_DATASET_PATH] --pretrain_ckpt ./ckpt_0/mobilenetv2_15.ckpt shell: - Ascend: sh run_eval.sh Ascend [VAL_DATASET_PATH] ./pretrain_ckpt/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt - GPU: sh run_eval.sh GPU [VAL_DATASET_PATH] ./pretrain_ckpt/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt - CPU: sh run_eval.sh CPU [VAL_DATASET_PATH] ./pretrain_ckpt/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt + Ascend: sh run_eval.sh Ascend [VAL_DATASET_PATH] ./checkpoint/mobilenetv2_head_15.ckpt + GPU: sh run_eval.sh GPU [VAL_DATASET_PATH] ./checkpoint/mobilenetv2_head_15.ckpt + CPU: sh run_eval.sh CPU [VAL_DATASET_PATH] ./checkpoint/mobilenetv2_head_15.ckpt ``` > checkpoint can be produced in training process. @@ -169,7 +170,7 @@ You can start training using python or shell scripts.If the train method is trai Inference result will be stored in the example path, you can find result like the followings in `eval.log`. 
```shell -result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625.ckpt +result: {'acc': 0.71976314102564111} ckpt=./ckpt_0/mobilenet-200_625.ckpt ``` # [Model description](#contents) diff --git a/model_zoo/official/cv/mobilenetv2/scripts/run_eval.sh b/model_zoo/official/cv/mobilenetv2/scripts/run_eval.sh index 5c891301d1..f4623b6871 100644 --- a/model_zoo/official/cv/mobilenetv2/scripts/run_eval.sh +++ b/model_zoo/official/cv/mobilenetv2/scripts/run_eval.sh @@ -97,7 +97,7 @@ run_cpu() } -if [ $# -gt 4 ] || [ $# -lt 3 ] +if [ $# -ne 3 ] then echo "Usage: Ascend: sh run_eval.sh [PLATFORM] [DATASET_PATH] [PRETRAIN_CKPT] diff --git a/model_zoo/official/cv/mobilenetv2/src/dataset.py b/model_zoo/official/cv/mobilenetv2/src/dataset.py index 51bcaed8c5..81e90710cd 100644 --- a/model_zoo/official/cv/mobilenetv2/src/dataset.py +++ b/model_zoo/official/cv/mobilenetv2/src/dataset.py @@ -99,13 +99,12 @@ def create_dataset(dataset_path, do_train, config, repeat_num=1): def extract_features(net, dataset_path, config): - features_folder = dataset_path + '_features' + features_folder = os.path.abspath(dataset_path) + '_features' if not os.path.exists(features_folder): os.makedirs(features_folder) dataset = create_dataset(dataset_path=dataset_path, do_train=False, - config=config, - repeat_num=1) + config=config) step_size = dataset.get_dataset_size() if step_size == 0: raise ValueError("The step_size of dataset is zero. 
Check if the images count of train dataset is more \ diff --git a/model_zoo/official/cv/mobilenetv2/src/utils.py b/model_zoo/official/cv/mobilenetv2/src/utils.py index 5fda873ca7..90e8db9d88 100644 --- a/model_zoo/official/cv/mobilenetv2/src/utils.py +++ b/model_zoo/official/cv/mobilenetv2/src/utils.py @@ -49,8 +49,7 @@ def context_device_init(config): if config.run_distribute: context.set_auto_parallel_context(device_num=config.rank_size, parallel_mode=ParallelMode.DATA_PARALLEL, - gradients_mean=True, - all_reduce_fusion_config=[140]) + gradients_mean=True) init() else: raise ValueError("Only support CPU, GPU and Ascend.") diff --git a/model_zoo/official/cv/mobilenetv2/train.py b/model_zoo/official/cv/mobilenetv2/train.py index 5d19349a74..02f5172416 100644 --- a/model_zoo/official/cv/mobilenetv2/train.py +++ b/model_zoo/official/cv/mobilenetv2/train.py @@ -117,7 +117,7 @@ if __name__ == '__main__': rank = get_rank() save_ckpt_path = os.path.join(config.save_checkpoint_path, 'ckpt_' + str(rank) + '/') if not os.path.isdir(save_ckpt_path): - os.mkdir(save_checkpoint) + os.mkdir(save_ckpt_path) for epoch in range(epoch_size): random.shuffle(idx_list)