From bed1859921a889b2002bc0c5620843bbd239802b Mon Sep 17 00:00:00 2001 From: zhaoting Date: Mon, 14 Dec 2020 19:49:06 +0800 Subject: [PATCH] add filter weight when fine-tune in mobilenetv2 --- model_zoo/official/cv/mobilenetv2/README.md | 52 ++++++------ .../official/cv/mobilenetv2/README_CN.md | 39 +++++---- .../cv/mobilenetv2/scripts/run_train.sh | 82 +++++++++++++++---- model_zoo/official/cv/mobilenetv2/src/args.py | 2 + .../official/cv/mobilenetv2/src/models.py | 12 +-- model_zoo/official/cv/mobilenetv2/train.py | 2 + 6 files changed, 116 insertions(+), 73 deletions(-) diff --git a/model_zoo/official/cv/mobilenetv2/README.md b/model_zoo/official/cv/mobilenetv2/README.md index 6809ef5dd3..1dd5a1d841 100644 --- a/model_zoo/official/cv/mobilenetv2/README.md +++ b/model_zoo/official/cv/mobilenetv2/README.md @@ -4,17 +4,17 @@ - [Model Architecture](#model-architecture) - [Dataset](#dataset) - [Features](#features) - - [Mixed Precision](#mixed-precision(ascend)) + - [Mixed Precision](#mixed-precision(ascend)) - [Environment Requirements](#environment-requirements) - [Script Description](#script-description) - - [Script and Sample Code](#script-and-sample-code) - - [Training Process](#training-process) - - [Evaluation Process](#eval-process) - - [Export MindIR](#export-mindir) + - [Script and Sample Code](#script-and-sample-code) + - [Training Process](#training-process) + - [Evaluation Process](#eval-process) + - [Export MindIR](#export-mindir) - [Model Description](#model-description) - - [Performance](#performance) - - [Training Performance](#training-performance) - - [Evaluation Performance](#evaluation-performance) + - [Performance](#performance) + - [Training Performance](#training-performance) + - [Evaluation Performance](#evaluation-performance) - [Description of Random Situation](#description-of-random-situation) - [ModelZoo Homepage](#modelzoo-homepage) @@ -35,10 +35,10 @@ The overall network architecture of MobileNetV2 is show below: Dataset used: 
[imagenet](http://www.image-net.org/) - Dataset size: ~125G, 1.2W colorful images in 1000 classes - - Train: 120G, 1.2W images - - Test: 5G, 50000 images + - Train: 120G, 1.2W images + - Test: 5G, 50000 images - Data format: RGB images. - - Note: Data will be processed in src/dataset.py + - Note: Data will be processed in src/dataset.py # [Features](#contents) @@ -50,12 +50,12 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil # [Environment Requirements](#contents) - Hardware(Ascend/GPU/CPU) - - Prepare hardware environment with Ascend, GPU or CPU processor. If you want to try Ascend, please send the [application form](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/file/other/Ascend%20Model%20Zoo%E4%BD%93%E9%AA%8C%E8%B5%84%E6%BA%90%E7%94%B3%E8%AF%B7%E8%A1%A8.docx) to ascend@huawei.com. Once approved, you can get the resources. + - Prepare hardware environment with Ascend, GPU or CPU processor. If you want to try Ascend, please send the [application form](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/file/other/Ascend%20Model%20Zoo%E4%BD%93%E9%AA%8C%E8%B5%84%E6%BA%90%E7%94%B3%E8%AF%B7%E8%A1%A8.docx) to ascend@huawei.com. Once approved, you can get the resources. - Framework - - [MindSpore](https://www.mindspore.cn/install/en) + - [MindSpore](https://www.mindspore.cn/install/en) - For more information, please check the resources below: - - [MindSpore Tutorials](https://www.mindspore.cn/tutorial/training/en/master/index.html) - - [MindSpore Python API](https://www.mindspore.cn/doc/api_python/en/master/index.html) + - [MindSpore Tutorials](https://www.mindspore.cn/tutorial/training/en/master/index.html) + - [MindSpore Python API](https://www.mindspore.cn/doc/api_python/en/master/index.html) # [Script description](#contents) @@ -87,9 +87,11 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil You can start training using python or shell scripts. 
The usage of shell scripts as follows: -- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] -- GPU: sh run_trian.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] -- CPU: sh run_trian.sh CPU [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] +- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] [FILTER_HEAD] +- GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] [FILTER_HEAD] +- CPU: sh run_train.sh CPU [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] [FILTER_HEAD] + +`CKPT_PATH`, `FREEZE_LAYER` and `FILTER_HEAD` are optional. When `CKPT_PATH` is set, `FREEZE_LAYER` must also be set. `FREEZE_LAYER` should be one of ["none", "backbone"]; if `FREEZE_LAYER`="backbone", the parameters in the backbone are frozen during training and the parameters in the head are not loaded from the checkpoint. If `FILTER_HEAD`=True, the parameters in the head are not loaded from the checkpoint. > RANK_TABLE_FILE is HCCL configuration file when running on Ascend. > The common restrictions on using the distributed service are as follows. For details, see the HCCL documentation. @@ -113,14 +115,14 @@ You can start training using python or shell scripts. 
The usage of shell scripts # fine tune whole network example python: - Ascend: python train.py --platform Ascend --dataset_path [TRAIN_DATASET_PATH] --pretrain_ckpt [CKPT_PATH] --freeze_layer none - GPU: python train.py --platform GPU --dataset_path [TRAIN_DATASET_PATH] --pretrain_ckpt [CKPT_PATH] --freeze_layer none - CPU: python train.py --platform CPU --dataset_path [TRAIN_DATASET_PATH] --pretrain_ckpt [CKPT_PATH] --freeze_layer none + Ascend: python train.py --platform Ascend --dataset_path [TRAIN_DATASET_PATH] --pretrain_ckpt [CKPT_PATH] --freeze_layer none --filter_head True + GPU: python train.py --platform GPU --dataset_path [TRAIN_DATASET_PATH] --pretrain_ckpt [CKPT_PATH] --freeze_layer none --filter_head True + CPU: python train.py --platform CPU --dataset_path [TRAIN_DATASET_PATH] --pretrain_ckpt [CKPT_PATH] --freeze_layer none --filter_head True shell: - Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json [TRAIN_DATASET_PATH] [CKPT_PATH] none - GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] [CKPT_PATH] none - CPU: sh run_train.sh CPU [TRAIN_DATASET_PATH] [CKPT_PATH] none + Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json [TRAIN_DATASET_PATH] [CKPT_PATH] none True + GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] [CKPT_PATH] none True + CPU: sh run_train.sh CPU [TRAIN_DATASET_PATH] [CKPT_PATH] none True # fine tune full connected layers example python: @@ -184,7 +186,7 @@ result: {'acc': 0.71976314102564111} ckpt=./ckpt_0/mobilenet-200_625.ckpt Change the export mode and export file in `src/config.py`, and run `export.py`. 
-``` +```shell python export.py --platform [PLATFORM] --pretrain_ckpt [CKPT_PATH] ``` diff --git a/model_zoo/official/cv/mobilenetv2/README_CN.md b/model_zoo/official/cv/mobilenetv2/README_CN.md index 122336f8dd..ebf0925daa 100644 --- a/model_zoo/official/cv/mobilenetv2/README_CN.md +++ b/model_zoo/official/cv/mobilenetv2/README_CN.md @@ -1,5 +1,4 @@ # 目录 - - [目录](#目录) - [MobileNetV2描述](#mobilenetv2描述) @@ -25,8 +24,6 @@ - [随机情况说明](#随机情况说明) - [ModelZoo主页](#modelzoo主页) - - # MobileNetV2描述 MobileNetV2结合硬件感知神经网络架构搜索(NAS)和NetAdapt算法,已经可以移植到手机CPU上运行,后续随新架构进一步优化改进。(2019年11月20日) @@ -44,10 +41,10 @@ MobileNetV2总体网络架构如下: 使用的数据集:[imagenet](http://www.image-net.org/) - 数据集大小:125G,共1000个类、1.2万张彩色图像 - - 训练集:120G,共1.2万张图像 - - 测试集:5G,共5万张图像 + - 训练集:120G,共1.2万张图像 + - 测试集:5G,共5万张图像 - 数据格式:RGB - - 注:数据在src/dataset.py中处理。 + - 注:数据在src/dataset.py中处理。 # 特性 @@ -59,12 +56,12 @@ MobileNetV2总体网络架构如下: # 环境要求 - 硬件(Ascend/GPU/CPU) - - 使用Ascend、GPU或CPU处理器来搭建硬件环境。如需试用Ascend处理器,请发送[申请表](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/file/other/Ascend%20Model%20Zoo%E4%BD%93%E9%AA%8C%E8%B5%84%E6%BA%90%E7%94%B3%E8%AF%B7%E8%A1%A8.docx)至ascend@huawei.com,审核通过即可获得资源。 + - 使用Ascend、GPU或CPU处理器来搭建硬件环境。如需试用Ascend处理器,请发送[申请表](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/file/other/Ascend%20Model%20Zoo%E4%BD%93%E9%AA%8C%E8%B5%84%E6%BA%90%E7%94%B3%E8%AF%B7%E8%A1%A8.docx)至ascend@huawei.com,审核通过即可获得资源。 - 框架 - - [MindSpore](https://www.mindspore.cn/install/en) + - [MindSpore](https://www.mindspore.cn/install/en) - 如需查看详情,请参见如下资源: - - [MindSpore教程](https://www.mindspore.cn/tutorial/training/zh-CN/master/index.html) - - [MindSpore Python API](https://www.mindspore.cn/doc/api_python/zh-CN/master/index.html) + - [MindSpore教程](https://www.mindspore.cn/tutorial/training/zh-CN/master/index.html) + - [MindSpore Python API](https://www.mindspore.cn/doc/api_python/zh-CN/master/index.html) # 脚本说明 @@ -96,9 +93,11 @@ MobileNetV2总体网络架构如下: 使用python或shell脚本开始训练。shell脚本的使用方法如下: -- Ascend: sh run_train.sh Ascend [DEVICE_NUM] 
[VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] -- GPU: sh run_trian.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] -- CPU: sh run_trian.sh CPU [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] +- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] [FILTER_HEAD] +- GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] [FILTER_HEAD] +- CPU: sh run_train.sh CPU [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] [FILTER_HEAD] + +`CKPT_PATH`、`FREEZE_LAYER` 和 `FILTER_HEAD` 是可选参数。如果设置 `CKPT_PATH`,则必须同时设置 `FREEZE_LAYER`。`FREEZE_LAYER` 可以是 ["none", "backbone"]。如果设置 `FREEZE_LAYER`="backbone",训练过程中backbone中的参数会被冻结,同时不会从checkpoint中加载head部分的参数。如果 `FILTER_HEAD`=True,不会从checkpoint中加载head部分的参数。 > RANK_TABLE_FILE 是在Ascned上运行分布式任务时HCCL的配置文件 > 我们列出使用分布式服务常见的使用限制,详细的可以查看HCCL对应的使用文档。 @@ -122,14 +121,14 @@ MobileNetV2总体网络架构如下: # 全网微调示例 python: - Ascend: python train.py --platform Ascend --dataset_path [TRAIN_DATASET_PATH] --pretrain_ckpt [CKPT_PATH] --freeze_layer none - GPU: python train.py --platform GPU --dataset_path [TRAIN_DATASET_PATH] --pretrain_ckpt [CKPT_PATH] --freeze_layer none - CPU: python train.py --platform CPU --dataset_path [TRAIN_DATASET_PATH] --pretrain_ckpt [CKPT_PATH] --freeze_layer none + Ascend: python train.py --platform Ascend --dataset_path [TRAIN_DATASET_PATH] --pretrain_ckpt [CKPT_PATH] --freeze_layer none --filter_head True + GPU: python train.py --platform GPU --dataset_path [TRAIN_DATASET_PATH] --pretrain_ckpt [CKPT_PATH] --freeze_layer none --filter_head True + CPU: python train.py --platform CPU --dataset_path [TRAIN_DATASET_PATH] --pretrain_ckpt [CKPT_PATH] --freeze_layer none --filter_head True shell: - Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json [TRAIN_DATASET_PATH] [CKPT_PATH] none - GPU: sh 
run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] [CKPT_PATH] none - CPU: sh run_train.sh CPU [TRAIN_DATASET_PATH] [CKPT_PATH] none + Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json [TRAIN_DATASET_PATH] [CKPT_PATH] none True + GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] [CKPT_PATH] none True + CPU: sh run_train.sh CPU [TRAIN_DATASET_PATH] [CKPT_PATH] none True # 全连接层微调示例 python: @@ -193,7 +192,7 @@ result:{'acc':0.71976314102564111} ckpt=./ckpt_0/mobilenet-200_625.ckpt 修改`src/config.py`文件中的`export_mode`和`export_file`, 运行`export.py`。 -``` +```shell python export.py --platform [PLATFORM] --pretrain_ckpt [CKPT_PATH] ``` diff --git a/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh b/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh index 2c812c5706..1e2dc2acf0 100644 --- a/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh +++ b/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh @@ -16,6 +16,25 @@ run_ascend() { + if [ $# = 5 ] ; then + PRETRAINED_CKPT="" + FREEZE_LAYER="none" + FILTER_HEAD="False" + elif [ $# = 7 ] ; then + PRETRAINED_CKPT=$6 + FREEZE_LAYER=$7 + FILTER_HEAD="False" + elif [ $# = 8 ] ; then + PRETRAINED_CKPT=$6 + FREEZE_LAYER=$7 + FILTER_HEAD=$8 + else + echo "Usage: + Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH](optional) [FREEZE_LAYER](optional) [FILTER_HEAD](optional) + Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH]" + exit 1 + fi; + if [ $2 -lt 1 ] && [ $2 -gt 8 ] then echo "error: DEVICE_NUM=$2 is not in (1-8)" @@ -59,8 +78,9 @@ run_ascend() python train.py \ --platform=$1 \ --dataset_path=$5 \ - --pretrain_ckpt=$6 \ - --freeze_layer=$7 \ + --pretrain_ckpt=$PRETRAINED_CKPT \ + --freeze_layer=$FREEZE_LAYER \ + --filter_head=$FILTER_HEAD \ &> log$i.log & cd .. 
done @@ -68,6 +88,24 @@ run_ascend() run_gpu() { + if [ $# = 4 ] ; then + PRETRAINED_CKPT="" + FREEZE_LAYER="none" + FILTER_HEAD="False" + elif [ $# = 6 ] ; then + PRETRAINED_CKPT=$5 + FREEZE_LAYER=$6 + FILTER_HEAD="False" + elif [ $# = 7 ] ; then + PRETRAINED_CKPT=$5 + FREEZE_LAYER=$6 + FILTER_HEAD=$7 + else + echo "Usage: + GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH](optional) [FREEZE_LAYER](optional) [FILTER_HEAD](optional) + GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]" + exit 1 + fi; if [ $2 -lt 1 ] && [ $2 -gt 8 ] then echo "error: DEVICE_NUM=$2 is not in (1-8)" @@ -94,14 +132,32 @@ run_gpu() python ${BASEPATH}/../train.py \ --platform=$1 \ --dataset_path=$4 \ - --pretrain_ckpt=$5 \ - --freeze_layer=$6 \ + --pretrain_ckpt=$PRETRAINED_CKPT \ + --freeze_layer=$FREEZE_LAYER \ + --filter_head=$FILTER_HEAD \ &> ../train.log & # dataset train folder } run_cpu() { - + if [ $# = 2 ] ; then + PRETRAINED_CKPT="" + FREEZE_LAYER="none" + FILTER_HEAD="False" + elif [ $# = 4 ] ; then + PRETRAINED_CKPT=$3 + FREEZE_LAYER=$4 + FILTER_HEAD="False" + elif [ $# = 5 ] ; then + PRETRAINED_CKPT=$3 + FREEZE_LAYER=$4 + FILTER_HEAD=$5 + else + echo "Usage: + CPU: sh run_train.sh CPU [DATASET_PATH] + CPU: sh run_train.sh CPU [DATASET_PATH] [CKPT_PATH](optional) [FREEZE_LAYER](optional) [FILTER_HEAD](optional)" + exit 1 + fi; if [ ! 
-d $2 ] then echo "error: DATASET_PATH=$2 is not a directory" @@ -120,22 +176,12 @@ run_cpu() python ${BASEPATH}/../train.py \ --platform=$1 \ --dataset_path=$2 \ - --pretrain_ckpt=$3 \ - --freeze_layer=$4 \ + --pretrain_ckpt=$PRETRAINED_CKPT \ + --freeze_layer=$FREEZE_LAYER \ + --filter_head=$FILTER_HEAD \ &> ../train.log & # dataset train folder } -if [ $# -gt 7 ] || [ $# -lt 4 ] -then - echo "Usage: - Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] - Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] - GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] - GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] - CPU: sh run_train.sh CPU [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER]" -exit 1 -fi - if [ $1 = "Ascend" ] ; then run_ascend "$@" elif [ $1 = "GPU" ] ; then diff --git a/model_zoo/official/cv/mobilenetv2/src/args.py b/model_zoo/official/cv/mobilenetv2/src/args.py index 39a721b8dc..42df7a7485 100644 --- a/model_zoo/official/cv/mobilenetv2/src/args.py +++ b/model_zoo/official/cv/mobilenetv2/src/args.py @@ -26,6 +26,8 @@ def train_parse_args(): train_parser.add_argument('--freeze_layer', type=str, default="", choices=["", "none", "backbone"], \ help="freeze the weights of network from start to which layers") train_parser.add_argument('--run_distribute', type=ast.literal_eval, default=True, help='Run distribute') + train_parser.add_argument('--filter_head', type=ast.literal_eval, default=False,\ + help='Filter head weight parameters when load checkpoint, default is False.') train_args = train_parser.parse_args() train_args.is_training = True if train_args.platform == "CPU": diff --git a/model_zoo/official/cv/mobilenetv2/src/models.py b/model_zoo/official/cv/mobilenetv2/src/models.py index 5917ba914c..82067ee0e0 
100644 --- a/model_zoo/official/cv/mobilenetv2/src/models.py +++ b/model_zoo/official/cv/mobilenetv2/src/models.py @@ -109,18 +109,10 @@ class Monitor(Callback): 1, cb_params.epoch_num, cur_step_in_epoch, cb_params.batch_num, step_loss, np.mean(self.losses), step_mseconds, self.lr_init[cb_params.cur_step_num - 1])) + def load_ckpt(network, pretrain_ckpt_path, trainable=True): - """ - incremental_learning or not - """ + """load checkpoint into network.""" param_dict = load_checkpoint(pretrain_ckpt_path) - if hasattr(network, "head"): - head_param = network.head.parameters_dict() - for k, v in head_param.items(): - if param_dict[k].shape != v.shape: - param_dict.pop(k) - param_dict.pop(f"moments.{k}") - print(f"Filter {k} don't load weights from checkpoint.") load_param_into_net(network, param_dict) if not trainable: for param in network.get_parameters(): diff --git a/model_zoo/official/cv/mobilenetv2/train.py b/model_zoo/official/cv/mobilenetv2/train.py index fe244090a7..64b7438a59 100644 --- a/model_zoo/official/cv/mobilenetv2/train.py +++ b/model_zoo/official/cv/mobilenetv2/train.py @@ -59,6 +59,8 @@ if __name__ == '__main__': if args_opt.freeze_layer == "backbone": load_ckpt(backbone_net, args_opt.pretrain_ckpt, trainable=False) step_size = extract_features(backbone_net, args_opt.dataset_path, config) + elif args_opt.filter_head: + load_ckpt(backbone_net, args_opt.pretrain_ckpt) else: load_ckpt(net, args_opt.pretrain_ckpt) if step_size == 0: