From 253fd92fe067ee069149650e3636f8976e4c0836 Mon Sep 17 00:00:00 2001
From: zhouyaqiang
Date: Wed, 19 Aug 2020 15:07:02 +0800
Subject: [PATCH] add network inceptionv3

---
 .../scripts/run_distribute_train.sh             |  50 +++++++
 .../cv/inceptionv3/scripts/run_eval.sh          |  24 ++++
 .../scripts/run_standalone_train.sh             |  22 ++++
 model_zoo/official/cv/inceptionv3/src/config.py |  32 ++++-
 .../cv/inceptionv3/src/inception_v3.py          | 122 +++++++++---------
 model_zoo/official/cv/inceptionv3/train.py      |  26 +++-
 6 files changed, 207 insertions(+), 69 deletions(-)
 create mode 100644 model_zoo/official/cv/inceptionv3/scripts/run_distribute_train.sh
 create mode 100644 model_zoo/official/cv/inceptionv3/scripts/run_eval.sh
 create mode 100644 model_zoo/official/cv/inceptionv3/scripts/run_standalone_train.sh

diff --git a/model_zoo/official/cv/inceptionv3/scripts/run_distribute_train.sh b/model_zoo/official/cv/inceptionv3/scripts/run_distribute_train.sh
new file mode 100644
index 0000000000..e56212b116
--- /dev/null
+++ b/model_zoo/official/cv/inceptionv3/scripts/run_distribute_train.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+DATA_DIR=$2
+export RANK_TABLE_FILE=$1
+export RANK_SIZE=8
+
+
+cores=`cat /proc/cpuinfo|grep "processor" |wc -l`
+echo "the number of logical core" $cores
+avg_core_per_rank=`expr $cores \/ $RANK_SIZE`
+core_gap=`expr $avg_core_per_rank \- 1`
+echo "avg_core_per_rank" $avg_core_per_rank
+echo "core_gap" $core_gap
+for((i=0;i<RANK_SIZE;i++))
+do
+    start=`expr $i \* $avg_core_per_rank`
+    export DEVICE_ID=$i
+    export RANK_ID=$i
+    export DEPLOY_MODE=0
+    export GE_USE_STATIC_MEMORY=1
+    end=`expr $start \+ $core_gap`
+    cmdopt=$start"-"$end
+
+    rm -rf LOG$i
+    mkdir ./LOG$i
+    cp *.py ./LOG$i
+    cd ./LOG$i || exit
+    echo "start training for rank $i, device $DEVICE_ID"
+    env > env.log
+    taskset -c $cmdopt python ../train.py \
+    --is_distributed \
+    --platform=Ascend \
+    --dataset_path=$DATA_DIR > log.txt 2>&1 &
+    cd ../
+done
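
Usage sketch (not part of the patch; paths are placeholders): the launcher takes the HCCL rank table file and the dataset directory as positional arguments, then starts one train.py process per rank, pinning each process to its own slice of CPU cores with taskset; rank i runs inside ./LOG$i and writes its output to log.txt there. A typical invocation:

    bash run_distribute_train.sh /path/to/rank_table_8p.json /path/to/dataset
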
diff --git a/model_zoo/official/cv/inceptionv3/scripts/run_eval.sh b/model_zoo/official/cv/inceptionv3/scripts/run_eval.sh
new file mode 100644
index 0000000000..1760fbb9df
--- /dev/null
+++ b/model_zoo/official/cv/inceptionv3/scripts/run_eval.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+export DEVICE_ID=$1
+DATA_DIR=$2
+PATH_CHECKPOINT=$3
+
+python eval.py \
+    --platform=Ascend \
+    --checkpoint=$PATH_CHECKPOINT \
+    --dataset_path=$DATA_DIR > log.txt 2>&1 &
diff --git a/model_zoo/official/cv/inceptionv3/scripts/run_standalone_train.sh b/model_zoo/official/cv/inceptionv3/scripts/run_standalone_train.sh
new file mode 100644
index 0000000000..2062b21370
--- /dev/null
+++ b/model_zoo/official/cv/inceptionv3/scripts/run_standalone_train.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+export DEVICE_ID=$1
+DATA_DIR=$2
+python train.py \
+    --platform=Ascend \
+    --dataset_path=$DATA_DIR > log.txt 2>&1 &
+
diff --git a/model_zoo/official/cv/inceptionv3/src/config.py b/model_zoo/official/cv/inceptionv3/src/config.py
index b465a7543a..262aa20b7f 100644
--- a/model_zoo/official/cv/inceptionv3/src/config.py
+++ b/model_zoo/official/cv/inceptionv3/src/config.py
@@ -39,5 +39,35 @@ config_gpu = edict({
     'opt_eps': 1.0,
     'keep_checkpoint_max': 100,
     'ckpt_path': './checkpoint/',
-    'is_save_on_master': 0
+    'is_save_on_master': 0,
+    'dropout_keep_prob': 0.5,
+    'has_bias': True,
+    'amp_level': 'O0'
+})
+
+config_ascend = edict({
+    'random_seed': 1,
+    'rank': 0,
+    'group_size': 1,
+    'work_nums': 8,
+    'decay_method': 'cosine',
+    "loss_scale": 1024,
+    'batch_size': 128,
+    'epoch_size': 250,
+    'num_classes': 1000,
+    'smooth_factor': 0.1,
+    'aux_factor': 0.2,
+    'lr_init': 0.00004,
+    'lr_max': 0.4,
+    'lr_end': 0.000004,
+    'warmup_epochs': 1,
+    'weight_decay': 0.00004,
+    'momentum': 0.9,
+    'opt_eps': 1.0,
+    'keep_checkpoint_max': 100,
+    'ckpt_path': './checkpoint/',
+    'is_save_on_master': 0,
+    'dropout_keep_prob': 0.8,
+    'has_bias': False,
+    'amp_level': 'O3'
 })
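
Relative to config_gpu, the new config_ascend dict turns conv biases off (has_bias False, since BatchNorm follows every conv), raises the dropout keep-probability to 0.8 instead of 0.5, and trains in float16 ('O3') behind a fixed loss scale of 1024. A minimal sketch (not part of the patch) of how train.py consumes these dicts; attribute access works because easydict exposes keys as attributes:

    from src.config import config_gpu, config_ascend

    cfg = config_ascend  # train.py picks this when --platform=Ascend
    assert cfg.has_bias is False   # conv bias off: BatchNorm follows every conv
    assert cfg.amp_level == 'O3'   # float16 training on Ascend; GPU stays on 'O0'
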
diff --git a/model_zoo/official/cv/inceptionv3/src/inception_v3.py b/model_zoo/official/cv/inceptionv3/src/inception_v3.py
index f1339b1c88..8d2faf7a91 100644
--- a/model_zoo/official/cv/inceptionv3/src/inception_v3.py
+++ b/model_zoo/official/cv/inceptionv3/src/inception_v3.py
@@ -19,10 +19,10 @@ from mindspore.common.initializer import XavierUniform
 
 
 class BasicConv2d(nn.Cell):
-    def __init__(self, in_channel, out_channel, kernel_size, stride=1, pad_mode='same', padding=0):
+    def __init__(self, in_channel, out_channel, kernel_size, stride=1, pad_mode='same', padding=0, has_bias=False):
         super(BasicConv2d, self).__init__()
         self.conv = nn.Conv2d(in_channel, out_channel, kernel_size=kernel_size, stride=stride,
-                              pad_mode=pad_mode, padding=padding, weight_init=XavierUniform(), has_bias=True)
+                              pad_mode=pad_mode, padding=padding, weight_init=XavierUniform(), has_bias=has_bias)
         self.bn = nn.BatchNorm2d(out_channel, eps=0.001, momentum=0.9997)
         self.relu = nn.ReLU()
 
@@ -34,23 +34,23 @@
 class Inception_A(nn.Cell):
-    def __init__(self, in_channels, pool_features):
+    def __init__(self, in_channels, pool_features, has_bias=False):
         super(Inception_A, self).__init__()
         self.concat = P.Concat(axis=1)
-        self.branch0 = BasicConv2d(in_channels, 64, kernel_size=1)
+        self.branch0 = BasicConv2d(in_channels, 64, kernel_size=1, has_bias=has_bias)
         self.branch1 = nn.SequentialCell([
-            BasicConv2d(in_channels, 48, kernel_size=1),
-            BasicConv2d(48, 64, kernel_size=5)
+            BasicConv2d(in_channels, 48, kernel_size=1, has_bias=has_bias),
+            BasicConv2d(48, 64, kernel_size=5, has_bias=has_bias)
         ])
         self.branch2 = nn.SequentialCell([
-            BasicConv2d(in_channels, 64, kernel_size=1),
-            BasicConv2d(64, 96, kernel_size=3),
-            BasicConv2d(96, 96, kernel_size=3)
+            BasicConv2d(in_channels, 64, kernel_size=1, has_bias=has_bias),
+            BasicConv2d(64, 96, kernel_size=3, has_bias=has_bias),
+            BasicConv2d(96, 96, kernel_size=3, has_bias=has_bias)
         ])
         self.branch_pool = nn.SequentialCell([
             nn.AvgPool2d(kernel_size=3, pad_mode='same'),
-            BasicConv2d(in_channels, pool_features, kernel_size=1)
+            BasicConv2d(in_channels, pool_features, kernel_size=1, has_bias=has_bias)
         ])
 
     def construct(self, x):
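
Channel check for Inception_A: the four branches are concatenated along axis 1, so the block emits 64 + 64 + 96 + pool_features channels. Mixed_5b below (pool_features=32) therefore turns its 192-channel input into 256 channels, which is exactly the in_channels that Mixed_5c declares.
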
@@ -63,14 +63,14 @@
 class Inception_B(nn.Cell):
-    def __init__(self, in_channels):
+    def __init__(self, in_channels, has_bias=False):
         super(Inception_B, self).__init__()
         self.concat = P.Concat(axis=1)
-        self.branch0 = BasicConv2d(in_channels, 384, kernel_size=3, stride=2, pad_mode='valid')
+        self.branch0 = BasicConv2d(in_channels, 384, kernel_size=3, stride=2, pad_mode='valid', has_bias=has_bias)
         self.branch1 = nn.SequentialCell([
-            BasicConv2d(in_channels, 64, kernel_size=1),
-            BasicConv2d(64, 96, kernel_size=3),
-            BasicConv2d(96, 96, kernel_size=3, stride=2, pad_mode='valid')
+            BasicConv2d(in_channels, 64, kernel_size=1, has_bias=has_bias),
+            BasicConv2d(64, 96, kernel_size=3, has_bias=has_bias),
+            BasicConv2d(96, 96, kernel_size=3, stride=2, pad_mode='valid', has_bias=has_bias)
         ])
         self.branch_pool = nn.MaxPool2d(kernel_size=3, stride=2)
 
@@ -84,25 +84,25 @@
 class Inception_C(nn.Cell):
-    def __init__(self, in_channels, channels_7x7):
+    def __init__(self, in_channels, channels_7x7, has_bias=False):
         super(Inception_C, self).__init__()
         self.concat = P.Concat(axis=1)
-        self.branch0 = BasicConv2d(in_channels, 192, kernel_size=1)
+        self.branch0 = BasicConv2d(in_channels, 192, kernel_size=1, has_bias=has_bias)
         self.branch1 = nn.SequentialCell([
-            BasicConv2d(in_channels, channels_7x7, kernel_size=1),
-            BasicConv2d(channels_7x7, channels_7x7, kernel_size=(1, 7)),
-            BasicConv2d(channels_7x7, 192, kernel_size=(7, 1))
+            BasicConv2d(in_channels, channels_7x7, kernel_size=1, has_bias=has_bias),
+            BasicConv2d(channels_7x7, channels_7x7, kernel_size=(1, 7), has_bias=has_bias),
+            BasicConv2d(channels_7x7, 192, kernel_size=(7, 1), has_bias=has_bias)
         ])
         self.branch2 = nn.SequentialCell([
-            BasicConv2d(in_channels, channels_7x7, kernel_size=1),
-            BasicConv2d(channels_7x7, channels_7x7, kernel_size=(7, 1)),
-            BasicConv2d(channels_7x7, channels_7x7, kernel_size=(1, 7)),
-            BasicConv2d(channels_7x7, channels_7x7, kernel_size=(7, 1)),
-            BasicConv2d(channels_7x7, 192, kernel_size=(1, 7))
+            BasicConv2d(in_channels, channels_7x7, kernel_size=1, has_bias=has_bias),
+            BasicConv2d(channels_7x7, channels_7x7, kernel_size=(7, 1), has_bias=has_bias),
+            BasicConv2d(channels_7x7, channels_7x7, kernel_size=(1, 7), has_bias=has_bias),
+            BasicConv2d(channels_7x7, channels_7x7, kernel_size=(7, 1), has_bias=has_bias),
+            BasicConv2d(channels_7x7, 192, kernel_size=(1, 7), has_bias=has_bias)
         ])
         self.branch_pool = nn.SequentialCell([
             nn.AvgPool2d(kernel_size=3, pad_mode='same'),
-            BasicConv2d(in_channels, 192, kernel_size=1)
+            BasicConv2d(in_channels, 192, kernel_size=1, has_bias=has_bias)
         ])
 
     def construct(self, x):
@@ -115,18 +115,18 @@
 class Inception_D(nn.Cell):
-    def __init__(self, in_channels):
+    def __init__(self, in_channels, has_bias=False):
         super(Inception_D, self).__init__()
         self.concat = P.Concat(axis=1)
         self.branch0 = nn.SequentialCell([
-            BasicConv2d(in_channels, 192, kernel_size=1),
-            BasicConv2d(192, 320, kernel_size=3, stride=2, pad_mode='valid')
+            BasicConv2d(in_channels, 192, kernel_size=1, has_bias=has_bias),
+            BasicConv2d(192, 320, kernel_size=3, stride=2, pad_mode='valid', has_bias=has_bias)
         ])
         self.branch1 = nn.SequentialCell([
-            BasicConv2d(in_channels, 192, kernel_size=1),
-            BasicConv2d(192, 192, kernel_size=(1, 7)),  # check
-            BasicConv2d(192, 192, kernel_size=(7, 1)),
-            BasicConv2d(192, 192, kernel_size=3, stride=2, pad_mode='valid')
+            BasicConv2d(in_channels, 192, kernel_size=1, has_bias=has_bias),
+            BasicConv2d(192, 192, kernel_size=(1, 7), has_bias=has_bias),  # check
+            BasicConv2d(192, 192, kernel_size=(7, 1), has_bias=has_bias),
+            BasicConv2d(192, 192, kernel_size=3, stride=2, pad_mode='valid', has_bias=has_bias)
         ])
         self.branch_pool = nn.MaxPool2d(kernel_size=3, stride=2)
 
@@ -139,22 +139,22 @@
 class Inception_E(nn.Cell):
-    def __init__(self, in_channels):
+    def __init__(self, in_channels, has_bias=False):
         super(Inception_E, self).__init__()
         self.concat = P.Concat(axis=1)
-        self.branch0 = BasicConv2d(in_channels, 320, kernel_size=1)
-        self.branch1 = BasicConv2d(in_channels, 384, kernel_size=1)
-        self.branch1_a = BasicConv2d(384, 384, kernel_size=(1, 3))
-        self.branch1_b = BasicConv2d(384, 384, kernel_size=(3, 1))
+        self.branch0 = BasicConv2d(in_channels, 320, kernel_size=1, has_bias=has_bias)
+        self.branch1 = BasicConv2d(in_channels, 384, kernel_size=1, has_bias=has_bias)
+        self.branch1_a = BasicConv2d(384, 384, kernel_size=(1, 3), has_bias=has_bias)
+        self.branch1_b = BasicConv2d(384, 384, kernel_size=(3, 1), has_bias=has_bias)
         self.branch2 = nn.SequentialCell([
-            BasicConv2d(in_channels, 448, kernel_size=1),
-            BasicConv2d(448, 384, kernel_size=3)
+            BasicConv2d(in_channels, 448, kernel_size=1, has_bias=has_bias),
+            BasicConv2d(448, 384, kernel_size=3, has_bias=has_bias)
         ])
-        self.branch2_a = BasicConv2d(384, 384, kernel_size=(1, 3))
-        self.branch2_b = BasicConv2d(384, 384, kernel_size=(3, 1))
+        self.branch2_a = BasicConv2d(384, 384, kernel_size=(1, 3), has_bias=has_bias)
+        self.branch2_b = BasicConv2d(384, 384, kernel_size=(3, 1), has_bias=has_bias)
         self.branch_pool = nn.SequentialCell([
             nn.AvgPool2d(kernel_size=3, pad_mode='same'),
-            BasicConv2d(in_channels, 192, kernel_size=1)
+            BasicConv2d(in_channels, 192, kernel_size=1, has_bias=has_bias)
         ])
 
     def construct(self, x):
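
Channel check for Inception_E (assuming the usual InceptionV3 wiring, where branch1_a/branch1_b and branch2_a/branch2_b are concatenated pairwise in construct): the block emits 320 + (384 + 384) + (384 + 384) + 192 = 2048 channels regardless of its input width, which is why Mixed_7b is built on 1280 channels and Mixed_7c on 2048 in the InceptionV3 wiring below.
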
@@ -203,30 +203,30 @@ class AuxLogits(nn.Cell):
 class InceptionV3(nn.Cell):
-    def __init__(self, num_classes=10, is_training=True):
+    def __init__(self, num_classes=10, is_training=True, has_bias=False, dropout_keep_prob=0.8):
         super(InceptionV3, self).__init__()
         self.is_training = is_training
-        self.Conv2d_1a = BasicConv2d(3, 32, kernel_size=3, stride=2, pad_mode='valid')
-        self.Conv2d_2a = BasicConv2d(32, 32, kernel_size=3, stride=1, pad_mode='valid')
-        self.Conv2d_2b = BasicConv2d(32, 64, kernel_size=3, stride=1)
+        self.Conv2d_1a = BasicConv2d(3, 32, kernel_size=3, stride=2, pad_mode='valid', has_bias=has_bias)
+        self.Conv2d_2a = BasicConv2d(32, 32, kernel_size=3, stride=1, pad_mode='valid', has_bias=has_bias)
+        self.Conv2d_2b = BasicConv2d(32, 64, kernel_size=3, stride=1, has_bias=has_bias)
         self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2)
-        self.Conv2d_3b = BasicConv2d(64, 80, kernel_size=1)
-        self.Conv2d_4a = BasicConv2d(80, 192, kernel_size=3, pad_mode='valid')
+        self.Conv2d_3b = BasicConv2d(64, 80, kernel_size=1, has_bias=has_bias)
+        self.Conv2d_4a = BasicConv2d(80, 192, kernel_size=3, pad_mode='valid', has_bias=has_bias)
         self.maxpool2 = nn.MaxPool2d(kernel_size=3, stride=2)
-        self.Mixed_5b = Inception_A(192, pool_features=32)
-        self.Mixed_5c = Inception_A(256, pool_features=64)
-        self.Mixed_5d = Inception_A(288, pool_features=64)
-        self.Mixed_6a = Inception_B(288)
-        self.Mixed_6b = Inception_C(768, channels_7x7=128)
-        self.Mixed_6c = Inception_C(768, channels_7x7=160)
-        self.Mixed_6d = Inception_C(768, channels_7x7=160)
-        self.Mixed_6e = Inception_C(768, channels_7x7=192)
-        self.Mixed_7a = Inception_D(768)
-        self.Mixed_7b = Inception_E(1280)
-        self.Mixed_7c = Inception_E(2048)
+        self.Mixed_5b = Inception_A(192, pool_features=32, has_bias=has_bias)
+        self.Mixed_5c = Inception_A(256, pool_features=64, has_bias=has_bias)
+        self.Mixed_5d = Inception_A(288, pool_features=64, has_bias=has_bias)
+        self.Mixed_6a = Inception_B(288, has_bias=has_bias)
+        self.Mixed_6b = Inception_C(768, channels_7x7=128, has_bias=has_bias)
+        self.Mixed_6c = Inception_C(768, channels_7x7=160, has_bias=has_bias)
+        self.Mixed_6d = Inception_C(768, channels_7x7=160, has_bias=has_bias)
+        self.Mixed_6e = Inception_C(768, channels_7x7=192, has_bias=has_bias)
+        self.Mixed_7a = Inception_D(768, has_bias=has_bias)
+        self.Mixed_7b = Inception_E(1280, has_bias=has_bias)
+        self.Mixed_7c = Inception_E(2048, has_bias=has_bias)
         if is_training:
             self.aux_logits = AuxLogits(768, num_classes)
-        self.logits = Logits(num_classes, dropout_keep_prob=0.5)
+        self.logits = Logits(num_classes, dropout_keep_prob)
 
     def construct(self, x):
         x = self.Conv2d_1a(x)
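
A minimal construction sketch (not part of the patch), mirroring the call train.py makes below with the Ascend config; with is_training=True the AuxLogits head is also built, on a 768-channel input (the Mixed_6e output width):

    from src.inception_v3 import InceptionV3

    net = InceptionV3(num_classes=1000, is_training=True,
                      has_bias=False, dropout_keep_prob=0.8)
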
diff --git a/model_zoo/official/cv/inceptionv3/train.py b/model_zoo/official/cv/inceptionv3/train.py
index bca02e6ee0..f2d2256eef 100644
--- a/model_zoo/official/cv/inceptionv3/train.py
+++ b/model_zoo/official/cv/inceptionv3/train.py
@@ -28,16 +28,18 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMoni
 from mindspore.train.model import Model
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore import dataset as de
+from mindspore.train.loss_scale_manager import FixedLossScaleManager
+from mindspore.common.initializer import XavierUniform, initializer
 
-from src.config import config_gpu as cfg
+from src.config import config_gpu, config_ascend
 from src.dataset import create_dataset
 from src.inception_v3 import InceptionV3
 from src.lr_generator import get_lr
 from src.loss import CrossEntropy
 
-random.seed(cfg.random_seed)
-np.random.seed(cfg.random_seed)
-de.config.set_seed(cfg.random_seed)
+random.seed(1)
+np.random.seed(1)
+de.config.set_seed(1)
 
 
 if __name__ == '__main__':
@@ -52,7 +54,7 @@
     context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.platform, save_graphs=False)
     if os.getenv('DEVICE_ID', "not_set").isdigit():
         context.set_context(device_id=int(os.getenv('DEVICE_ID')))
-
+    cfg = config_ascend if args_opt.platform == 'Ascend' else config_gpu
     # init distributed
     if args_opt.is_distributed:
         if args_opt.platform == "Ascend":
@@ -73,7 +75,7 @@
     batches_per_epoch = dataset.get_dataset_size()
 
     # network
-    net = InceptionV3(num_classes=cfg.num_classes)
+    net = InceptionV3(num_classes=cfg.num_classes, dropout_keep_prob=cfg.dropout_keep_prob, has_bias=cfg.has_bias)
 
     # loss
     loss = CrossEntropy(smooth_factor=cfg.smooth_factor, num_classes=cfg.num_classes, factor=cfg.aux_factor)
@@ -92,6 +94,11 @@
         else:
             no_decayed_params.append(param)
 
+    if args_opt.platform == "Ascend":
+        for param in net.trainable_params():
+            if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
+                np.random.seed(seed=1)
+                param.set_parameter_data(initializer(XavierUniform(), param.data.shape, param.data.dtype))
     group_params = [{'params': decayed_params, 'weight_decay': cfg.weight_decay},
                     {'params': no_decayed_params},
                     {'order_params': net.trainable_params()}]
@@ -104,7 +111,12 @@
     if args_opt.resume:
         ckpt = load_checkpoint(args_opt.resume)
         load_param_into_net(net, ckpt)
-    model = Model(net, loss_fn=loss, optimizer=optimizer, metrics={'acc'})
+    if args_opt.platform == "Ascend":
+        loss_scale_manager = FixedLossScaleManager(cfg.loss_scale, drop_overflow_update=False)
+        model = Model(net, loss_fn=loss, optimizer=optimizer, metrics={'acc'}, amp_level=cfg.amp_level,
+                      loss_scale_manager=loss_scale_manager)
+    else:
+        model = Model(net, loss_fn=loss, optimizer=optimizer, metrics={'acc'}, amp_level=cfg.amp_level)
 
     print("============== Starting Training ==============")
     loss_cb = LossMonitor(per_print_times=batches_per_epoch)
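
A condensed sketch (not part of the patch; net, loss, and optimizer assumed already built as above) of the mixed-precision wiring the last hunk adds: FixedLossScaleManager multiplies the loss by a constant factor (here 1024) so that float16 gradients do not underflow at 'O3', and drop_overflow_update=False keeps applying parameter updates even on steps where an overflow is detected:

    from mindspore.train.loss_scale_manager import FixedLossScaleManager
    from mindspore.train.model import Model

    loss_scale_manager = FixedLossScaleManager(1024, drop_overflow_update=False)
    model = Model(net, loss_fn=loss, optimizer=optimizer, metrics={'acc'},
                  amp_level='O3', loss_scale_manager=loss_scale_manager)
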