# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Training Interface"""
|
|
import sys
|
|
import os
|
|
import argparse
|
|
import copy
|
|
|
|
from mindspore.communication.management import init, get_rank, get_group_size
|
|
from mindspore.train.model import ParallelMode, Model
|
|
from mindspore.train.callback import TimeMonitor
|
|
from mindspore.train.serialization import load_checkpoint, load_param_into_net
|
|
from mindspore.train.loss_scale_manager import FixedLossScaleManager
|
|
from mindspore.nn import SGD, RMSProp, Loss, Top1CategoricalAccuracy, \
|
|
Top5CategoricalAccuracy
|
|
from mindspore import context, Tensor
|
|
|
|
from src.dataset import create_dataset, create_dataset_val
|
|
from src.utils import add_weight_decay, count_params, str2bool, get_lr
|
|
from src.callback import EmaEvalCallBack, LossMonitor
|
|
from src.loss import LabelSmoothingCrossEntropy
|
|
from src.tinynet import tinynet
|
|
|
|
parser = argparse.ArgumentParser(description='Training')

# training parameters
parser.add_argument('--data_path', type=str, default="", metavar="DIR",
                    help='path to dataset')
parser.add_argument('--model', default='tinynet_c', type=str, metavar='MODEL',
                    help='Name of model to train (default: "tinynet_c")')
parser.add_argument('--num-classes', type=int, default=1000, metavar='N',
                    help='number of label classes (default: 1000)')
parser.add_argument('-b', '--batch-size', type=int, default=32, metavar='N',
                    help='input batch size for training (default: 32)')
parser.add_argument('--drop', type=float, default=0.0, metavar='DROP',
                    help='Dropout rate (default: 0.)')
parser.add_argument('--drop-connect', type=float, default=0.0, metavar='DROP',
                    help='Drop connect rate (default: 0.)')
parser.add_argument('--opt', default='sgd', type=str, metavar='OPTIMIZER',
                    help='Optimizer (default: "sgd")')
parser.add_argument('--opt-eps', default=1e-8, type=float, metavar='EPSILON',
                    help='Optimizer Epsilon (default: 1e-8)')
parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
                    help='SGD momentum (default: 0.9)')
parser.add_argument('--weight-decay', type=float, default=0.0001,
                    help='weight decay (default: 0.0001)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                    help='learning rate (default: 0.01)')
parser.add_argument('--warmup-lr', type=float, default=0.0001, metavar='LR',
                    help='warmup learning rate (default: 0.0001)')
parser.add_argument('--epochs', type=int, default=200, metavar='N',
                    help='number of epochs to train (default: 200)')
parser.add_argument('--decay-epochs', type=float, default=30, metavar='N',
                    help='epoch interval to decay LR')
parser.add_argument('--warmup-epochs', type=int, default=3, metavar='N',
                    help='epochs to warmup LR, if scheduler supports')
parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE',
                    help='LR decay rate (default: 0.1)')
parser.add_argument('--smoothing', type=float, default=0.1,
                    help='label smoothing (default: 0.1)')
parser.add_argument('--ema-decay', type=float, default=0,
                    help='decay factor for model weights moving average; \
                    training asserts this is > 0 (default: 0)')
parser.add_argument('--amp_level', type=str, default='O0')
parser.add_argument('--per_print_times', type=int, default=100)

# batch norm parameters
parser.add_argument('--bn-tf', action='store_true', default=False,
                    help='Use Tensorflow BatchNorm defaults for models that \
                    support it (default: False)')
parser.add_argument('--bn-momentum', type=float, default=None,
                    help='BatchNorm momentum override (if not None)')
parser.add_argument('--bn-eps', type=float, default=None,
                    help='BatchNorm epsilon override (if not None)')

# parallel parameters
parser.add_argument('-j', '--workers', type=int, default=4, metavar='N',
                    help='how many training processes to use (default: 4)')
parser.add_argument('--distributed', action='store_true', default=False)
parser.add_argument('--dataset_sink', action='store_true', default=True)

# checkpoint config
parser.add_argument('--ckpt', type=str, default=None)
parser.add_argument('--ckpt_save_epoch', type=int, default=1)
parser.add_argument('--loss_scale', type=int,
                    default=1024, help='static loss scale')
parser.add_argument('--train', type=str2bool, default=1, help='train or eval')
parser.add_argument('--GPU', action='store_true', default=False,
                    help='Use GPU for training (default: False)')
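
# A typical launch, with illustrative (not repo-provided) paths and values;
# note that main() asserts --ema-decay > 0:
#   python train.py --model tinynet_c --data_path /path/to/imagenet \
#       --GPU --ema-decay 0.9999
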
def main():
    """Main entrance for training"""
    args = parser.parse_args()
    print(sys.argv)
    devid, args.rank_id, args.rank_size = 0, 0, 1

    context.set_context(mode=context.GRAPH_MODE)

    if args.distributed:
        if args.GPU:
            init("nccl")
            context.set_context(device_target='GPU')
        else:
            init()
            # DEVICE_ID is expected to be exported by the Ascend launch script
            devid = int(os.getenv('DEVICE_ID'))
            context.set_context(device_target='Ascend',
                                device_id=devid,
                                reserve_class_name_in_scope=False)
        context.reset_auto_parallel_context()
        args.rank_id = get_rank()
        args.rank_size = get_group_size()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True,
                                          device_num=args.rank_size)
    else:
        if args.GPU:
            context.set_context(device_target='GPU')

    is_master = not args.distributed or (args.rank_id == 0)
    # parse model argument
    assert args.model.startswith(
        "tinynet"), "Only Tinynet models are supported."
    _, sub_name = args.model.split("_")
    net = tinynet(sub_model=sub_name,
                  num_classes=args.num_classes,
                  drop_rate=args.drop,
                  drop_connect_rate=args.drop_connect,
                  global_pool="avg",
                  bn_tf=args.bn_tf,
                  bn_momentum=args.bn_momentum,
                  bn_eps=args.bn_eps)

    if is_master:
        print("Total number of parameters:", count_params(net))
    # input image size of the network
    input_size = net.default_cfg['input_size'][1]
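
    # --data_path is expected to point at a root with ImageNet-style
    # `train/` and `val/` subdirectories, joined below.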
    train_dataset = val_dataset = None
    train_data_url = os.path.join(args.data_path, 'train')
    val_data_url = os.path.join(args.data_path, 'val')
    val_dataset = create_dataset_val(args.batch_size,
                                     val_data_url,
                                     workers=args.workers,
                                     distributed=False,
                                     input_size=input_size)

    # model.train() below runs unconditionally, so this script needs
    # --train 1 (the default); with --train 0, train_dataset and
    # batches_per_epoch would be left unset.
    if args.train:
        train_dataset = create_dataset(args.batch_size,
                                       train_data_url,
                                       workers=args.workers,
                                       distributed=args.distributed,
                                       input_size=input_size)
        batches_per_epoch = train_dataset.get_dataset_size()
    loss = LabelSmoothingCrossEntropy(
        smooth_factor=args.smoothing, num_classes=args.num_classes)
    time_cb = TimeMonitor(data_size=batches_per_epoch)
    loss_scale_manager = FixedLossScaleManager(
        args.loss_scale, drop_overflow_update=False)

    # per-step LR schedule: warmup from --warmup-lr, then decay by
    # --decay-rate every --decay-epochs
    lr_array = get_lr(base_lr=args.lr,
                      total_epochs=args.epochs,
                      steps_per_epoch=batches_per_epoch,
                      decay_epochs=args.decay_epochs,
                      decay_rate=args.decay_rate,
                      warmup_epochs=args.warmup_epochs,
                      warmup_lr_init=args.warmup_lr,
                      global_epoch=0)
    lr = Tensor(lr_array)

    loss_cb = LossMonitor(lr_array,
                          args.epochs,
                          per_print_times=args.per_print_times,
                          start_epoch=0)

    param_group = add_weight_decay(net, weight_decay=args.weight_decay)
    if args.opt == 'sgd':
        if is_master:
            print('Using SGD optimizer')
        optimizer = SGD(param_group,
                        learning_rate=lr,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay,
                        loss_scale=args.loss_scale)

    elif args.opt == 'rmsprop':
        if is_master:
            print('Using rmsprop optimizer')
        optimizer = RMSProp(param_group,
                            learning_rate=lr,
                            decay=0.9,
                            weight_decay=args.weight_decay,
                            momentum=args.momentum,
                            epsilon=args.opt_eps,
                            loss_scale=args.loss_scale)
    else:
        # previously an unknown --opt left `optimizer` undefined and
        # crashed later with a NameError; fail fast instead
        raise ValueError('Unsupported optimizer: %s' % args.opt)
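
    # Keep the loss in fp32 regardless of amp_level (which may cast the
    # network to fp16); the static loss scale above offsets fp16 underflow.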
    loss.add_flags_recursive(fp32=True, fp16=False)
    eval_metrics = {'Validation-Loss': Loss(),
                    'Top1-Acc': Top1CategoricalAccuracy(),
                    'Top5-Acc': Top5CategoricalAccuracy()}

    if args.ckpt:
        ckpt = load_checkpoint(args.ckpt)
        load_param_into_net(net, ckpt)
        net.set_train(False)
    model = Model(net, loss, optimizer, metrics=eval_metrics,
                  loss_scale_manager=loss_scale_manager,
                  amp_level=args.amp_level)

    # EmaEvalCallBack keeps an exponential moving average of the weights
    # and evaluates/saves it; TinyNet training relies on EMA being enabled.
    net_ema = copy.deepcopy(net)
    net_ema.set_train(False)
    assert args.ema_decay > 0, "EMA should be used in tinynet training."

    ema_cb = EmaEvalCallBack(network=net,
                             ema_network=net_ema,
                             loss_fn=loss,
                             eval_dataset=val_dataset,
                             decay=args.ema_decay,
                             save_epoch=args.ckpt_save_epoch,
                             dataset_sink_mode=args.dataset_sink,
                             start_epoch=0)
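
    # Attach logging/eval callbacks on the master rank only, so that
    # multi-device runs do not duplicate console output or checkpoints.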
    callbacks = [loss_cb, ema_cb, time_cb] if is_master else []

    if is_master:
        print("Training on " + args.model
              + " with " + str(args.num_classes) + " classes")

    model.train(args.epochs, train_dataset, callbacks=callbacks,
                dataset_sink_mode=args.dataset_sink)


if __name__ == '__main__':
    main()