# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""DPN model train with MindSpore"""
import os
import argparse

from mindspore import context
from mindspore import Tensor
from mindspore.nn import SGD
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.train.model import Model
from mindspore.context import ParallelMode
from mindspore.train.callback import LossMonitor, ModelCheckpoint, CheckpointConfig, TimeMonitor
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.communication.management import init, get_group_size, get_rank
from mindspore.common import set_seed
from mindspore.train.serialization import load_checkpoint, load_param_into_net

from src.imagenet_dataset import classification_dataset
from src.dpn import dpns
from src.config import config
from src.lr_scheduler import get_lr_drop, get_lr_warmup
from src.crossentropy import CrossEntropy
from src.callbacks import SaveCallback

device_id = int(os.getenv('DEVICE_ID'))
set_seed(1)


def parse_args():
    """parameters"""
    parser = argparse.ArgumentParser('dpn training')
    # dataset related
    parser.add_argument('--data_dir', type=str, default='', help='Imagenet data dir')
    # network related
    parser.add_argument('--pretrained', default='', type=str, help='ckpt path to load')
    # distributed related
    parser.add_argument('--is_distributed', type=int, default=1, help='if multi device')
    parser.add_argument('--ckpt_path', type=str, default='', help='ckpt path to save')
    parser.add_argument('--eval_each_epoch', type=int, default=0, help='evaluate on each epoch')
    args, _ = parser.parse_known_args()
    args.image_size = config.image_size
    args.num_classes = config.num_classes
    args.lr_init = config.lr_init
    args.lr_max = config.lr_max
    args.factor = config.factor
    args.global_step = config.global_step
    args.epoch_number_to_drop = config.epoch_number_to_drop
    args.epoch_size = config.epoch_size
    args.warmup_epochs = config.warmup_epochs
    args.weight_decay = config.weight_decay
    args.momentum = config.momentum
    args.batch_size = config.batch_size
    args.num_parallel_workers = config.num_parallel_workers
    args.backbone = config.backbone
    args.loss_scale_num = config.loss_scale_num
    args.is_save_on_master = config.is_save_on_master
    args.rank = config.rank
    args.group_size = config.group_size
    args.dataset = config.dataset
    args.label_smooth = config.label_smooth
    args.label_smooth_factor = config.label_smooth_factor
    args.keep_checkpoint_max = config.keep_checkpoint_max
    args.lr_schedule = config.lr_schedule
    return args


def dpn_train(args):
    # init context
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
                        save_graphs=False, device_id=device_id)
    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()
        context.set_auto_parallel_context(device_num=args.group_size,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
    # select whether only the master rank saves ckpt or all ranks save, compatible with model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # create dataset
    args.train_dir = os.path.join(args.data_dir, 'train')
    args.eval_dir = os.path.join(args.data_dir, 'val')
    train_dataset = classification_dataset(args.train_dir,
                                           image_size=args.image_size,
                                           per_batch_size=args.batch_size,
                                           max_epoch=1,
                                           num_parallel_workers=args.num_parallel_workers,
                                           shuffle=True,
                                           rank=args.rank,
                                           group_size=args.group_size)
    if args.eval_each_epoch:
        print("create eval_dataset")
        eval_dataset = classification_dataset(args.eval_dir,
                                              image_size=args.image_size,
                                              per_batch_size=args.batch_size,
                                              max_epoch=1,
                                              num_parallel_workers=args.num_parallel_workers,
                                              shuffle=False,
                                              rank=args.rank,
                                              group_size=args.group_size,
                                              mode='eval')
    train_step_size = train_dataset.get_dataset_size()

    # choose net
    net = dpns[args.backbone](num_classes=args.num_classes)

    # load checkpoint
    if os.path.isfile(args.pretrained):
        print("load ckpt")
        load_param_into_net(net, load_checkpoint(args.pretrained))

    # learning rate schedule
    if args.lr_schedule == 'drop':
        print("lr_schedule:drop")
        lr = Tensor(get_lr_drop(global_step=args.global_step,
                                total_epochs=args.epoch_size,
                                steps_per_epoch=train_step_size,
                                lr_init=args.lr_init,
                                factor=args.factor))
    elif args.lr_schedule == 'warmup':
        print("lr_schedule:warmup")
        lr = Tensor(get_lr_warmup(global_step=args.global_step,
                                  total_epochs=args.epoch_size,
                                  steps_per_epoch=train_step_size,
                                  lr_init=args.lr_init,
                                  lr_max=args.lr_max,
                                  warmup_epochs=args.warmup_epochs))
    else:
        # fail fast instead of hitting a NameError on an undefined lr below
        raise ValueError("unsupported lr_schedule: {}".format(args.lr_schedule))

    # optimizer
    opt = SGD(net.trainable_params(), lr, momentum=args.momentum,
              weight_decay=args.weight_decay, loss_scale=args.loss_scale_num)
    # loss scale
    loss_scale = FixedLossScaleManager(args.loss_scale_num, False)

    # loss function
    if args.dataset == "imagenet-1K":
        print("Use SoftmaxCrossEntropyWithLogits")
        loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    else:
        if not args.label_smooth:
            args.label_smooth_factor = 0.0
        print("Use Label_smooth CrossEntropy")
        loss = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes)

    # create model
    model = Model(net, amp_level="O2", keep_batchnorm_fp32=False,
                  loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale,
                  metrics={'top_1_accuracy', 'top_5_accuracy'})

    # loss/time monitor & ckpt save callback
    loss_cb = LossMonitor()
    time_cb = TimeMonitor(data_size=train_step_size)
    cb = [loss_cb, time_cb]
    if args.rank_save_ckpt_flag:
        if args.eval_each_epoch:
            save_cb = SaveCallback(model, eval_dataset, args.ckpt_path)
            cb += [save_cb]
        else:
            config_ck = CheckpointConfig(save_checkpoint_steps=train_step_size,
                                         keep_checkpoint_max=args.keep_checkpoint_max)
            ckpoint_cb = ModelCheckpoint(prefix="dpn", directory=args.ckpt_path, config=config_ck)
            cb.append(ckpoint_cb)

    # train model
    model.train(args.epoch_size, train_dataset, callbacks=cb)


if __name__ == '__main__':
    dpn_train(parse_args())
    print('DPN training success!')
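
# Example invocation, kept as a sketch: the script name and dataset/checkpoint paths below are
# placeholders (not defined by this file); only the flags and the DEVICE_ID env var come from the
# code above. Distributed runs additionally rely on the Ascend/HCCL launch environment.
#
#   DEVICE_ID=0 python train.py --is_distributed=0 \
#       --data_dir=/path/to/imagenet --ckpt_path=./ckpt --eval_each_epoch=1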