# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train imagenet"""
import os
import argparse
import math
import numpy as np

from mindspore.communication import init, get_rank
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor, LossMonitor
from mindspore.train.model import ParallelMode
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore import Model
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.nn import RMSProp
from mindspore import Tensor
from mindspore import context
from mindspore.common import set_seed
from mindspore.common.initializer import XavierUniform, initializer
from mindspore.train.serialization import load_checkpoint, load_param_into_net

from src.inceptionv4 import Inceptionv4
from src.dataset import create_dataset, device_num
from src.config import config

os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

set_seed(1)


def generate_cosine_lr(steps_per_epoch, total_epochs,
                       lr_init=config.lr_init,
                       lr_end=config.lr_end,
                       lr_max=config.lr_max,
                       warmup_epochs=config.warmup_epochs):
    """
    Generate a per-step learning rate array: linear warmup, then cosine decay.

    During the warmup steps the rate rises linearly from `lr_init` towards
    `lr_max`; afterwards it follows half a cosine period from `lr_max` down
    to `lr_end`. The leading `steps_per_epoch * (config.start_epoch - 1)`
    entries are dropped so the returned array starts at the epoch training
    begins from (`config.start_epoch` is treated as 1-based).

    Args:
        steps_per_epoch(int): steps number per epoch.
        total_epochs(int): all epoch in training.
        lr_init(float): init learning rate.
        lr_end(float): end learning rate.
        lr_max(float): max learning rate.
        warmup_epochs(int): number of epochs used for warmup.

    Returns:
        np.array, float32 learning rate array, one entry per remaining step.
    """
    total_steps = steps_per_epoch * total_epochs
    warmup_steps = steps_per_epoch * warmup_epochs
    decay_steps = total_steps - warmup_steps

    # The warmup increment does not depend on the step index, so compute it
    # once. It is only defined (and only used) when there are warmup steps.
    lr_inc = 0.0
    if warmup_steps > 0:
        lr_inc = (float(lr_max) - float(lr_init)) / float(warmup_steps)

    lr_each_step = []
    for i in range(total_steps):
        if i < warmup_steps:
            lr = float(lr_init) + lr_inc * (i + 1)
        else:
            # This branch is only reached when decay_steps > 0.
            cosine_decay = 0.5 * (1 + math.cos(math.pi * (i - warmup_steps) / decay_steps))
            lr = (lr_max - lr_end) * cosine_decay + lr_end
        lr_each_step.append(lr)
    learning_rate = np.array(lr_each_step).astype(np.float32)

    # Clamp at 0: a negative offset (e.g. start_epoch == 0) would otherwise be
    # interpreted as a slice from the end and silently drop most of the schedule.
    current_step = max(steps_per_epoch * (config.start_epoch - 1), 0)
    return learning_rate[current_step:]


def inception_v4_train(args=None):
    """
    Train Inceptionv4 in data parallelism.

    Args:
        args(argparse.Namespace): parsed command line options providing
            `dataset_path`, `device_id`, `platform` and `resume`. When None,
            the options are parsed from the command line via `parse_args()`.

    Raises:
        ValueError: if `args.platform` is neither "Ascend" nor "GPU".
    """
    if args is None:
        args = parse_args()

    print('epoch_size: {} batch_size: {} class_num {}'.format(config.epoch_size, config.batch_size,
                                                              config.num_classes))

    context.set_context(mode=context.GRAPH_MODE, device_target=args.platform)
    if args.platform == "Ascend":
        context.set_context(device_id=args.device_id)
    context.set_context(enable_graph_kernel=False)

    # Initialise the communication backend only for multi-device runs.
    rank = 0
    if device_num > 1:
        if args.platform == "Ascend":
            init(backend_name='hccl')
        elif args.platform == "GPU":
            init()
        else:
            raise ValueError("Unsupported device target.")
        rank = get_rank()
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True,
                                          all_reduce_fusion_config=[200, 400])

    # create dataset
    train_dataset = create_dataset(dataset_path=args.dataset_path, do_train=True,
                                   repeat_num=1, batch_size=config.batch_size, shard_id=rank)
    train_step_size = train_dataset.get_dataset_size()

    # create model
    net = Inceptionv4(classes=config.num_classes)

    # loss
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")

    # learning rate
    lr = Tensor(generate_cosine_lr(steps_per_epoch=train_step_size, total_epochs=config.epoch_size))

    # Split parameters by name: anything whose name contains 'beta', 'gamma'
    # or 'bias' goes to the second group; all others are re-initialised with
    # Xavier uniform and placed in the weight-decay group.
    no_decay_markers = ('beta', 'gamma', 'bias')
    decayed_params = []
    no_decayed_params = []
    for param in net.trainable_params():
        if any(marker in param.name for marker in no_decay_markers):
            no_decayed_params.append(param)
        else:
            param.set_data(initializer(XavierUniform(), param.data.shape, param.data.dtype))
            decayed_params.append(param)

    # NOTE(review): the second group sets no 'weight_decay' key while the
    # optimizer below is given weight_decay=config.weight_decay; check the
    # MindSpore group-parameter rules to confirm these parameters really end
    # up without decay — TODO confirm.
    group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay},
                    {'params': no_decayed_params},
                    {'order_params': net.trainable_params()}]

    opt = RMSProp(group_params, lr, decay=config.decay, epsilon=config.epsilon,
                  weight_decay=config.weight_decay, momentum=config.momentum,
                  loss_scale=config.loss_scale)

    if args.device_id == 0:
        print(lr)
        print(train_step_size)

    # Checkpoint weights (if any) are loaded after the Xavier initialisation above.
    if args.resume:
        ckpt = load_checkpoint(args.resume)
        load_param_into_net(net, ckpt)

    loss_scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)

    # The two platforms differ only in the mixed-precision level.
    if args.platform == "Ascend":
        amp_level = config.amp_level
    elif args.platform == "GPU":
        amp_level = 'O0'
    else:
        raise ValueError("Unsupported device target.")
    model = Model(net, loss_fn=loss, optimizer=opt,
                  metrics={'acc', 'top_1_accuracy', 'top_5_accuracy'},
                  loss_scale_manager=loss_scale_manager, amp_level=amp_level)

    # define callbacks
    performance_cb = TimeMonitor(data_size=train_step_size)
    loss_cb = LossMonitor(per_print_times=train_step_size)
    ckp_save_step = config.save_checkpoint_epochs * train_step_size
    config_ck = CheckpointConfig(save_checkpoint_steps=ckp_save_step,
                                 keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix=f"inceptionV4-train-rank{rank}",
                                 directory='ckpts_rank_' + str(rank), config=config_ck)
    callbacks = [performance_cb, loss_cb]

    # Save checkpoints everywhere unless this is a multi-device run restricted
    # to the master, in which case only device 0 saves.
    # NOTE(review): the "master" check uses args.device_id rather than rank —
    # confirm this matches how multi-device jobs are launched.
    master_only = device_num > 1 and config.is_save_on_master
    if not master_only or args.device_id == 0:
        callbacks.append(ckpoint_cb)

    # train model
    model.train(config.epoch_size, train_dataset, callbacks=callbacks, dataset_sink_mode=True)


def parse_args():
    """
    Parse the command line options of the training script.

    Returns:
        argparse.Namespace, with `dataset_path`, `device_id`, `platform` and `resume`.
    """
    arg_parser = argparse.ArgumentParser(description='InceptionV4 image classification training')
    arg_parser.add_argument('--dataset_path', type=str, default='', help='Dataset path')
    arg_parser.add_argument('--device_id', type=int, default=0, help='device id')
    arg_parser.add_argument('--platform', type=str, default='Ascend', choices=("Ascend", "GPU"),
                            help='Platform, support Ascend, GPU.')
    arg_parser.add_argument('--resume', type=str, default='',
                            help='resume training with existed checkpoint')
    args_opt = arg_parser.parse_args()
    return args_opt


if __name__ == '__main__':
    inception_v4_train(parse_args())
    print('Inceptionv4 training success!')