GPU inceptionv3 support in modelzoo

5 years ago · 83580c4dec
parent 6c4ee3f3d1
commit 83580c4dec
12 changed files with 895 additions and 0 deletions
--- a/model_zoo/official/cv/inceptionv3/README.md
+++ b/model_zoo/official/cv/inceptionv3/README.md
@ -0,0 +1,115 @@
+# Inception-v3 Example
+
+## Description
+
+This is an example of training Inception-v3 in MindSpore.
+
+## Requirements
+
+- Install [MindSpore](http://www.mindspore.cn/install/en).
+- Download the dataset.
+
+## Structure
+
+```shell
+.
+└─Inception-v3      
+  ├─README.md
+  ├─scripts      
+    ├─run_standalone_train_for_gpu.sh         # launch standalone training with gpu platform(1p)
+    ├─run_distribute_train_for_gpu.sh         # launch distributed training with gpu platform(8p)
+    └─run_eval_for_gpu.sh                     # launch evaluating with gpu platform
+  ├─src
+    ├─config.py                       # parameter configuration
+    ├─dataset.py                      # data preprocessing
+    ├─inception_v3.py                 # network definition
+    ├─loss.py                         # Customized CrossEntropy loss function
+    ├─lr_generator.py                 # learning rate generator
+  ├─eval.py                           # eval net
+  ├─export.py                         # convert checkpoint
+  └─train.py                          # train net
+  
+```
+
+## Parameter Configuration
+
+Parameters for both training and evaluation can be set in `src/config.py`.
+
+```       
+'random_seed': 1,                # fix random seed
+'rank': 0,                       # local rank of distributed
+'group_size': 1,                 # world size of distributed
+'work_nums': 8,                  # number of workers to read the data
+'decay_method': 'cosine',        # learning rate scheduler mode
+"loss_scale": 1,                 # loss scale
+'batch_size': 128,               # input batchsize
+'epoch_size': 250,               # total epoch numbers
+'num_classes': 1000,             # dataset class numbers
+'smooth_factor': 0.1,            # label smoothing factor
+'aux_factor': 0.2,               # loss factor of aux logit
+'lr_init': 0.00004,              # initial learning rate
+'lr_max': 0.4,                   # max bound of learning rate
+'lr_end': 0.000004,              # min bound of learning rate
+'warmup_epochs': 1,              # warmup epoch numbers
+'weight_decay': 0.00004,         # weight decay
+'momentum': 0.9,                 # momentum
+'opt_eps': 1.0,                  # epsilon
+'keep_checkpoint_max': 100,      # max numbers to keep checkpoints
+'ckpt_path': './checkpoint/',    # save checkpoint path
+'is_save_on_master': 0           # distributed training only: 1 saves checkpoints on rank 0 only, 0 saves on every rank
+```
+
+
+
+## Running the example
+
+### Train
+
+#### Usage
+
+```
+# distribute training example(8p)
+sh run_distribute_train_for_gpu.sh DATA_DIR 
+# standalone training
+sh run_standalone_train_for_gpu.sh DEVICE_ID DATA_DIR
+```
+
+#### Launch
+
+```bash
+# distributed training example(8p) for GPU
+sh scripts/run_distribute_train_for_gpu.sh /dataset/train
+# standalone training example for GPU
+sh scripts/run_standalone_train_for_gpu.sh 0 /dataset/train
+```
+
+#### Result
+
+Checkpoint files are saved under `ckpt_path` (`./checkpoint/` by default), and the training output is written to `train.log`.
+
+### Evaluation
+
+#### Usage
+
+```
+# Evaluation
+sh run_eval_for_gpu.sh DEVICE_ID DATA_DIR PATH_CHECKPOINT
+```
+
+#### Launch
+
+```bash
+# Evaluation with checkpoint
+sh scripts/run_eval_for_gpu.sh 0 /dataset/val ./checkpoint/inceptionv3-rank3-247_1251.ckpt
+```
+
+> Checkpoints are produced during the training process.
+
+#### Result
+
+The evaluation result is written to `eval.log` in the directory from which the script is launched. In this log you can find results like the following.
+ 
+```
+acc=78.75%(TOP1)
+acc=94.07%(TOP5)
+```
--- a/model_zoo/official/cv/inceptionv3/eval.py
+++ b/model_zoo/official/cv/inceptionv3/eval.py
@ -0,0 +1,53 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""evaluate_imagenet"""
+import argparse
+import os
+
+import mindspore.nn as nn
+from mindspore import context
+from mindspore.train.model import Model
+from mindspore.train.serialization import load_checkpoint, load_param_into_net
+
+from src.config import config_gpu as cfg
+from src.dataset import create_dataset
+from src.inception_v3 import InceptionV3
+from src.loss import CrossEntropy_Val
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='image classification evaluation')
+    parser.add_argument('--checkpoint', type=str, default='', help='checkpoint of inception-v3 (Default: None)')
+    parser.add_argument('--dataset_path', type=str, default='', help='Dataset path')
+    parser.add_argument('--platform', type=str, default='GPU', choices=('Ascend', 'GPU'), help='run platform')
+    args_opt = parser.parse_args()
+
+    if args_opt.platform == 'Ascend':
+        device_id = int(os.getenv('DEVICE_ID'))
+        context.set_context(device_id=device_id)
+
+    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.platform)
+    net = InceptionV3(num_classes=cfg.num_classes, is_training=False)
+    ckpt = load_checkpoint(args_opt.checkpoint)
+    load_param_into_net(net, ckpt)
+    net.set_train(False)
+    dataset = create_dataset(args_opt.dataset_path, False, 0, 1)
+    loss = CrossEntropy_Val(smooth_factor=0.1, num_classes=cfg.num_classes)
+    eval_metrics = {'Loss': nn.Loss(),
+                    'Top1-Acc': nn.Top1CategoricalAccuracy(),
+                    'Top5-Acc': nn.Top5CategoricalAccuracy()}
+    model = Model(net, loss, optimizer=None, metrics=eval_metrics)
+    metrics = model.eval(dataset)
+    print("metric: ", metrics)
--- a/model_zoo/official/cv/inceptionv3/export.py
+++ b/model_zoo/official/cv/inceptionv3/export.py
@ -0,0 +1,40 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+############## export checkpoint file into GEIR and ONNX models #################
+"""
+import argparse
+import numpy as np
+
+import mindspore as ms
+from mindspore import Tensor
+from mindspore.train.serialization import load_checkpoint, load_param_into_net, export
+
+from src.config import config_gpu as cfg
+from src.inception_v3 import InceptionV3
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='checkpoint export')
+    parser.add_argument('--checkpoint', type=str, default='', help='checkpoint of inception-v3 (Default: None)')
+    args_opt = parser.parse_args()
+
+    net = InceptionV3(num_classes=cfg.num_classes, is_training=False)
+    param_dict = load_checkpoint(args_opt.checkpoint)
+    load_param_into_net(net, param_dict)
+
+    input_arr = Tensor(np.random.uniform(0.0, 1.0, size=[1, 3, 299, 299]), ms.float32)
+    export(net, input_arr, file_name=cfg.onnx_filename, file_format="ONNX")
+    export(net, input_arr, file_name=cfg.geir_filename, file_format="GEIR")
--- a/model_zoo/official/cv/inceptionv3/scripts/run_distribute_train_for_gpu.sh
+++ b/model_zoo/official/cv/inceptionv3/scripts/run_distribute_train_for_gpu.sh
@ -0,0 +1,17 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+DATA_DIR=$1
+mpirun --allow-run-as-root -n 8 python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 &
--- a/model_zoo/official/cv/inceptionv3/scripts/run_eval_for_gpu.sh
+++ b/model_zoo/official/cv/inceptionv3/scripts/run_eval_for_gpu.sh
@ -0,0 +1,19 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+DEVICE_ID=$1
+DATA_DIR=$2
+PATH_CHECKPOINT=$3
+CUDA_VISIBLE_DEVICES=$DEVICE_ID python ./eval.py --platform 'GPU' --dataset_path $DATA_DIR --checkpoint $PATH_CHECKPOINT > eval.log 2>&1 &
--- a/model_zoo/official/cv/inceptionv3/scripts/run_standalone_train_for_gpu.sh
+++ b/model_zoo/official/cv/inceptionv3/scripts/run_standalone_train_for_gpu.sh
@ -0,0 +1,19 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+DEVICE_ID=$1
+DATA_DIR=$2
+CUDA_VISIBLE_DEVICES=$DEVICE_ID python ./train.py --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 &
+
--- a/model_zoo/official/cv/inceptionv3/src/config.py
+++ b/model_zoo/official/cv/inceptionv3/src/config.py
@ -0,0 +1,43 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+network config setting, will be used in train.py, eval.py and export.py
+"""
+from easydict import EasyDict as edict
+
+
+# Hyper-parameters and runtime settings for the GPU configuration.
+config_gpu = edict({
+    'random_seed': 1,  # seed applied to random, numpy and the dataset pipeline in train.py
+    'rank': 0,  # rank of this process; overwritten in train.py
+    'group_size': 1,  # number of processes; overwritten in train.py
+    'work_nums': 8,  # parallel workers used to read and map the dataset
+    'decay_method': 'cosine',  # learning rate decay mode passed to get_lr
+    "loss_scale": 1,  # loss scale passed to the optimizer
+    'batch_size': 128,  # batch size of the dataset
+    'epoch_size': 250,  # total number of training epochs
+    'num_classes': 1000,  # number of dataset classes
+    'smooth_factor': 0.1,  # label smoothing factor of the training loss
+    'aux_factor': 0.2,  # weight of the auxiliary-logit loss
+    'lr_init': 0.00004,  # initial learning rate
+    'lr_max': 0.4,  # max bound of learning rate
+    'lr_end': 0.000004,  # min bound of learning rate
+    'warmup_epochs': 1,  # number of warmup epochs
+    'weight_decay': 0.00004,  # weight decay
+    'momentum': 0.9,  # optimizer momentum
+    'opt_eps': 1.0,  # optimizer epsilon
+    'keep_checkpoint_max': 100,  # max number of checkpoints to keep
+    'ckpt_path': './checkpoint/',  # directory checkpoints are saved to
+    'is_save_on_master': 0  # distributed only: non-zero saves checkpoints on rank 0 only
+})
--- a/model_zoo/official/cv/inceptionv3/src/dataset.py
+++ b/model_zoo/official/cv/inceptionv3/src/dataset.py
@ -0,0 +1,69 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+Data operations, will be used in train.py and eval.py
+"""
+import mindspore.common.dtype as mstype
+import mindspore.dataset.engine as de
+import mindspore.dataset.transforms.c_transforms as C2
+import mindspore.dataset.transforms.vision.c_transforms as C
+from src.config import config_gpu as cfg
+
+
+def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1):
+    """
+    create a train or eval dataset
+
+    Args:
+        dataset_path(string): the path of dataset.
+        do_train(bool): whether dataset is used for train or eval.
+        rank (int): The shard ID within num_shards (default=None).
+        group_size (int): Number of shards that the dataset should be divided into (default=None).
+        repeat_num(int): the repeat times of dataset. Default: 1.
+
+    Returns:
+        dataset
+    """
+    if group_size == 1:
+        ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True)
+    else:
+        ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True,
+                                     num_shards=group_size, shard_id=rank)
+    # define map operations
+    if do_train:
+        trans = [
+            C.RandomCropDecodeResize(299, scale=(0.08, 1.0), ratio=(0.75, 1.333)),
+            C.RandomHorizontalFlip(prob=0.5),
+            C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4)
+            ]
+    else:
+        trans = [
+            C.Decode(),
+            C.Resize(299),
+            C.CenterCrop(299)
+            ]
+    trans += [
+        C.Rescale(1.0 / 255.0, 0.0),
+        C.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+        C.HWC2CHW()
+    ]
+    type_cast_op = C2.TypeCast(mstype.int32)
+    ds = ds.map(input_columns="image", operations=trans, num_parallel_workers=cfg.work_nums)
+    ds = ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=cfg.work_nums)
+    # apply batch operations
+    ds = ds.batch(cfg.batch_size, drop_remainder=True)
+    # apply dataset repeat operation
+    ds = ds.repeat(repeat_num)
+    return ds
--- a/model_zoo/official/cv/inceptionv3/src/inception_v3.py
+++ b/model_zoo/official/cv/inceptionv3/src/inception_v3.py
--- a/model_zoo/official/cv/inceptionv3/src/loss.py
+++ b/model_zoo/official/cv/inceptionv3/src/loss.py
@ -0,0 +1,60 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""define loss function for network."""
+from mindspore.nn.loss.loss import _Loss
+from mindspore.ops import operations as P
+from mindspore.ops import functional as F
+from mindspore import Tensor
+from mindspore.common import dtype as mstype
+import mindspore.nn as nn
+
+
+class CrossEntropy(_Loss):
+    """the redefined loss function with SoftmaxCrossEntropyWithLogits"""
+    def __init__(self, smooth_factor=0, num_classes=1000, factor=0.4):
+        super(CrossEntropy, self).__init__()
+        self.factor = factor
+        self.onehot = P.OneHot()
+        self.on_value = Tensor(1.0 - smooth_factor, mstype.float32)
+        self.off_value = Tensor(1.0 * smooth_factor / (num_classes - 1), mstype.float32)
+        self.ce = nn.SoftmaxCrossEntropyWithLogits()
+        self.mean = P.ReduceMean(False)
+
+    def construct(self, logits, label):
+        logit, aux = logits
+        one_hot_label = self.onehot(label, F.shape(logit)[1], self.on_value, self.off_value)
+        loss_logit = self.ce(logit, one_hot_label)
+        loss_logit = self.mean(loss_logit, 0)
+        one_hot_label_aux = self.onehot(label, F.shape(aux)[1], self.on_value, self.off_value)
+        loss_aux = self.ce(aux, one_hot_label_aux)
+        loss_aux = self.mean(loss_aux, 0)
+        return loss_logit + self.factor*loss_aux
+
+
+class CrossEntropy_Val(_Loss):
+    """the redefined loss function with SoftmaxCrossEntropyWithLogits, will be used in inference process"""
+    def __init__(self, smooth_factor=0, num_classes=1000):
+        super(CrossEntropy_Val, self).__init__()
+        self.onehot = P.OneHot()
+        self.on_value = Tensor(1.0 - smooth_factor, mstype.float32)
+        self.off_value = Tensor(1.0 * smooth_factor / (num_classes - 1), mstype.float32)
+        self.ce = nn.SoftmaxCrossEntropyWithLogits()
+        self.mean = P.ReduceMean(False)
+
+    def construct(self, logits, label):
+        one_hot_label = self.onehot(label, F.shape(logits)[1], self.on_value, self.off_value)
+        loss_logit = self.ce(logits, one_hot_label)
+        loss_logit = self.mean(loss_logit, 0)
+        return loss_logit
--- a/model_zoo/official/cv/inceptionv3/src/lr_generator.py
+++ b/model_zoo/official/cv/inceptionv3/src/lr_generator.py
@ -0,0 +1,87 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""learning rate generator"""
+import math
+import numpy as np
+
+
+def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode):
+    """
+    generate learning rate array
+
+    Args:
+       lr_init(float): init learning rate
+       lr_end(float): end learning rate
+       lr_max(float): max learning rate
+       warmup_epochs(int): number of warmup epochs
+       total_epochs(int): total epoch of training
+       steps_per_epoch(int): steps of one epoch
+       lr_decay_mode(string): learning rate decay mode, including steps, poly, cosine or default
+
+    Returns:
+       np.array, learning rate array
+    """
+    lr_each_step = []
+    total_steps = steps_per_epoch * total_epochs
+    warmup_steps = steps_per_epoch * warmup_epochs
+    if lr_decay_mode == 'steps':
+        decay_epoch_index = [0.3 * total_steps, 0.6 * total_steps, 0.8 * total_steps]
+        for i in range(total_steps):
+            if i < warmup_steps:
+                lr = lr_init + (lr_max - lr_init) * i / warmup_steps
+            else:
+                if i < decay_epoch_index[0]:
+                    lr = lr_max
+                elif i < decay_epoch_index[1]:
+                    lr = lr_max * 0.1
+                elif i < decay_epoch_index[2]:
+                    lr = lr_max * 0.01
+                else:
+                    lr = lr_max * 0.001
+            lr_each_step.append(lr)
+    elif lr_decay_mode == 'steps_decay':
+        if warmup_steps != 0:
+            inc_each_step = (float(lr_max) - float(lr_init)) / float(warmup_steps)
+        else:
+            inc_each_step = 0
+        for i in range(total_steps):
+            if i < warmup_steps:
+                lr = float(lr_init) + inc_each_step * float(i)
+            else:
+                decay_nums = math.floor((float(i-warmup_steps)/steps_per_epoch) / 2)
+                decay_rate = pow(0.94, decay_nums)
+                lr = float(lr_max)*decay_rate
+                if lr < 0.0:
+                    lr = 0.0
+            lr_each_step.append(lr)
+    elif lr_decay_mode == 'cosine':
+        decay_steps = total_steps - warmup_steps
+        for i in range(total_steps):
+            if i < warmup_steps:
+                lr_inc = (float(lr_max) - float(lr_init)) / float(warmup_steps)
+                lr = float(lr_init) + lr_inc * (i + 1)
+            else:
+                cosine_decay = 0.5 * (1 + math.cos(math.pi * (i-warmup_steps) / decay_steps))
+                lr = (lr_max-lr_end)*cosine_decay + lr_end
+            lr_each_step.append(lr)
+    else:
+        for i in range(total_steps):
+            if i < warmup_steps:
+                lr = lr_init + (lr_max - lr_init) * i / warmup_steps
+            else:
+                lr = lr_max - (lr_max - lr_end) * (i - warmup_steps) / (total_steps - warmup_steps)
+            lr_each_step.append(lr)
+    learning_rate = np.array(lr_each_step).astype(np.float32)
+    return learning_rate
--- a/model_zoo/official/cv/inceptionv3/train.py
+++ b/model_zoo/official/cv/inceptionv3/train.py
@ -0,0 +1,116 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""train_imagenet."""
+import argparse
+import os
+import random
+import numpy as np
+
+import mindspore.nn as nn
+from mindspore import Tensor
+from mindspore import context
+from mindspore import ParallelMode
+from mindspore.communication.management import init, get_rank, get_group_size
+from mindspore.nn.optim.rmsprop import RMSProp
+from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
+from mindspore.train.model import Model
+from mindspore.train.serialization import load_checkpoint, load_param_into_net
+from mindspore import dataset as de
+
+from src.config import config_gpu as cfg
+from src.dataset import create_dataset
+from src.inception_v3 import InceptionV3
+from src.lr_generator import get_lr
+from src.loss import CrossEntropy
+
+random.seed(cfg.random_seed)
+np.random.seed(cfg.random_seed)
+de.config.set_seed(cfg.random_seed)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='image classification training')
+    parser.add_argument('--dataset_path', type=str, default='', help='Dataset path')
+    parser.add_argument('--resume', type=str, default='', help='resume training with existed checkpoint')
+    parser.add_argument('--is_distributed', action='store_true', default=False,
+                        help='distributed training')
+    parser.add_argument('--platform', type=str, default='GPU', choices=('Ascend', 'GPU'), help='run platform')
+    args_opt = parser.parse_args()
+
+    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.platform, save_graphs=False)
+    if os.getenv('DEVICE_ID', "not_set").isdigit():
+        context.set_context(device_id=int(os.getenv('DEVICE_ID')))
+
+    # init distributed
+    if args_opt.is_distributed:
+        if args_opt.platform == "Ascend":
+            init()
+        else:
+            init("nccl")
+        cfg.rank = get_rank()
+        cfg.group_size = get_group_size()
+        parallel_mode = ParallelMode.DATA_PARALLEL
+        context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=cfg.group_size,
+                                          parameter_broadcast=True, mirror_mean=True)
+    else:
+        cfg.rank = 0
+        cfg.group_size = 1
+
+    # dataloader
+    dataset = create_dataset(args_opt.dataset_path, True, cfg.rank, cfg.group_size)
+    batches_per_epoch = dataset.get_dataset_size()
+
+    # network
+    net = InceptionV3(num_classes=cfg.num_classes)
+
+    # loss
+    loss = CrossEntropy(smooth_factor=cfg.smooth_factor, num_classes=cfg.num_classes, factor=cfg.aux_factor)
+
+    # learning rate schedule
+    lr = get_lr(lr_init=cfg.lr_init, lr_end=cfg.lr_end, lr_max=cfg.lr_max, warmup_epochs=cfg.warmup_epochs,
+                total_epochs=cfg.epoch_size, steps_per_epoch=batches_per_epoch, lr_decay_mode=cfg.decay_method)
+    lr = Tensor(lr)
+
+    # optimizer
+    decayed_params = list(filter(lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name, net.trainable_params()))
+    no_decayed_params = [param for param in net.trainable_params() if param not in decayed_params]
+    group_params = [{'params': decayed_params, 'weight_decay': cfg.weight_decay},
+                    {'params': no_decayed_params},
+                    {'order_params': net.trainable_params()}]
+    optimizer = RMSProp(group_params, lr, decay=0.9, weight_decay=cfg.weight_decay,
+                        momentum=cfg.momentum, epsilon=cfg.opt_eps, loss_scale=cfg.loss_scale)
+    eval_metrics = {'Loss': nn.Loss(),
+                    'Top1-Acc': nn.Top1CategoricalAccuracy(),
+                    'Top5-Acc': nn.Top5CategoricalAccuracy()}
+
+    if args_opt.resume:
+        ckpt = load_checkpoint(args_opt.resume)
+        load_param_into_net(net, ckpt)
+    model = Model(net, loss_fn=loss, optimizer=optimizer, metrics={'acc'})
+
+    print("============== Starting Training ==============")
+    loss_cb = LossMonitor(per_print_times=batches_per_epoch)
+    time_cb = TimeMonitor(data_size=batches_per_epoch)
+    callbacks = [loss_cb, time_cb]
+    config_ck = CheckpointConfig(save_checkpoint_steps=batches_per_epoch, keep_checkpoint_max=cfg.keep_checkpoint_max)
+    ckpoint_cb = ModelCheckpoint(prefix=f"inceptionv3-rank{cfg.rank}", directory=cfg.ckpt_path, config=config_ck)
+    if args_opt.is_distributed & cfg.is_save_on_master:
+        if cfg.rank == 0:
+            callbacks.append(ckpoint_cb)
+        model.train(cfg.epoch_size, dataset, callbacks=callbacks, dataset_sink_mode=True)
+    else:
+        callbacks.append(ckpoint_cb)
+        model.train(cfg.epoch_size, dataset, callbacks=callbacks, dataset_sink_mode=True)
+    print("train success")