!1899 add resnet50_imagenet 8p st
Merge pull request !1899 from zhaoting/ResNet50_stpull/1899/MERGE
commit
792d1a444c
@ -0,0 +1,47 @@
|
|||||||
|
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
"""
|
||||||
|
network config setting, will be used in train.py and eval.py
|
||||||
|
"""
|
||||||
|
from easydict import EasyDict as ed
|
||||||
|
|
||||||
|
# Network/training settings exposed as attribute-accessible EasyDict; used by train.py and eval.py.
config = ed(dict(
    # model output and batching
    class_num=1001,
    batch_size=32,
    eval_interval=1,
    eval_batch_size=50,
    # optimiser
    loss_scale=1024,
    momentum=0.9,
    weight_decay=1e-4,
    use_nesterov=True,
    # schedule length and input pipeline
    epoch_size=90,
    pretrained_epoch_size=1,
    buffer_size=1000,
    image_height=224,
    image_width=224,
    # checkpointing
    save_checkpoint=False,
    save_checkpoint_epochs=5,
    keep_checkpoint_max=10,
    save_checkpoint_path="./",
    # learning-rate schedule and label smoothing
    warmup_epochs=0,
    lr_decay_mode="cosine",
    use_label_smooth=True,
    label_smooth_factor=0.1,
    lr_init=0,
    lr_max=0.1,
    # LARS
    use_lars=True,
    lars_epsilon=1e-8,
    lars_coefficient=0.001,
))
|
@ -0,0 +1,79 @@
|
|||||||
|
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
"""create train or eval dataset."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import mindspore.common.dtype as mstype
|
||||||
|
import mindspore.dataset.engine as de
|
||||||
|
import mindspore.dataset.transforms.vision.c_transforms as C
|
||||||
|
import mindspore.dataset.transforms.c_transforms as C2
|
||||||
|
|
||||||
|
|
||||||
|
def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):
    """
    Build the image-folder input pipeline for training or evaluation.

    The device count and this process's rank are read from the ``RANK_SIZE`` and ``RANK_ID``
    environment variables (both must be set to integers). With more than one device the dataset
    is sharded so that each rank reads its own shard.

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): True selects the random-crop/flip training transforms, False selects the
            decode/resize/center-crop evaluation transforms.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32

    Returns:
        dataset
    """
    device_num = int(os.getenv("RANK_SIZE"))
    rank_id = int(os.getenv("RANK_ID"))

    # Only pass sharding arguments when actually running on several devices.
    reader_kwargs = {"num_parallel_workers": 8, "shuffle": True}
    if device_num != 1:
        reader_kwargs.update(num_shards=device_num, shard_id=rank_id)
    ds = de.ImageFolderDatasetV2(dataset_path, **reader_kwargs)

    image_size = 224
    # Per-channel statistics expressed on the 0-255 pixel scale.
    mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
    std = [0.229 * 255, 0.224 * 255, 0.225 * 255]

    # Mode-specific front of the image pipeline.
    if do_train:
        leading_ops = [
            C.RandomCropDecodeResize(image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)),
            C.RandomHorizontalFlip(prob=0.5),
        ]
    else:
        leading_ops = [
            C.Decode(),
            C.Resize((256, 256)),
            C.CenterCrop(image_size),
        ]
    # Normalisation and HWC -> CHW layout change are shared by both modes.
    trans = leading_ops + [C.Normalize(mean=mean, std=std), C.HWC2CHW()]

    label_cast = C2.TypeCast(mstype.int32)

    ds = ds.map(input_columns="image", num_parallel_workers=8, operations=trans)
    ds = ds.map(input_columns="label", num_parallel_workers=8, operations=label_cast)

    # Incomplete trailing batches are dropped.
    ds = ds.batch(batch_size, drop_remainder=True)

    return ds.repeat(repeat_num)
|
@ -0,0 +1,87 @@
|
|||||||
|
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
"""learning rate generator"""
|
||||||
|
import math
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def get_learning_rate(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode):
    """
    Generate the per-step learning rate array for a whole training run.

    Args:
        lr_init(float): init learning rate
        lr_end(float): end learning rate (only used by the default linear mode)
        lr_max(float): max learning rate
        warmup_epochs(int): number of warmup epochs
        total_epochs(int): total epoch of training
        steps_per_epoch(int): steps of one epoch
        lr_decay_mode(string): learning rate decay mode, including steps, poly, cosine or default
            (any other value selects linear warmup followed by linear decay to lr_end)

    Returns:
        np.array, float32 learning rate array with steps_per_epoch * total_epochs entries
    """
    total_steps = steps_per_epoch * total_epochs
    warmup_steps = steps_per_epoch * warmup_epochs

    if lr_decay_mode == 'steps':
        # Piecewise constant: x0.1 at 30%, 60% and 80% of training; warmup is not applied.
        first_drop, second_drop, third_drop = 0.3 * total_steps, 0.6 * total_steps, 0.8 * total_steps

        def lr_at(step):
            if step < first_drop:
                return lr_max
            if step < second_drop:
                return lr_max * 0.1
            if step < third_drop:
                return lr_max * 0.01
            return lr_max * 0.001

    elif lr_decay_mode == 'poly':
        # Linear warmup from lr_init, then quadratic decay from lr_max towards zero.
        warmup_slope = 0
        if warmup_steps != 0:
            warmup_slope = (float(lr_max) - float(lr_init)) / float(warmup_steps)

        def lr_at(step):
            if step < warmup_steps:
                value = float(lr_init) + warmup_slope * float(step)
            else:
                remaining = (1.0 - (float(step) - float(warmup_steps)) / (float(total_steps) - float(warmup_steps)))
                value = float(lr_max) * remaining * remaining
            # Never emit a negative learning rate.
            return 0.0 if value < 0.0 else value

    elif lr_decay_mode == 'cosine':
        decay_steps = total_steps - warmup_steps

        def lr_at(step):
            if step < warmup_steps:
                # Warmup reaches lr_max on its last step (note the step + 1).
                warmup_slope = (float(lr_max) - float(lr_init)) / float(warmup_steps)
                return float(lr_init) + warmup_slope * (step + 1)
            # Product of a linear ramp and a truncated cosine, offset by a small floor.
            linear_decay = (total_steps - step) / decay_steps
            cosine_decay = 0.5 * (1 + math.cos(math.pi * 2 * 0.47 * step / decay_steps))
            return lr_max * (linear_decay * cosine_decay + 0.00001)

    else:
        # Default: linear warmup from lr_init to lr_max, then linear decay to lr_end.
        def lr_at(step):
            if step < warmup_steps:
                return lr_init + (lr_max - lr_init) * step / warmup_steps
            return lr_max - (lr_max - lr_end) * (step - warmup_steps) / (total_steps - warmup_steps)

    return np.array([lr_at(step) for step in range(total_steps)]).astype(np.float32)
|
@ -0,0 +1,132 @@
|
|||||||
|
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
"""evaluation metric."""
|
||||||
|
|
||||||
|
from mindspore.communication.management import GlobalComm
|
||||||
|
from mindspore.ops import operations as P
|
||||||
|
import mindspore.nn as nn
|
||||||
|
import mindspore.common.dtype as mstype
|
||||||
|
|
||||||
|
|
||||||
|
class ClassifyCorrectCell(nn.Cell):
    r"""
    Cell that returns the correct count of the prediction in a classification network.

    The wrapped network is run on the data, the argmax of its output is compared with the label,
    the matches are summed on the local device and the sum is then all-reduced (SUM) over
    ``GlobalComm.WORLD_COMM_GROUP``.

    Args:
        network (Cell): The network Cell.

    Inputs:
        - **data** (Tensor) - Tensor of shape :math:`(N, \ldots)`.
        - **label** (Tensor) - Tensor of shape :math:`(N, \ldots)`.

    Outputs:
        Tuple, containing a scalar correct count of the prediction summed over all devices.

    Examples:
        >>> # For a defined network Net without loss function
        >>> net = Net()
        >>> eval_net = ClassifyCorrectCell(net)
    """

    def __init__(self, network):
        super(ClassifyCorrectCell, self).__init__(auto_prefix=False)
        self._network = network
        # Primitives used by construct().
        self.allreduce = P.AllReduce(P.ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP)
        self.reduce_sum = P.ReduceSum()
        self.cast = P.Cast()
        self.equal = P.Equal()
        self.argmax = P.Argmax()

    def construct(self, data, label):
        logits = self._network(data)
        # Predicted class index, cast to int32 before the comparison with the label.
        predicted = self.cast(self.argmax(logits), mstype.int32)
        # Bool matches -> float32 so they can be summed.
        hits = self.cast(self.equal(predicted, label), mstype.float32)
        local_correct = self.reduce_sum(hits)
        return (self.allreduce(local_correct),)
|
||||||
|
|
||||||
|
|
||||||
|
class DistAccuracy(nn.Metric):
    r"""
    Calculates the accuracy for classification data in distributed mode.

    The metric keeps two running values, the number of correct predictions and the number of
    samples seen, and returns their ratio:

    .. math::

        \text{accuracy} = \frac{\text{correct number}}{\text{total number}}

    Every call to :meth:`update` adds the supplied correct count and assumes that
    ``batch_size * device_num`` samples were evaluated.

    Args:
        batch_size (int): Batch size used on each device.
        device_num (int): Number of devices taking part in the evaluation.

    Examples:
        >>> y_correct = Tensor(np.array([20]))
        >>> metric = DistAccuracy(batch_size=3, device_num=8)
        >>> metric.clear()
        >>> metric.update(y_correct)
        >>> accuracy = metric.eval()
    """

    def __init__(self, batch_size, device_num):
        super(DistAccuracy, self).__init__()
        self.clear()
        self.batch_size, self.device_num = batch_size, device_num

    def clear(self):
        """Clears the internal evaluation result."""
        self._correct_num = 0
        self._total_num = 0

    def update(self, *inputs):
        """
        Accumulates one evaluation step into the internal counters.

        Args:
            inputs: Input `y_correct`. `y_correct` is a `scalar Tensor`.
                `y_correct` is the right prediction count that gathered from all devices
                it's a scalar in float type

        Raises:
            ValueError: If the number of the input is not 1.
        """
        input_count = len(inputs)
        if input_count != 1:
            raise ValueError('Distribute accuracy needs 1 input (y_correct), but got {}'.format(input_count))
        step_correct = self._convert_data(inputs[0])
        self._correct_num += step_correct
        # One update corresponds to one batch on every device.
        self._total_num += self.batch_size * self.device_num

    def eval(self):
        """
        Computes the accuracy.

        Returns:
            Float, the computed result.

        Raises:
            RuntimeError: If the sample size is 0.
        """
        seen = self._total_num
        if seen == 0:
            raise RuntimeError('Accuracy can not be calculated, because the number of samples is 0.')
        return self._correct_num / seen
|
@ -0,0 +1,39 @@
|
|||||||
|
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
"""
|
||||||
|
network config setting, will be used in train.py and eval.py
|
||||||
|
"""
|
||||||
|
from easydict import EasyDict as ed
|
||||||
|
|
||||||
|
# Network/training settings exposed as attribute-accessible EasyDict; used by train.py and eval.py.
config = ed(dict(
    # model output and batching
    class_num=1000,
    batch_size=32,
    # optimiser
    loss_scale=128,
    momentum=0.9,
    weight_decay=5e-4,
    # schedule length and input pipeline
    epoch_size=45,
    buffer_size=1000,
    image_height=224,
    image_width=224,
    # checkpointing
    save_checkpoint=True,
    save_checkpoint_steps=5004,
    keep_checkpoint_max=20,
    save_checkpoint_path="./",
    # label smoothing
    label_smooth=1,
    label_smooth_factor=0.1,
    frequency=834,
    # evaluation
    eval_interval=1,
    eval_batch_size=32,
))
|
@ -0,0 +1,82 @@
|
|||||||
|
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
"""create train or eval dataset."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
import mindspore.common.dtype as mstype
|
||||||
|
import mindspore.dataset as dataset
|
||||||
|
import mindspore.dataset.engine as de
|
||||||
|
import mindspore.dataset.transforms.c_transforms as C2
|
||||||
|
import mindspore.dataset.transforms.vision.c_transforms as C
|
||||||
|
|
||||||
|
# Fix the dataset seed at import time so shuffling/augmentation are reproducible across runs.
dataset.config.set_seed(1)


def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):
    """
    Create a train or eval dataset from an image folder.

    ``RANK_SIZE`` and ``RANK_ID`` must be present in the environment as integers; when
    ``RANK_SIZE`` is not 1 the dataset is sharded by rank.

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32

    Returns:
        dataset
    """
    device_num = int(os.getenv("RANK_SIZE"))
    rank_id = int(os.getenv("RANK_ID"))

    # Sharding arguments are only supplied for multi-device runs.
    shard_kwargs = {} if device_num == 1 else {"num_shards": device_num, "shard_id": rank_id}
    ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True, **shard_kwargs)

    image_size = 224
    # Per-channel statistics expressed on the 0-255 pixel scale.
    mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
    std = [0.229 * 255, 0.224 * 255, 0.225 * 255]

    if do_train:
        # Training: random crop + resize straight from the encoded image, then random flip.
        trans = [
            C.RandomCropDecodeResize(image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)),
            C.RandomHorizontalFlip(prob=0.5),
        ]
    else:
        # Evaluation: deterministic decode, resize and center crop.
        trans = [
            C.Decode(),
            C.Resize((256, 256)),
            C.CenterCrop(image_size),
        ]
    # Both modes finish with normalisation and an HWC -> CHW layout change.
    trans.append(C.Normalize(mean=mean, std=std))
    trans.append(C.HWC2CHW())

    type_cast_op = C2.TypeCast(mstype.int32)

    # map images, map labels, batch (dropping the incomplete tail), then repeat
    return (
        ds.map(input_columns="image", num_parallel_workers=8, operations=trans)
        .map(input_columns="label", num_parallel_workers=8, operations=type_cast_op)
        .batch(batch_size, drop_remainder=True)
        .repeat(repeat_num)
    )
|
@ -0,0 +1,120 @@
|
|||||||
|
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
"""Dataset help for minddata dataset"""
|
||||||
|
from mindspore._checkparam import check_bool
|
||||||
|
from mindspore.parallel._utils import _get_device_num, _get_parallel_mode
|
||||||
|
from mindspore.train._utils import _exec_datagraph, _get_types_and_shapes, \
|
||||||
|
_to_full_shapes
|
||||||
|
from mindspore.train.parallel_utils import ParallelMode
|
||||||
|
|
||||||
|
|
||||||
|
class DatasetHelper:
    """
    Help function to use the Minddata dataset in loop-sink mode.

    The dataset is wrapped in a `_DatasetIterMSLoopSink`; iteration, type/shape queries and the
    loop size are all forwarded to that iterator.

    Note:
        The iter of DatasetHelper will give one epoch data.

    Args:
        dataset (DataSet): The dataset.
        dataset_sink_mode (bool): Only validated as a bool here; this helper always builds the
            loop-sink iterator regardless of the value. Default: True.
        iter_first_order (int): The iteration of first-order subgraph, passed through to the
            loop-sink iterator. Default: 0.

    Examples:
        >>> dataset_helper = DatasetHelper(dataset)
        >>> for inputs in dataset_helper:
        >>>     outputs = network(*inputs)
    """

    def __init__(self, dataset, dataset_sink_mode=True, iter_first_order=0):
        check_bool(dataset_sink_mode)
        sink_iter = _DatasetIterMSLoopSink(dataset, iter_first_order)
        self.iter = sink_iter

    def __iter__(self):
        """Delegate iteration to the wrapped loop-sink iterator."""
        wrapped = self.iter
        return wrapped.__iter__()

    # A temp solution for loop sink. Delete later
    def types_shapes(self):
        """Get the types and shapes from dataset on current config."""
        wrapped = self.iter
        return wrapped.types_shapes()

    def loop_size(self):
        """Get loop_size for every iteration."""
        wrapped = self.iter
        return wrapped.loop_size
|
||||||
|
|
||||||
|
|
||||||
|
class _DatasetIter:
    """
    Base iter for dataset help.

    Subclasses are expected to set ``self.loop_count`` and ``self.op``; ``__next__`` reads both
    but this base class does not define them.
    """

    def __init__(self, dataset):
        self.loop_size = 1
        # Launch the data graph only once per dataset; the marker attribute records that.
        already_started = hasattr(dataset, '__ME_INITED__')
        if not already_started:
            if hasattr(dataset, '__loop_size__'):
                self.loop_size = dataset.__loop_size__
            else:
                self.loop_size = dataset.get_dataset_size()
            dataset.__ME_INITED__ = _exec_datagraph(dataset, self.loop_size).queue_name

        self.ind = 0
        self.dataset = dataset
        self.dataset_types, self.dataset_shapes = _get_types_and_shapes(dataset)

    def __iter__(self):
        """Restart the iteration counter and return self."""
        self.ind = 0
        return self

    def __next__(self):
        """Return ``self.op()`` until ``loop_count`` items have been produced."""
        if self.ind < self.loop_count:
            self.ind += 1
            return self.op()
        raise StopIteration()

    def types_shapes(self):
        """Return the (types, shapes) pair captured at construction time."""
        return self.dataset_types, self.dataset_shapes

    def get_loop_count(self, dataset):
        """
        Return how many loops of ``__loop_size__`` steps cover the dataset (1 if it has no loop size).

        Raises:
            ValueError: If the dataset size is not a multiple of the loop size.
        """
        if not hasattr(dataset, '__loop_size__'):
            return 1
        loop_size = dataset.__loop_size__
        if dataset.get_dataset_size() % loop_size != 0:
            raise ValueError(f'Dataset size {dataset.get_dataset_size()} and '
                             f'loop_size {loop_size} are not matched.')
        return int(dataset.get_dataset_size() / loop_size)
|
||||||
|
|
||||||
|
|
||||||
|
class _DatasetIterMSLoopSink(_DatasetIter):
    """Iter for context (device_target=Ascend)"""

    def __init__(self, dataset, iter_first_order):
        super().__init__(dataset)
        # One sink cycle spans the dataset's own loop size plus the first-order iterations.
        sink_size = dataset.__loop_size__ + iter_first_order
        # NOTE(review): the factor 2 presumably reflects two sub-graph launches per cycle - confirm.
        launches = dataset.get_dataset_size() / sink_size * 2
        self.loop_count = int(launches)
        # for self._parallel_mode equal to semi_auto_parallel or auto_parallel, use a complete tensor to
        # compile, and slice tensor to run. The batch dimension of tensors for compile is device_number
        # times the batch dimension of tensors for run. Now only support LoopSink.
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL):
            device_num = _get_device_num()
            self.dataset_shapes = _to_full_shapes(self.dataset_shapes, device_num)

        def _no_host_data():
            # Each iteration yields an empty tuple: no data is passed through this iterator.
            return tuple()

        self.op = _no_host_data
|
@ -0,0 +1,184 @@
|
|||||||
|
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
"""grad_reducer_thor"""
|
||||||
|
import mindspore.common.dtype as mstype
|
||||||
|
from mindspore.communication.management import GlobalComm, get_group_size
|
||||||
|
from mindspore.nn.cell import Cell
|
||||||
|
from mindspore.ops import functional as F, composite as C, operations as P
|
||||||
|
from mindspore.ops.operations.comm_ops import AllReduce, ReduceOp
|
||||||
|
|
||||||
|
# Type-dispatched reduction applied to each gradient; implementations are registered below.
reduce_opt = C.MultitypeFuncGraph("reduce_opt")

# Module-wide AllReduce primitive; replaced by _init_optimizer_allreduce().
_all_reduce_A = AllReduce()


def _init_optimizer_allreduce(group):
    """
    Rebuild the module-wide AllReduce as a SUM over the world group, tagged with a fusion id.

    Args:
        group (int): Value stored in the primitive's 'fusion' attribute.
    """
    global _all_reduce_A
    _all_reduce_A = fused_allreduce = AllReduce(ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP)
    fused_allreduce.add_prim_attr('fusion', group)
|
||||||
|
|
||||||
|
|
||||||
|
@reduce_opt.register("Function", "Number", "Tensor")
def _tensors_allreduce_mean(mul, degree, grad):
    """
    All-reduce a gradient and scale it by 1 / degree.

    Args:
        mul (Function): Multiplication operator applied to the reduced gradient and the scale.
        degree (Number): The mean coefficient.
        grad (Tensor): The gradient tensor before operation.

    Returns:
        Tensor, the gradient tensor after operation.
    """
    divisor = F.scalar_cast(degree, F.dtype(grad))
    reduced = _all_reduce_A(grad)
    cast_op = P.Cast()
    # Build the 1/degree factor as a tensor in the reduced gradient's dtype.
    scale = cast_op(F.scalar_to_array(1.0 / divisor), F.dtype(reduced))
    return mul(reduced, scale)
|
||||||
|
|
||||||
|
|
||||||
|
@reduce_opt.register("Bool", "Tensor")
def _tensors_allreduce(allreduce_filter, grad):
    """
    All-reduce a gradient only when its filter flag is set.

    Args:
        allreduce_filter (Bool): Whether this gradient takes part in the AllReduce.
        grad (Tensor): The gradient tensor before operation.

    Returns:
        Tensor, the reduced gradient, or the input gradient unchanged when filtered out.
    """
    if not allreduce_filter:
        return grad
    return _all_reduce_A(grad)
|
||||||
|
|
||||||
|
|
||||||
|
# Type-dispatched helper that reports a gradient's dtype.
_get_datatype = C.MultitypeFuncGraph("_get_datatype")


@_get_datatype.register("Tensor")
def _tensors_get_datatype(grad):
    """
    Report the datatype of a gradient.

    Args:
        grad (Tensor): The gradient tensor before operation.

    Returns:
        mstype, the datatype of gradient.
    """
    grad_dtype = F.dtype(grad)
    return grad_dtype
|
||||||
|
|
||||||
|
|
||||||
|
# Type-dispatched helper that casts a gradient to a requested dtype.
_cast_datatype = C.MultitypeFuncGraph("_cast_datatype")


@_cast_datatype.register("TypeType", "Tensor")
def _tensors_cast_datatype(datatype, grad):
    """
    Convert a gradient to the requested datatype.

    Args:
        datatype (mstype): the destination datatype of gradient.
        grad (Tensor): The gradient tensor before operation.

    Returns:
        Tensor, the gradient tensor after operation.
    """
    converted = F.cast(grad, datatype)
    return converted
|
||||||
|
|
||||||
|
|
||||||
|
class DistributedGradReducerThor(Cell):
|
||||||
|
"""
|
||||||
|
A distributed optimizer.
|
||||||
|
|
||||||
|
Constructs a gradient reducer Cell, which applies communication and average operations on
|
||||||
|
single-process gradient values.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
parameters (list): the parameters to be updated.
|
||||||
|
group (int): the different group to allreduce.
|
||||||
|
mean (bool): When mean is true, the mean coefficient (degree) would apply on gradients. Default: False.
|
||||||
|
degree (int): The mean coefficient. Usually it equals to device number. Default: None.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If degree is not a int or less than 0.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
>>> from mindspore.communication import init, get_group_size
|
||||||
|
>>> from mindspore.ops import composite as C
|
||||||
|
>>> from mindspore.ops import operations as P
|
||||||
|
>>> from mindspore.ops import functional as F
|
||||||
|
>>> from mindspore import context
|
||||||
|
>>> from mindspore import nn
|
||||||
|
>>> from mindspore import ParallelMode, ParameterTuple
|
||||||
|
>>>
|
||||||
|
>>> device_id = int(os.environ["DEVICE_ID"])
|
||||||
|
>>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=True,
|
||||||
|
>>> device_id=int(device_id), enable_hccl=True)
|
||||||
|
>>> init()
|
||||||
|
>>> context.reset_auto_parallel_context()
|
||||||
|
>>> context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL)
|
||||||
|
>>>
|
||||||
|
>>>
|
||||||
|
>>> class TrainingWrapper(nn.Cell):
|
||||||
|
>>> def __init__(self, network, optimizer, sens=1.0):
|
||||||
|
>>> super(TrainingWrapper, self).__init__(auto_prefix=False)
|
||||||
|
>>> self.network = network
|
||||||
|
>>> self.network.add_flags(defer_inline=True)
|
||||||
|
>>> self.weights = ParameterTuple(network.trainable_params())
|
||||||
|
>>> self.optimizer = optimizer
|
||||||
|
>>> self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
|
||||||
|
>>> self.sens = sens
|
||||||
|
>>> self.reducer_flag = False
|
||||||
|
>>> self.grad_reducer = None
|
||||||
|
>>> self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
|
||||||
|
>>> if self.parallel_mode in [ParallelMode.DATA_PARALLEL,
|
||||||
|
>>> ParallelMode.HYBRID_PARALLEL]:
|
||||||
|
>>> self.reducer_flag = True
|
||||||
|
>>> if self.reducer_flag:
|
||||||
|
>>> mean = context.get_auto_parallel_context("mirror_mean")
|
||||||
|
>>> if mean.get_device_num_is_set():
|
||||||
|
>>> degree = context.get_auto_parallel_context("device_num")
|
||||||
|
>>> else:
|
||||||
|
>>> degree = get_group_size()
|
||||||
|
>>> self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree)
|
||||||
|
>>>
|
||||||
|
>>> def construct(self, *args):
|
||||||
|
>>> weights = self.weights
|
||||||
|
>>> loss = self.network(*args)
|
||||||
|
>>> sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
|
||||||
|
>>> grads = self.grad(self.network, weights)(*args, sens)
|
||||||
|
>>> if self.reducer_flag:
|
||||||
|
>>> # apply grad reducer on grads
|
||||||
|
>>> grads = self.grad_reducer(grads)
|
||||||
|
>>> return F.depend(loss, self.optimizer(grads))
|
||||||
|
>>>
|
||||||
|
>>> network = Net()
|
||||||
|
>>> optimizer = nn.Momentum(network.trainable_params(), learning_rate=0.1, momentum=0.9)
|
||||||
|
>>> train_cell = TrainingWrapper(network, optimizer)
|
||||||
|
>>> inputs = Tensor(np.ones([16, 16]).astype(np.float32))
|
||||||
|
>>> label = Tensor(np.zeros([16, 16]).astype(np.float32))
|
||||||
|
>>> grads = train_cell(inputs, label)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, parameters, group, mean=True, degree=None):
    """Create the operators and settings used by ``construct``.

    Args:
        parameters: Iterable of parameters. Only the ``layerwise_parallel``
            attribute of each item is read, to build ``allreduce_filter``.
        group: Forwarded unchanged to ``_init_optimizer_allreduce``.
        mean (bool): Stored as ``self.mean``; ``construct`` uses it to pick
            which ``reduce_opt`` call form is applied. Default: True.
        degree (int): Stored as ``self.degree``. When None, the value of
            ``get_group_size()`` is used instead. Default: None.

    Raises:
        ValueError: If ``degree`` is given but is not an int greater than 0.
    """
    super(DistributedGradReducerThor, self).__init__(auto_prefix=False)
    self.hyper_map = C.HyperMap()
    self.mul = P.Mul()
    if degree is None:
        self.degree = get_group_size()
    else:
        if not isinstance(degree, int) or degree <= 0:
            # The message names this class; it previously referred to
            # 'DistributedGradReducer' and was ungrammatical.
            raise ValueError("Parameter 'degree' in DistributedGradReducerThor should be an int larger than 0")
        self.degree = degree
    self.mean = mean
    # One flag per parameter: True when the parameter is not layerwise parallel.
    self.allreduce_filter = tuple(x.layerwise_parallel is False for x in parameters)
    _init_optimizer_allreduce(group)
|
||||||
|
|
||||||
|
def construct(self, grads):
    """Reduce ``grads`` in float32 and hand them back in their original dtypes.

    The incoming gradients may mix float16 and float32, which makes the
    result of AllReduce unreliable. Every gradient is therefore cast to
    float32 first, reduced, and then cast back to the dtype it arrived with.

    Args:
        grads: Tuple of gradients to reduce.

    Returns:
        Tuple of reduced gradients, cast back to the recorded dtypes.
    """
    # Remember each gradient's dtype so it can be restored after the reduce.
    original_dtypes = self.hyper_map(F.partial(_get_datatype), grads)
    grads_fp32 = self.hyper_map(F.partial(_cast_datatype, mstype.float32), grads)

    if self.mean:
        # This form passes the multiply op and the degree to reduce_opt.
        reduced = self.hyper_map(F.partial(reduce_opt, self.mul, self.degree), grads_fp32)
    else:
        # This form passes the per-parameter all-reduce filter instead.
        reduced = self.hyper_map(F.partial(reduce_opt), self.allreduce_filter, grads_fp32)

    return self.hyper_map(F.partial(_cast_datatype), original_dtypes, reduced)
|
@ -0,0 +1,88 @@
|
|||||||
|
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
"""learning rate generator"""
|
||||||
|
import math
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode):
    """
    Build the per-step learning rate schedule for a whole training run.

    Supported modes:

    - ``'steps'``: ``lr_max`` for the first 30% of the steps, then scaled by
      0.1, 0.01 and 0.001 from 30%, 60% and 80% of the steps onwards.
      Warmup settings are not used in this mode.
    - ``'poly'``: linear warmup from ``lr_init`` towards ``lr_max``, followed
      by a quadratic decay of ``lr_max``.
    - ``'cosine'``: linear warmup that reaches ``lr_max`` on the last warmup
      step, followed by ``lr_max`` times a linear-decay factor multiplied by
      a cosine factor, plus a small constant offset.
    - any other value: linear warmup from ``lr_init`` to ``lr_max``, followed
      by a linear decay from ``lr_max`` towards ``lr_end``.

    Args:
       lr_init(float): init learning rate
       lr_end(float): end learning rate (only read by the fallback mode)
       lr_max(float): max learning rate
       warmup_epochs(int): number of warmup epochs
       total_epochs(int): total epoch of training
       steps_per_epoch(int): steps of one epoch
       lr_decay_mode(string): learning rate decay mode, including steps, poly, cosine or default

    Returns:
       np.array, float32 learning rate array with one entry per training step
    """
    total_steps = steps_per_epoch * total_epochs
    warmup_steps = steps_per_epoch * warmup_epochs

    if lr_decay_mode == 'steps':
        first_cut, second_cut, third_cut = 0.3 * total_steps, 0.6 * total_steps, 0.8 * total_steps

        def rate_at(step):
            if step < first_cut:
                return lr_max
            if step < second_cut:
                return lr_max * 0.1
            if step < third_cut:
                return lr_max * 0.01
            return lr_max * 0.001

    elif lr_decay_mode == 'poly':
        # Per-step increment during warmup; zero when there is no warmup.
        if warmup_steps != 0:
            warmup_slope = (float(lr_max) - float(lr_init)) / float(warmup_steps)
        else:
            warmup_slope = 0

        def rate_at(step):
            if step < warmup_steps:
                return float(lr_init) + warmup_slope * float(step)
            remaining = (1.0 - (float(step) - float(warmup_steps)) / (float(total_steps) - float(warmup_steps)))
            value = float(lr_max) * remaining * remaining
            if value < 0.0:
                value = 0.0
            return value

    elif lr_decay_mode == 'cosine':
        decay_steps = total_steps - warmup_steps

        def rate_at(step):
            if step < warmup_steps:
                warmup_slope = (float(lr_max) - float(lr_init)) / float(warmup_steps)
                return float(lr_init) + warmup_slope * (step + 1)
            linear_factor = (total_steps - step) / decay_steps
            cosine_factor = 0.5 * (1 + math.cos(math.pi * 2 * 0.47 * step / decay_steps))
            return lr_max * (linear_factor * cosine_factor + 0.00001)

    else:
        def rate_at(step):
            if step < warmup_steps:
                return lr_init + (lr_max - lr_init) * step / warmup_steps
            return lr_max - (lr_max - lr_end) * (step - warmup_steps) / (total_steps - warmup_steps)

    schedule = [rate_at(step) for step in range(total_steps)]
    return np.array(schedule).astype(np.float32)
|
@ -0,0 +1,132 @@
|
|||||||
|
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
"""evaluation metric."""
|
||||||
|
|
||||||
|
import mindspore.common.dtype as mstype
|
||||||
|
import mindspore.nn as nn
|
||||||
|
from mindspore.communication.management import GlobalComm
|
||||||
|
from mindspore.ops import operations as P
|
||||||
|
|
||||||
|
|
||||||
|
class ClassifyCorrectCell(nn.Cell):
    r"""
    Cell that counts the correct predictions of a classification network.

    The wrapped network is run on the data, the arg-max of its output is
    compared with the label, and the number of matches is summed and then
    all-reduced (SUM) over the world communication group.

    Args:
        network (Cell): The network Cell.

    Inputs:
        - **data** (Tensor) - Tensor of shape :math:`(N, \ldots)`.
        - **label** (Tensor) - Tensor of shape :math:`(N, \ldots)`.

    Outputs:
        Tuple, containing a scalar correct count of the prediction

    Examples:
        >>> # For a defined network Net without loss function
        >>> net = Net()
        >>> eval_net = ClassifyCorrectCell(net)
    """

    def __init__(self, network):
        super(ClassifyCorrectCell, self).__init__(auto_prefix=False)
        self._network = network
        # Operators used by construct.
        self.cast = P.Cast()
        self.argmax = P.Argmax()
        self.equal = P.Equal()
        self.reduce_sum = P.ReduceSum()
        self.allreduce = P.AllReduce(P.ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP)

    def construct(self, data, label):
        logits = self._network(data)
        # Predicted class index, cast to int32 before comparing with the label.
        predicted = self.cast(self.argmax(logits), mstype.int32)
        # 1.0 where the prediction matches the label, 0.0 elsewhere.
        hits = self.cast(self.equal(predicted, label), mstype.float32)
        local_correct = self.reduce_sum(hits)
        # Sum the per-device counts across the communication group.
        global_correct = self.allreduce(local_correct)
        return (global_correct,)
|
||||||
|
|
||||||
|
|
||||||
|
class DistAccuracy(nn.Metric):
    r"""
    Accuracy metric for classification evaluated in distributed mode.

    Two running totals are kept: the number of correct predictions and the
    number of evaluated samples. Each ``update`` adds the supplied correct
    count and ``batch_size * device_num`` samples; ``eval`` returns the ratio
    of the two totals.

    .. math::

        \text{accuracy} =\frac{\text{true_positive} + \text{true_negative}}

        {\text{true_positive} + \text{true_negative} + \text{false_positive} + \text{false_negative}}

    Args:
        batch_size (int): eval batch size.
        device_num (int): device number to eval.

    Examples:
        >>> y_correct = Tensor(np.array([20]))
        >>> metric = DistAccuracy(batch_size=3, device_num=8)
        >>> metric.clear()
        >>> metric.update(y_correct)
        >>> accuracy = metric.eval()
    """

    def __init__(self, batch_size, device_num):
        super(DistAccuracy, self).__init__()
        self.batch_size = batch_size
        self.device_num = device_num
        self.clear()

    def clear(self):
        """Reset both running totals to zero."""
        self._total_num = 0
        self._correct_num = 0

    def update(self, *inputs):
        """
        Add one evaluation step to the running totals.

        Args:
            inputs: Input `y_correct`. `y_correct` is a `scalar Tensor`.
                `y_correct` is the right prediction count that gathered from all devices
                it's a scalar in float type

        Raises:
            ValueError: If the number of the input is not 1.
        """
        input_count = len(inputs)
        if input_count != 1:
            raise ValueError('Distribute accuracy needs 1 input (y_correct), but got {}'.format(input_count))
        (raw_correct,) = inputs
        self._correct_num += self._convert_data(raw_correct)
        # Every update accounts for one batch on each device.
        self._total_num += self.batch_size * self.device_num

    def eval(self):
        """
        Compute the accuracy from the running totals.

        Returns:
            Float, the computed result.

        Raises:
            RuntimeError: If the sample size is 0.
        """
        sample_count = self._total_num
        if sample_count == 0:
            raise RuntimeError('Accuracy can not be calculated, because the number of samples is 0.')
        return self._correct_num / sample_count
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,201 @@
|
|||||||
|
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
"""momentum"""
|
||||||
|
import mindspore.common.dtype as mstype
|
||||||
|
from mindspore.common.initializer import initializer
|
||||||
|
from mindspore.common.parameter import Parameter
|
||||||
|
from mindspore.common.parameter import ParameterTuple
|
||||||
|
from mindspore.common.tensor import Tensor
|
||||||
|
from mindspore.nn.optim.optimizer import Optimizer
|
||||||
|
from mindspore.ops import functional as F, composite as C, operations as P
|
||||||
|
from mindspore.parallel._utils import _get_device_num, _get_mirror_mean
|
||||||
|
|
||||||
|
from .grad_reducer_thor import DistributedGradReducerThor
|
||||||
|
|
||||||
|
momentum_opt = C.MultitypeFuncGraph("momentum_opt")


@momentum_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor")
def _tensor_run_opt_ext(opt, learning_rate, momentum, gradient, weight, moment):
    """Run the momentum operator ``opt`` for one weight and return a success flag.

    ``opt`` is called with ``(weight, moment, learning_rate, gradient, momentum)``;
    the returned True value is tied to that call through ``F.depend``.
    """
    status = True
    applied = opt(weight, moment, learning_rate, gradient, momentum)
    return F.depend(status, applied)
|
||||||
|
|
||||||
|
|
||||||
|
op_add = P.AddN()
apply_decay = C.MultitypeFuncGraph("apply_decay")


@apply_decay.register("Number", "Bool", "Tensor", "Tensor")
def _tensor_apply_decay(weight_decay, if_apply, weight, gradient):
    """Return ``gradient`` with the weight-decay term added when ``if_apply`` is set.

    When ``if_apply`` is true the result is ``weight * weight_decay + gradient``
    (summed with ``AddN``); otherwise ``gradient`` is returned unchanged.
    """
    if if_apply:
        decay_term = weight * weight_decay
        return op_add((decay_term, gradient))
    return gradient
|
||||||
|
|
||||||
|
|
||||||
|
class THOR(Optimizer):
    """THOR optimizer: rescales selected gradients with the A/G matrices, then applies momentum.

    ``construct`` reads 54 entries from each of ``matrix_A``, ``matrix_G``,
    ``A_inv_max`` and ``G_inv_max``. The indexing it uses implies the gradient
    tuple is laid out as 53 groups of three followed by one final entry
    (160 gradients in total); the first gradient of each group is the one
    multiplied by the matrices, the other two are passed through unchanged.

    Args:
        params: Parameters to optimize, forwarded to ``Optimizer.__init__``.
        learning_rate: Learning rate, forwarded to ``Optimizer.__init__``.
        momentum: Momentum value; stored as a float32 ``Parameter``.
        matrix_A: Iterable of parameters wrapped into a ``ParameterTuple``.
        matrix_G: Iterable of parameters wrapped into a ``ParameterTuple``.
        A_inv_max: Iterable of parameters wrapped into a ``ParameterTuple``.
        G_inv_max: Iterable of parameters wrapped into a ``ParameterTuple``.
        weight_decay (float): Weight decay. Default: 0.0.
        loss_scale (float): Loss scale. Default: 1.0.
        decay_filter (Callable): Called on every parameter to decide whether
            weight decay is applied to it. The default accepts every parameter.

    Raises:
        ValueError: If ``momentum`` is a float smaller than 0.0.
    """

    def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max, weight_decay=0.0,
                 loss_scale=1.0,
                 decay_filter=lambda x: x.name not in []):
        super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
        if isinstance(momentum, float) and momentum < 0.0:
            raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
        self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
        self.params = self.parameters
        self.moments = self.params.clone(prefix="moments", init='zeros')
        self.hyper_map = C.HyperMap()
        self.opt = P.ApplyMomentum()
        self.matrix_A = ParameterTuple(matrix_A)
        self.matrix_G = ParameterTuple(matrix_G)
        self.A_inv_max = ParameterTuple(A_inv_max)
        self.G_inv_max = ParameterTuple(G_inv_max)
        # Custom matmul primitives used by construct; "_fc" variants are used
        # for the last (index 53) entry only.
        self.cube_matmul_left = P.CusMatMulCubeFraczLeftCast()
        self.cube_matmul_left_fc = P.CusMatMulCubeDenseLeft()
        self.cube_matmul_right_fc = P.CusMatMulCubeDenseRight()
        self.cube_matmul_right_mul = P.CusMatMulCubeFraczRightMul()
        self.transpose = P.Transpose()
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.mul = P.Mul()
        # Indices of parameters whose name contains "conv" or "end_point",
        # followed by the total parameter count as a terminating entry.
        self.weight_idx = [idx for idx, param in enumerate(self.params)
                           if "conv" in param.name or "end_point" in param.name]
        self.weight_idx.append(len(self.params))
        # One scaling factor per entry handled in construct (54 values).
        self.feature_map = [1.0 / 12544, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                            1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49,
                            1.0]
        mean = _get_mirror_mean()
        degree = _get_device_num()
        # Four reducers that differ only in the group argument they pass on.
        self.grad_reducer_Amax = DistributedGradReducerThor(self.parameters, 2, mean, degree)
        self.grad_reducer_Gmax = DistributedGradReducerThor(self.parameters, 5, mean, degree)
        self.grad_reducer_A = DistributedGradReducerThor(self.parameters, 3, mean, degree)
        self.grad_reducer_G = DistributedGradReducerThor(self.parameters, 4, mean, degree)
        self.matrix_A_inv = ()
        self.matrix_G_inv = ()

        self.log = P.Log()
        self.exp = P.Exp()
        self.sqrt = P.Sqrt()
        # Build all 54 scale parameters in one pass and wrap them once,
        # instead of growing a tuple attribute by concatenation.
        max_inv_params = tuple(
            Parameter(initializer(1, [1], mstype.float32), name="matrix_max" + str(i), requires_grad=False)
            for i in range(54))
        self.matrix_max_inv = ParameterTuple(max_inv_params)
        self.assign = P.Assign()
        self.cast = P.Cast()
        # Selects the branch taken in construct.
        self.thor = True
        # Overrides the attribute with the decay scaled by the loss scale.
        self.weight_decay = weight_decay * loss_scale
        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)

    def construct(self, gradients):
        """Rescale the gradients and apply the momentum update.

        Args:
            gradients: Tuple of gradients, indexed as ``i * 3`` (+1, +2) for
                ``i`` in ``range(54)``; the last group contributes one entry only.

        Returns:
            The result of mapping ``momentum_opt`` over gradients, params and moments.
        """
        params = self.params
        moments = self.moments
        if self.thor:
            matrix_A_allreduce = ()
            matrix_G_allreduce = ()
            matrix_A_max_allreduce = ()
            matrix_G_max_allreduce = ()
            # Collect the matrices, each made to depend on its gradient.
            for i in range(54):
                g = gradients[i * 3]
                matrix_A = self.matrix_A[i]
                matrix_G = self.matrix_G[i]
                A_max = self.A_inv_max[i]
                G_max = self.G_inv_max[i]
                matrix_A = F.depend(matrix_A, g)
                matrix_G = F.depend(matrix_G, g)
                A_max = F.depend(A_max, g)
                G_max = F.depend(G_max, g)
                matrix_A_allreduce = matrix_A_allreduce + (matrix_A,)
                matrix_G_allreduce = matrix_G_allreduce + (matrix_G,)
                matrix_A_max_allreduce = matrix_A_max_allreduce + (A_max,)
                matrix_G_max_allreduce = matrix_G_max_allreduce + (G_max,)
            matrix_A_allreduce = self.grad_reducer_A(matrix_A_allreduce)
            matrix_G_allreduce = self.grad_reducer_G(matrix_G_allreduce)
            matrix_A_max_allreduce = self.grad_reducer_Amax(matrix_A_max_allreduce)
            matrix_G_max_allreduce = self.grad_reducer_Gmax(matrix_G_max_allreduce)
            new_grads = ()
            for i in range(54):
                g = gradients[i * 3]
                temp_a = matrix_A_allreduce[i]
                temp_g = matrix_G_allreduce[i]
                temp_a = self.cast(temp_a, mstype.float32)
                temp_g = self.cast(temp_g, mstype.float32)
                # exp(-log(x)): the reciprocal of the reduced max value.
                matrix_A_inv_max = self.log(matrix_A_max_allreduce[i])
                matrix_A_inv_max = self.mul(matrix_A_inv_max, -1)
                matrix_A_inv_max = self.exp(matrix_A_inv_max)
                temp_a = self.mul(temp_a, matrix_A_inv_max)
                matrix_G_inv_max = self.log(matrix_G_max_allreduce[i])
                matrix_G_inv_max = self.mul(matrix_G_inv_max, -1)
                matrix_G_inv_max = self.exp(matrix_G_inv_max)
                temp_g = self.mul(temp_g, matrix_G_inv_max)
                temp_max = self.mul(matrix_A_max_allreduce[i], matrix_G_max_allreduce[i])
                temp_max = self.mul(temp_max, self.feature_map[i])
                temp_a = self.cast(temp_a, mstype.float16)
                temp_g = self.cast(temp_g, mstype.float16)
                if i == 53:
                    g = self.cube_matmul_left_fc(temp_g, g)
                    g = self.cube_matmul_right_fc(g, temp_a, temp_max)
                else:
                    g = self.cube_matmul_left(temp_g, g)
                    g = self.cube_matmul_right_mul(g, temp_a, temp_max)
                # Store the rescaled matrices and scale for reuse by the
                # non-thor branch; the depends tie the assigns to g.
                fake_A = self.assign(self.matrix_A[i], temp_a)
                fake_G = self.assign(self.matrix_G[i], temp_g)
                fake_max = self.assign(self.matrix_max_inv[i], temp_max)
                g = F.depend(g, fake_A)
                g = F.depend(g, fake_G)
                g = F.depend(g, fake_max)
                if i == 53:
                    new_grads = new_grads + (g,)
                else:
                    new_grads = new_grads + (g, gradients[i * 3 + 1], gradients[i * 3 + 2])
            gradients = new_grads
        else:
            # Reuse the stored matrices and scale without any all-reduce.
            new_grads = ()
            for i in range(54):
                g = gradients[i * 3]
                matrix_A = self.matrix_A[i]
                matrix_G = self.matrix_G[i]
                matrix_max = self.matrix_max_inv[i]
                matrix_A = F.depend(matrix_A, g)
                matrix_G = F.depend(matrix_G, g)
                matrix_max = F.depend(matrix_max, g)
                if i == 53:
                    g = self.cube_matmul_left_fc(matrix_G, g)
                    g = self.cube_matmul_right_fc(g, matrix_A, matrix_max)
                    new_grads = new_grads + (g,)
                else:
                    g = self.cube_matmul_left(matrix_G, g)
                    g = self.cube_matmul_right_mul(g, matrix_A, matrix_max)
                    new_grads = new_grads + (g, gradients[i * 3 + 1], gradients[i * 3 + 2])
            gradients = new_grads

        if self.weight_decay > 0:
            gradients = self.hyper_map(F.partial(apply_decay, self.weight_decay), self.decay_flags,
                                       params, gradients)
        gradients = self.scale_grad(gradients)
        lr = self.get_lr()
        success = self.hyper_map(F.partial(momentum_opt, self.opt, lr, self.momentum), gradients, params, moments)
        return success
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue