!12004 thor generalization code submit
From: @sl_wang
Reviewed-by: @guoqi1024
Signed-off-by: pull/12004/MERGE
commit 1f3b059195
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,19 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""convert to second order related classes and functions."""

from .convert_utils import ConvertNetUntils, ConvertModelUtils

__all__ = ["ConvertNetUntils", "ConvertModelUtils"]
@@ -0,0 +1,157 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
convert utils for second order optimizer: thor
"""
import mindspore.nn as nn
import mindspore.common.dtype as mstype
from mindspore import context


class ConvertNetUntils():
    """
    Convert net to thor layer net
    """
    def __init__(self):
        self._convert_method_map = {nn.Dense: self._convert_dense,
                                    nn.Embedding: self._convert_embedding,
                                    nn.Conv2d: self._convert_conv2d}


    def _convert_dense(self, subcell):
        """
        convert dense cell to second_order cell
        """

        weight = subcell.weight
        act_name = None
        if subcell.activation_flag:
            act_class = subcell.activation.__class__.__name__
            act_name = act_class.lower()
        if subcell.out_channels == 1001:
            new_subcell = nn.Dense_Thor(in_channels=subcell.in_channels,
                                        out_channels=subcell.out_channels,
                                        weight_init=weight,
                                        has_bias=subcell.has_bias,
                                        bias_init='zeros',
                                        activation=act_name)
        else:
            compute_type = mstype.float16
            if context.get_context("device_target") == "GPU":
                compute_type = mstype.float32
            new_subcell = nn.Dense_Thor(in_channels=subcell.in_channels,
                                        out_channels=subcell.out_channels,
                                        weight_init=weight,
                                        has_bias=subcell.has_bias,
                                        bias_init='zeros',
                                        activation=act_name).to_float(compute_type)

        if subcell.has_bias:
            new_subcell.bias = subcell.bias
        return new_subcell


    def _convert_embedding(self, subcell):
        """
        convert embedding cell to second_order cell
        """
        new_subcell = nn.Embedding_Thor(vocab_size=subcell.vocab_size,
                                        embedding_size=subcell.embedding_size,
                                        use_one_hot=False)
        new_subcell.embedding_table = subcell.embedding_table
        return new_subcell


    def _convert_conv2d(self, subcell):
        """
        convert conv2d cell to second_order cell
        """
        out_channel = subcell.out_channels
        in_channel = subcell.in_channels
        kernel_size = subcell.kernel_size[0]
        stride = subcell.stride
        padding = subcell.padding
        pad_mode = subcell.pad_mode
        has_bias = subcell.has_bias
        weight = subcell.weight
        new_subcell = nn.Conv2d_Thor(in_channel, out_channel,
                                     kernel_size=kernel_size, stride=stride, padding=padding, pad_mode=pad_mode,
                                     has_bias=has_bias, weight_init=weight)
        return new_subcell


    def _convert_to_thor_net(self, net):
        """
        convert net to thor net
        """
        cells = net.name_cells()
        change = False
        for name in cells:
            subcell = cells[name]
            if subcell == net:
                continue
            elif isinstance(subcell, (nn.Dense_Thor, nn.Conv2d_Thor, nn.Embedding_Thor)):
                continue
            elif isinstance(subcell, (nn.Conv2dTranspose, nn.Conv1d, nn.Conv1dTranspose, nn.BatchNorm1d, nn.GroupNorm,
                                      nn.GlobalBatchNorm, nn.LayerNorm, nn.BatchNorm2d, nn.MaxPool2d)):
                continue
            elif isinstance(subcell, (nn.Embedding, nn.Dense, nn.Conv2d)):
                prefix = subcell.param_prefix
                new_subcell = self._convert_method_map[type(subcell)](subcell)
                print("subcell name: ", name, "prefix is", prefix, flush=True)
                if isinstance(new_subcell, (nn.Dense_Thor, nn.Embedding_Thor, nn.Conv2d_Thor)):
                    print("convert to thor layer success.", flush=True)
                    new_subcell.update_parameters_name(prefix + '.')
                    net.insert_child_to_cell(name, new_subcell)
                    change = True
            else:
                self._convert_to_thor_net(subcell)

        if isinstance(net, nn.SequentialCell) and change:
            print("is nn.SequentialCell and change")
            net.cell_list = list(net.cells())


    def convert_to_thor_net(self, net):
        """
        api for convert net to thor net
        """
        net.update_cell_prefix()
        self._convert_to_thor_net(net)
        net.update_cell_type("second_order")


class ConvertModelUtils():
    """
    convert model to thor model utils
    """

    def convert_to_thor_model(self, model, network, loss_fn=None, optimizer=None, metrics=None, amp_level="O0",
                              loss_scale_manager=None, keep_batchnorm_fp32=False, frequency=834):

        """
        api for convert model to thor model
        """
        optim_name = type(optimizer).__name__
        if optim_name in ("THOR_Ascend", "THOR_GPU"):
            from .model_thor import Model_Thor
            if isinstance(network, nn.TrainOneStepCell):
                model = Model_Thor(network=network, frequency=frequency)
            else:
                model = Model_Thor(network=network, loss_fn=loss_fn, optimizer=optimizer, amp_level=amp_level,
                                   loss_scale_manager=loss_scale_manager,
                                   keep_batchnorm_fp32=keep_batchnorm_fp32, metrics=metrics, frequency=frequency)

        return model
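For orientation, a minimal usage sketch of the two helpers added above. Only `ConvertNetUntils.convert_to_thor_net` and `ConvertModelUtils.convert_to_thor_model` (with its `frequency=834` default) come from this diff; the import path, network, loss, THOR optimizer and dataset are stand-ins.

```python
# Illustrative sketch only. `second_order` is a hypothetical import path for the package
# added in this diff; `net`, `loss`, `opt` (a THOR_Ascend/THOR_GPU instance) and `ds`
# are assumed to be built elsewhere.
from mindspore.train.model import Model
from second_order import ConvertNetUntils, ConvertModelUtils  # hypothetical path

ConvertNetUntils().convert_to_thor_net(net)    # swap Dense/Conv2d/Embedding for *_Thor cells
model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}, amp_level="O2")
model = ConvertModelUtils().convert_to_thor_model(model, network=net, loss_fn=loss, optimizer=opt,
                                                  metrics={'acc'}, amp_level="O2",
                                                  loss_scale_manager=None,
                                                  keep_batchnorm_fp32=False, frequency=834)
model.train(90, ds, dataset_sink_mode=True)
```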
@@ -0,0 +1,188 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Dataset help for minddata dataset"""
import math
import os
from mindspore._checkparam import Validator
from mindspore import context
from mindspore.train._utils import _exec_datagraph, _get_types_and_shapes
from mindspore.nn.wrap import GetNextSingleOp
from mindspore.parallel._utils import _get_device_num, _need_to_full, _to_full_shapes


def _send_data(dataset, epoch_num):
    """Engine dataset to write data to tdt queue."""
    if not hasattr(dataset, '__has_sent__'):
        exec_dataset = dataset.__transfer_dataset__
        exec_dataset.send(epoch_num)
        dataset.__has_sent__ = True


def _send_data_no_flag(dataset, epoch_num):
    """Engine dataset to write data to tdt queue directly."""
    exec_dataset = dataset.__transfer_dataset__
    exec_dataset.send(epoch_num)


class DatasetHelper:
    """
    Help function to use the MindData dataset.

    According to different contexts, change the iterations of dataset and use the same iteration for loop in different
    contexts.

    Note:
        The iteration of DatasetHelper will provide one epoch data.

    Args:
        dataset (DataSet): The training dataset iterator.
        dataset_sink_mode (bool): If true use GetNext to fetch the data, or else feed the data from host. Default: True.
        sink_size (int): Control the amount of data in each sink.
                         If sink_size=-1, sink the complete dataset for each epoch.
                         If sink_size>0, sink sink_size data for each epoch. Default: -1.
        epoch_num (int): Control the number of epoch data to send. Default: 1.

    Examples:
        >>> dataset_helper = DatasetHelper(dataset)
        >>> for inputs in dataset_helper:
        >>>     outputs = network(*inputs)
    """

    def __init__(self, dataset, dataset_sink_mode=True, sink_size=-1, epoch_num=1, iter_first_order=1):
        dataset_sink_mode = Validator.check_bool(dataset_sink_mode)
        Validator.check_is_int(sink_size)
        if sink_size < -1 or sink_size == 0:
            raise ValueError("The sink_size must be -1 or positive, but got sink_size {}.".format(sink_size))

        if dataset_sink_mode:
            if context.get_context("device_target") == "Ascend":
                iterclass = _DatasetIterMSLoopSink
                self.iter = iterclass(dataset, sink_size, epoch_num, iter_first_order)
            elif context.get_context("device_target") == "GPU":
                iterclass = _DatasetIterMS
                self.iter = iterclass(dataset, sink_size, epoch_num)
            elif context.get_context("device_target") == "CPU":
                raise RuntimeError("Currently dataset sink mode is not supported when the device target is CPU.")

    def __iter__(self):
        return self.iter.__iter__()

    # A temp solution for loop sink. Delete later
    def types_shapes(self):
        """Get the types and shapes from dataset on the current configuration."""
        return self.iter.types_shapes()

    def sink_size(self):
        """Get sink_size for each iteration."""
        return self.iter.get_sink_size()

    def stop_send(self):
        """Free up resources about data sink."""
        self.iter.stop_send()


class _DatasetIter:
    """Base iter for dataset helper"""
    def __init__(self, dataset, sink_size, epoch_num):
        self.dataset = dataset
        self.sink_size = sink_size
        self.sink_count = 1

        if not hasattr(dataset, '__transfer_dataset__'):
            if hasattr(dataset, '__loop_size__'):
                self.sink_size = dataset.__loop_size__
            dataset.__transfer_dataset__ = _exec_datagraph(dataset, self.sink_size)

            if not hasattr(dataset, '__no_send__'):
                _send_data(dataset, epoch_num)
        else:
            _send_data_no_flag(dataset, epoch_num)

        self.stop_send = dataset.__transfer_dataset__.stop_send
        self.dataset_types, self.dataset_shapes = _get_types_and_shapes(dataset)

    def __iter__(self):
        self.index = 0
        return self

    def __next__(self):
        if self.index >= self.sink_count:
            raise StopIteration()
        self.index += 1
        return self.op()

    def types_shapes(self):
        return self.dataset_types, self.dataset_shapes

    def get_sink_count(self, dataset):
        sink_count = 1
        if hasattr(dataset, '__loop_size__'):
            loop_size = dataset.__loop_size__
            if loop_size <= dataset.get_dataset_size() and dataset.get_dataset_size() % loop_size != 0:
                raise ValueError(f'Dataset size {dataset.get_dataset_size()} and '
                                 f'sink_size {loop_size} are not matched.')
            sink_count = math.ceil(dataset.get_dataset_size() / loop_size)
        return sink_count

    def get_sink_size(self):
        """get sink_size to device"""
        sink_size = 1
        if hasattr(self.dataset, '__loop_size__'):
            sink_size = self.dataset.__loop_size__
        else:
            if context.get_context("enable_ge") or context.get_context("device_target") == "Ascend":
                if self.sink_size > 0:
                    sink_size = self.sink_size
                else:
                    sink_size = self.dataset.get_dataset_size()
        return sink_size


class _DatasetIterMSLoopSink(_DatasetIter):
    """Iter for context when device_target is Ascend"""
    def __init__(self, dataset, sink_size, epoch_num, iter_first_order):
        super().__init__(dataset, sink_size, epoch_num)
        sink_count = 1
        if hasattr(dataset, '__loop_size__'):
            loop_size = dataset.__loop_size__ + iter_first_order
            sink_count = int(sink_size / loop_size) * 2
        self.sink_count = sink_count
        ms_role = os.getenv("MS_ROLE")
        if ms_role in ("MS_PSERVER", "MS_SCHED"):
            self.sink_count = 1
        # for self._parallel_mode equal to semi_auto_parallel or auto_parallel, and not using full_batch,
        # use a complete tensor to compile, and slice tensor to run. The batch dimension of tensors for
        # compile is device_number times the batch dimension of tensors for run. Now only support LoopSink.
        if _need_to_full():
            device_num = _get_device_num()
            self.dataset_shapes = _to_full_shapes(self.dataset_shapes, device_num)

        def op():
            return tuple()

        self.op = op


class _DatasetIterMS(_DatasetIter):
    """Iter for MS when enable_loop_sink is False."""
    def __init__(self, dataset, sink_size, epoch_num):
        super().__init__(dataset, sink_size, epoch_num)
        if sink_size > 0:
            self.sink_count = sink_size
        else:
            self.sink_count = dataset.get_dataset_size()

        queue_name = dataset.__transfer_dataset__.queue_name
        self.op = GetNextSingleOp(self.dataset_types, self.dataset_shapes, queue_name)
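The THOR-specific change in this helper is the extra `iter_first_order` argument and the doubled sink count in `_DatasetIterMSLoopSink`: every frequency-sized window of steps is covered by one second-order sink plus one first-order sink. A standalone arithmetic sketch follows; the values are made up, only the formula comes from the code above.

```python
# sink_count as computed in _DatasetIterMSLoopSink.__init__:
#   loop_size  = dataset.__loop_size__ + iter_first_order
#   sink_count = int(sink_size / loop_size) * 2
def loop_sink_count(sink_size, second_order_loop_size, iter_first_order):
    loop_size = second_order_loop_size + iter_first_order
    return int(sink_size / loop_size) * 2

# Example values (not from the diff): with frequency=834 the trainer below uses
# __loop_size__=1 and iter_first_order=833, so a 5004-step sink yields 12 launches,
# alternating 6 second-order sinks with 6 first-order sinks.
print(loop_sink_count(sink_size=5004, second_order_loop_size=1, iter_first_order=833))  # -> 12
```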
@@ -0,0 +1,236 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Model."""

import math
from mindspore.train.callback import RunContext
from mindspore import context
from mindspore.context import ParallelMode
from mindspore.train.model import Model
from mindspore.train.dataset_helper import connect_network_with_dataset
from mindspore.parallel._utils import _need_to_full, _to_full_tensor
from mindspore.common.dtype import pytype_to_dtype
from mindspore._c_expression import init_exec_dataset
from .dataset_helper import DatasetHelper


def _convert_type(types):
    """
    Convert from numpy type to tensor type.

    Args:
        types (list): Numpy type list of element in dataset.

    Returns:
        list, list of element in dataset.
    """
    ms_types = []
    for np_type in types:
        ms_type = pytype_to_dtype(np_type)
        ms_types.append(ms_type)
    return ms_types


def _get_types_and_shapes(dataset):
    """Get dataset types and shapes."""
    dataset_types = _convert_type(dataset.output_types())
    dataset_shapes = dataset.output_shapes()
    return dataset_types, dataset_shapes


def _exec_datagraph(exec_dataset, dataset_size, phase='dataset'):
    """Initialize and execute the dataset graph."""
    batch_size = exec_dataset.get_batch_size()
    input_indexs = exec_dataset.input_indexs

    # transform data format
    dataset_types, dataset_shapes = _get_types_and_shapes(exec_dataset)
    init_exec_dataset(exec_dataset.__transfer_dataset__.queue_name,
                      dataset_size,
                      batch_size,
                      dataset_types,
                      dataset_shapes,
                      input_indexs,
                      phase=phase,
                      need_run=False)


class Model_Thor(Model):
    """
    High-Level API for Training or Testing.

    `Model` groups layers into an object with training and inference features.

    Args:
        network (Cell): A training or testing network.
        loss_fn (Cell): Objective function, if loss_fn is None, the
                        network should contain the logic of loss and grads calculation, and the logic
                        of parallel if needed. Default: None.
        optimizer (Cell): Optimizer for updating the weights. Default: None.
        metrics (Union[dict, set]): A Dictionary or a set of metrics to be evaluated by the model during
                        training and testing. eg: {'accuracy', 'recall'}. Default: None.
        eval_network (Cell): Network for evaluation. If not defined, `network` and `loss_fn` would be wrapped as
                        `eval_network`. Default: None.
        eval_indexes (list): When defining the `eval_network`, if `eval_indexes` is None, all outputs of the
                        `eval_network` would be passed to metrics, otherwise `eval_indexes` must contain three
                        elements, including the positions of loss value, predicted value and label. The loss
                        value would be passed to the `Loss` metric, the predicted value and label would be passed
                        to other metric. Default: None.
        amp_level (str): Option for argument `level` in `mindspore.amp.build_train_network`, level for mixed
                        precision training. Supports [O0, O2, O3]. Default: "O0".

            - O0: Do not change.
            - O2: Cast network to float16, keep batchnorm run in float32, using dynamic loss scale.
            - O3: Cast network to float16, with additional property 'keep_batchnorm_fp32=False'.

            O2 is recommended on GPU, O3 is recommended on Ascend.

        loss_scale_manager (Union[None, LossScaleManager]): If it is None, the loss would not be scaled. Otherwise,
                        scale the loss by LossScaleManager. It is a key argument.
                        e.g. Use `loss_scale_manager=None` to set the value.
        keep_batchnorm_fp32 (bool): Keep Batchnorm running in `float32`. If it is set to true, the level setting before
                        will be overwritten. Default: True.
    """

    def __init__(self, network, loss_fn=None, optimizer=None, metrics=None, eval_network=None,
                 eval_indexes=None, amp_level="O0", frequency=834, **kwargs):
        super(Model_Thor, self).__init__(network, loss_fn, optimizer, metrics, eval_network,
                                         eval_indexes, amp_level, **kwargs)
        self._frequency = frequency
        self._train_network = self._build_train_network()

    def _exec_preprocess(self, network, is_train, phase, dataset, dataset_sink_mode, sink_size=-1,
                         epoch_num=1, iter_first_order=1):
        """Initializes dataset."""
        if dataset_sink_mode and not is_train:
            dataset.__loop_size__ = 1
        dataset_helper = DatasetHelper(dataset, dataset_sink_mode, sink_size, epoch_num, iter_first_order)

        if dataset_sink_mode and context.get_context("device_target") != "GPU":
            network = connect_network_with_dataset(network, dataset_helper)
        network.set_train(is_train)
        network.phase = phase

        if self._parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL):
            network.set_auto_parallel()

        return dataset_helper, network

    def _train_dataset_sink_process(self, epoch, train_dataset, list_callback=None, cb_params=None, sink_size=-1):
        """
        Training process. The data would be passed to network through dataset channel.

        Args:
            epoch (int): Total number of iterations on the data.
            train_dataset (Dataset): A training dataset iterator. If there is no
                                     loss_fn, a tuple with multiple data (data1, data2, data3, ...) should be
                                     returned and passed to the network. Otherwise, a tuple (data, label) should
                                     be returned. The data and label would be passed to the network and loss
                                     function respectively.
            list_callback (Callback): Executor of callback list. Default: None.
            cb_params (_InternalCallbackParam): Callback parameters. Default: None.
            sink_size (int): Control the amount of data in each sink. Default: -1.
        """
        if sink_size == -1:
            epoch_num = epoch
        else:
            epoch_num = math.ceil(epoch * sink_size / train_dataset.get_dataset_size())

        iter_first_order = self._frequency - 1
        iter_second_order = 1
        train_dataset.__loop_size__ = iter_second_order
        dataset_helper, train_network = self._exec_preprocess(self._train_network,
                                                              is_train=True,
                                                              phase='train',
                                                              dataset=train_dataset,
                                                              dataset_sink_mode=True,
                                                              sink_size=sink_size,
                                                              epoch_num=epoch_num,
                                                              iter_first_order=iter_first_order)

        self._train_network = train_network
        cb_params.train_network = self._train_network
        cb_params.cur_step_num = 0

        run_context = RunContext(cb_params)
        list_callback.begin(run_context)

        # used to stop training for early stop, such as StopAtTime or StopAtStep
        should_stop = False
        switch_branch_one = True
        index_first_order = 0
        train_network_init_flag = True
        has_do_dataset_init = False

        for i in range(epoch):
            cb_params.cur_epoch_num = i + 1
            list_callback.epoch_begin(run_context)
            # for data sink dataset_helper only iter once, otherwise iter epoch_size times.
            for inputs in dataset_helper:
                if _need_to_full() and context.get_context("device_target") == "GPU":
                    inputs = _to_full_tensor(inputs, self._device_number, self._global_rank)
                list_callback.step_begin(run_context)
                if context.get_context("device_target") == "GPU":
                    if switch_branch_one:
                        cb_params.cur_step_num += 1
                        if train_network_init_flag:
                            self._train_network.add_flags_recursive(thor=True)
                        self._train_network.phase = 'train0'
                        switch_branch_one = not switch_branch_one
                        outputs = self._train_network(*inputs)
                        cb_params.net_outputs = outputs
                        list_callback.step_end(run_context)
                    else:
                        cb_params.cur_step_num += 1
                        if train_network_init_flag:
                            self._train_network.add_flags_recursive(thor=False)
                            train_network_init_flag = False
                        self._train_network.phase = 'train1'
                        outputs = self._train_network(*inputs)
                        cb_params.net_outputs = outputs
                        index_first_order += 1
                        if index_first_order == iter_first_order:
                            index_first_order = 0
                            switch_branch_one = not switch_branch_one
                        list_callback.step_end(run_context)
                else:
                    if switch_branch_one:
                        cb_params.cur_step_num += 1
                        if train_network_init_flag:
                            self._train_network.add_flags_recursive(thor=True)
                        self._train_network.phase = 'train0'
                    else:
                        cb_params.cur_step_num += iter_first_order
                        if train_network_init_flag:
                            self._train_network.add_flags_recursive(thor=False)
                            train_network_init_flag = False
                        self._train_network.phase = 'train1'
                        if not has_do_dataset_init:
                            _exec_datagraph(train_dataset, iter_first_order, phase='train1_dataset')
                            has_do_dataset_init = True
                    switch_branch_one = not switch_branch_one
                    outputs = self._train_network(*inputs)
                    cb_params.net_outputs = outputs
                    list_callback.step_end(run_context)

            list_callback.epoch_end(run_context)
            should_stop = should_stop or run_context.get_stop_requested()
            if should_stop:
                break
        dataset_helper.stop_send()

        list_callback.end(run_context)


__all__ = ["Model_Thor"]
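As a cross-check of the control flow above, a pure-Python sketch of the Ascend step bookkeeping: each `dataset_helper` iteration alternates between a one-step second-order sink ('train0', `thor=True`) and an `iter_first_order`-step first-order sink ('train1', `thor=False`), and `cur_step_num` advances accordingly. The numbers below are illustrative.

```python
# Pure-Python sketch of the Ascend branch in _train_dataset_sink_process (illustrative only).
def simulate_ascend_epoch(sink_count, frequency):
    iter_first_order = frequency - 1
    switch_branch_one = True
    cur_step_num = 0
    phases = []
    for _ in range(sink_count):
        if switch_branch_one:
            cur_step_num += 1                 # one second-order step per 'train0' sink
            phases.append('train0')
        else:
            cur_step_num += iter_first_order  # remaining first-order steps per 'train1' sink
            phases.append('train1')
        switch_branch_one = not switch_branch_one
    return cur_step_num, phases

steps, phases = simulate_ascend_epoch(sink_count=12, frequency=834)
print(steps, phases[:4])  # -> 5004 ['train0', 'train1', 'train0', 'train1']
```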
@@ -1,135 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""grad reducer cell for distributed training"""
from mindspore.nn.cell import Cell
from mindspore.communication.management import GlobalComm, get_group_size
from mindspore.ops import functional as F, composite as C, operations as P
from mindspore.ops.operations.comm_ops import AllReduce
import mindspore.common.dtype as mstype

reduce_opt = C.MultitypeFuncGraph("reduce_opt")


def _init_allreduce_operators(length, split_indices):
    """ initialize allreduce communication operators"""
    indices = split_indices[0]
    fusion = split_indices[1]
    op_list = ()
    j = 0
    for i in range(length):
        if j <= len(indices)-1:
            temp = indices[j]
        else:
            temp = length
        if i >= temp:
            j = j + 1
            fusion = fusion + 1
        op = AllReduce('sum', GlobalComm.WORLD_COMM_GROUP)
        op.add_prim_attr('fusion', fusion)
        op_list = op_list + (op,)
    return op_list


@reduce_opt.register("Function", "Number", "Function", "Tensor")
def _tensors_allreduce_mean(mul, degree, allreduce, parameters):
    """
    Apply allreduce on parameters.

    Args:
        mul(Primitive): The mul operator for parameters.
        degree (int): The mean coefficient.
        allreduce (Primitive): The communication operator for parameters.
        parameters (Tensor): The parameters before operation.

    Returns:
        Tensor, the parameters after operation.
    """
    degree = F.scalar_cast(degree, F.dtype(parameters))
    parameters = allreduce(parameters)
    cast_op = P.Cast()
    return mul(parameters, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(parameters)))


_get_datatype = C.MultitypeFuncGraph("_get_datatype")


@_get_datatype.register("Tensor")
def _tensors_get_datatype(parameters):
    """
    Acquire parameters datatype.

    Args:
        parameters (Tensor): The parameters before operation.

    Returns:
        mstype, the datatype of parameters.
    """
    return F.dtype(parameters)


_cast_datatype = C.MultitypeFuncGraph("_cast_datatype")


@_cast_datatype.register("TypeType", "Tensor")
def _tensors_cast_datatype(datatype, parameters):
    """
    Cast parameters to datatype.

    Args:
        datatype (mstype): the destination datatype of parameters.
        parameters (Tensor): The parameters before operation.

    Returns:
        Tensor, the parameters after operation.
    """
    return F.cast(parameters, datatype)


class DistributedGradReducerThor(Cell):
    """
    A distributed optimizer.

    Constructs a parameters reducer Cell, which applies communication and average operations on
    single-process parameters values.

    Args:
        parameter_length (int): length of the parameters to be updated.
        split_indices(tuple): parameter split indices.
        mean (bool): When mean is true, the mean coefficient (degree) would apply on parameters. Default: False.
        degree (int): The mean coefficient. Usually it equals to device number. Default: None.

    Raises:
        ValueError: If degree is not a int or less than 0.
    """

    def __init__(self, parameter_length, split_indices, mean=True, degree=None):
        super(DistributedGradReducerThor, self).__init__(auto_prefix=False)
        self.hyper_map = C.HyperMap()
        self.mul = P.Mul()
        if degree is None:
            self.degree = get_group_size()
        else:
            if not isinstance(degree, int) or degree <= 0:
                raise ValueError("Parameter 'degree' in DistributedGradReducer should large than 0 and be int")
            self.degree = degree
        self.mean = mean
        self.op_list = _init_allreduce_operators(parameter_length, split_indices)

    def construct(self, parameters):
        datatypes = self.hyper_map(F.partial(_get_datatype), parameters)
        parameters = self.hyper_map(F.partial(_cast_datatype, mstype.float32), parameters)
        new_parameters = self.hyper_map(F.partial(reduce_opt, self.mul, self.degree), self.op_list, parameters)
        new_parameters = self.hyper_map(F.partial(_cast_datatype), datatypes, new_parameters)
        return new_parameters
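A standalone sketch of the fusion bookkeeping in the removed `_init_allreduce_operators`: parameters up to each split index share one AllReduce fusion id, and the id advances past each boundary. The AllReduce op is replaced here by a recorded id so the grouping can be checked without a distributed backend.

```python
# Illustration of the grouping logic in _init_allreduce_operators above; the real code
# creates AllReduce ops with add_prim_attr('fusion', fusion), here we only record the ids.
def fusion_ids(length, split_indices):
    indices, fusion = split_indices
    ids = []
    j = 0
    for i in range(length):
        temp = indices[j] if j <= len(indices) - 1 else length
        if i >= temp:
            j += 1
            fusion += 1
        ids.append(fusion)
    return ids

# With split_indices=((27,), 2) and 54 matrices, as used by the removed thor.py:
# matrices 0..26 get fusion id 2, matrices 27..53 get fusion id 3.
assert fusion_ids(54, ((27,), 2)) == [2] * 27 + [3] * 27
```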
@@ -1,88 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""learning rate generator"""
import math

import numpy as np


def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode):
    """
    generate learning rate array

    Args:
        lr_init(float): init learning rate
        lr_end(float): end learning rate
        lr_max(float): max learning rate
        warmup_epochs(int): number of warmup epochs
        total_epochs(int): total epoch of training
        steps_per_epoch(int): steps of one epoch
        lr_decay_mode(string): learning rate decay mode, including steps, poly, cosine or default

    Returns:
        np.array, learning rate array
    """
    lr_each_step = []
    total_steps = steps_per_epoch * total_epochs
    warmup_steps = steps_per_epoch * warmup_epochs
    if lr_decay_mode == 'steps':
        decay_epoch_index = [0.3 * total_steps, 0.6 * total_steps, 0.8 * total_steps]
        for i in range(total_steps):
            if i < decay_epoch_index[0]:
                lr = lr_max
            elif i < decay_epoch_index[1]:
                lr = lr_max * 0.1
            elif i < decay_epoch_index[2]:
                lr = lr_max * 0.01
            else:
                lr = lr_max * 0.001
            lr_each_step.append(lr)
    elif lr_decay_mode == 'poly':
        if warmup_steps != 0:
            inc_each_step = (float(lr_max) - float(lr_init)) / float(warmup_steps)
        else:
            inc_each_step = 0
        for i in range(total_steps):
            if i < warmup_steps:
                lr = float(lr_init) + inc_each_step * float(i)
            else:
                base = (1.0 - (float(i) - float(warmup_steps)) / (float(total_steps) - float(warmup_steps)))
                lr = float(lr_max) * base * base
                if lr < 0.0:
                    lr = 0.0
            lr_each_step.append(lr)
    elif lr_decay_mode == 'cosine':
        decay_steps = total_steps - warmup_steps
        for i in range(total_steps):
            if i < warmup_steps:
                lr_inc = (float(lr_max) - float(lr_init)) / float(warmup_steps)
                lr = float(lr_init) + lr_inc * (i + 1)
            else:
                linear_decay = (total_steps - i) / decay_steps
                cosine_decay = 0.5 * (1 + math.cos(math.pi * 2 * 0.47 * i / decay_steps))
                decayed = linear_decay * cosine_decay + 0.00001
                lr = lr_max * decayed
            lr_each_step.append(lr)
    else:
        for i in range(total_steps):
            if i < warmup_steps:
                lr = lr_init + (lr_max - lr_init) * i / warmup_steps
            else:
                lr = lr_max - (lr_max - lr_end) * (i - warmup_steps) / (total_steps - warmup_steps)
            lr_each_step.append(lr)

    learning_rate = np.array(lr_each_step).astype(np.float32)

    return learning_rate
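A short usage sketch for the removed `get_lr`; the import path is an assumption, and the hyperparameters are only an example of the cosine schedule it generates.

```python
import numpy as np
from lr_generator import get_lr  # hypothetical import path for the removed module

lr = get_lr(lr_init=0.01, lr_end=0.0, lr_max=0.1, warmup_epochs=5,
            total_epochs=90, steps_per_epoch=100, lr_decay_mode='cosine')
assert isinstance(lr, np.ndarray) and lr.dtype == np.float32
assert lr.shape == (90 * 100,)   # one learning-rate entry per training step
print(lr[0], lr[499], lr[-1])    # warmup ramp, peak region, decayed tail
```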
File diff suppressed because it is too large
@@ -1,202 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""momentum"""
import mindspore.common.dtype as mstype
from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter
from mindspore.common.parameter import ParameterTuple
from mindspore.common.tensor import Tensor
from mindspore.nn.optim.optimizer import Optimizer
from mindspore.ops import functional as F, composite as C, operations as P
from mindspore.parallel._utils import _get_device_num, _get_gradients_mean

from .grad_reducer_thor import DistributedGradReducerThor

momentum_opt = C.MultitypeFuncGraph("momentum_opt")


@momentum_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor")
def _tensor_run_opt_ext(opt, learning_rate, momentum, gradient, weight, moment):
    """Apply momentum optimizer to the weight parameter using Tensor."""
    success = True
    success = F.depend(success, opt(weight, moment, learning_rate, gradient, momentum))
    return success


op_add = P.AddN()
apply_decay = C.MultitypeFuncGraph("apply_decay")


@apply_decay.register("Number", "Bool", "Tensor", "Tensor")
def _tensor_apply_decay(weight_decay, if_apply, weight, gradient):
    """Get grad with weight_decay."""
    if if_apply:
        return op_add((weight * weight_decay, gradient))
    return gradient


class THOR(Optimizer):
    """THOR"""

    def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max, weight_decay=0.0,
                 loss_scale=1.0,
                 decay_filter=lambda x: x.name not in []):
        super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
        if isinstance(momentum, float) and momentum < 0.0:
            raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
        self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
        self.params = self.parameters
        self.moments = self.params.clone(prefix="moments", init='zeros')
        self.hyper_map = C.HyperMap()
        self.opt = P.ApplyMomentum()
        self.matrix_A = ParameterTuple(matrix_A)
        self.matrix_G = ParameterTuple(matrix_G)
        self.A_inv_max = ParameterTuple(A_inv_max)
        self.G_inv_max = ParameterTuple(G_inv_max)
        self.cube_matmul_left = P.CusMatMulCubeFraczLeftCast()
        self.cube_matmul_left_fc = P.CusMatMulCubeDenseLeft()
        self.cube_matmul_right_fc = P.CusMatMulCubeDenseRight()
        self.cube_matmul_right_mul = P.CusMatMulCubeFraczRightMul()
        self.transpose = P.Transpose()
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.mul = P.Mul()
        self.weight_idx = []
        for i in range(len(self.params)):
            if "conv" in self.params[i].name or "end_point" in self.params[i].name:
                self.weight_idx.append(i)
        self.weight_idx.append(len(self.params))
        self.feature_map = [1.0 / 12544, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                            1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49,
                            1.0]
        mean = _get_gradients_mean()
        degree = _get_device_num()
        parameter_length = len(self.feature_map)
        self.grad_reducer_Amax = DistributedGradReducerThor(parameter_length, ((27,), 2), mean, degree)
        self.grad_reducer_Gmax = DistributedGradReducerThor(parameter_length, ((27,), 4), mean, degree)
        self.grad_reducer_A = DistributedGradReducerThor(parameter_length, ((27,), 6), mean, degree)
        self.grad_reducer_G = DistributedGradReducerThor(parameter_length, ((27,), 8), mean, degree)
        self.matrix_A_inv = ()
        self.matrix_G_inv = ()
        self.matrix_max_inv = ()

        for i in range(54):
            self.matrix_max_inv = self.matrix_max_inv + (
                Parameter(initializer(1, [1], mstype.float32), name="matrix_max" + str(i), requires_grad=False),)
        self.log = P.Log()
        self.exp = P.Exp()
        self.sqrt = P.Sqrt()
        self.matrix_max_inv = ParameterTuple(self.matrix_max_inv)
        self.assign = P.Assign()
        self.cast = P.Cast()
        self.thor = True
        self.weight_decay = weight_decay * loss_scale
        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)

    def construct(self, gradients):
        params = self.params
        moments = self.moments
        if self.thor:
            matrix_A_allreduce = ()
            matrix_G_allreduce = ()
            matrix_A_max_allreduce = ()
            matrix_G_max_allreduce = ()
            for i in range(54):
                g = gradients[i * 3]
                matrix_A = self.matrix_A[i]
                matrix_G = self.matrix_G[i]
                A_max = self.A_inv_max[i]
                G_max = self.G_inv_max[i]
                matrix_A = F.depend(matrix_A, g)
                matrix_G = F.depend(matrix_G, g)
                A_max = F.depend(A_max, g)
                G_max = F.depend(G_max, g)
                matrix_A_allreduce = matrix_A_allreduce + (matrix_A,)
                matrix_G_allreduce = matrix_G_allreduce + (matrix_G,)
                matrix_A_max_allreduce = matrix_A_max_allreduce + (A_max,)
                matrix_G_max_allreduce = matrix_G_max_allreduce + (G_max,)
            matrix_A_allreduce = self.grad_reducer_A(matrix_A_allreduce)
            matrix_G_allreduce = self.grad_reducer_G(matrix_G_allreduce)
            matrix_A_max_allreduce = self.grad_reducer_Amax(matrix_A_max_allreduce)
            matrix_G_max_allreduce = self.grad_reducer_Gmax(matrix_G_max_allreduce)
            new_grads = ()
            for i in range(54):
                g = gradients[i * 3]
                temp_a = matrix_A_allreduce[i]
                temp_g = matrix_G_allreduce[i]
                temp_a = self.cast(temp_a, mstype.float32)
                temp_g = self.cast(temp_g, mstype.float32)
                matrix_A_inv_max = self.log(matrix_A_max_allreduce[i])
                matrix_A_inv_max = self.mul(matrix_A_inv_max, -1)
                matrix_A_inv_max = self.exp(matrix_A_inv_max)
                temp_a = self.mul(temp_a, matrix_A_inv_max)
                matrix_G_inv_max = self.log(matrix_G_max_allreduce[i])
                matrix_G_inv_max = self.mul(matrix_G_inv_max, -1)
                matrix_G_inv_max = self.exp(matrix_G_inv_max)
                temp_g = self.mul(temp_g, matrix_G_inv_max)
                temp_max = self.mul(matrix_A_max_allreduce[i], matrix_G_max_allreduce[i])
                temp_max = self.mul(temp_max, self.feature_map[i])
                temp_a = self.cast(temp_a, mstype.float16)
                temp_g = self.cast(temp_g, mstype.float16)
                if i == 53:
                    g = self.cube_matmul_left_fc(temp_g, g)
                    g = self.cube_matmul_right_fc(g, temp_a, temp_max)
                else:
                    g = self.cube_matmul_left(temp_g, g)
                    g = self.cube_matmul_right_mul(g, temp_a, temp_max)
                fake_A = self.assign(self.matrix_A[i], temp_a)
                fake_G = self.assign(self.matrix_G[i], temp_g)
                fake_max = self.assign(self.matrix_max_inv[i], temp_max)
                g = F.depend(g, fake_A)
                g = F.depend(g, fake_G)
                g = F.depend(g, fake_max)
                if i == 53:
                    new_grads = new_grads + (g,)
                else:
                    new_grads = new_grads + (g, gradients[i * 3 + 1], gradients[i * 3 + 2])
            gradients = new_grads
        else:
            new_grads = ()
            for i in range(54):
                g = gradients[i * 3]
                matrix_A = self.matrix_A[i]
                matrix_G = self.matrix_G[i]
                matrix_max = self.matrix_max_inv[i]
                matrix_A = F.depend(matrix_A, g)
                matrix_G = F.depend(matrix_G, g)
                matrix_max = F.depend(matrix_max, g)
                if i == 53:
                    g = self.cube_matmul_left_fc(matrix_G, g)
                    g = self.cube_matmul_right_fc(g, matrix_A, matrix_max)
                    new_grads = new_grads + (g,)
                else:
                    g = self.cube_matmul_left(matrix_G, g)
                    g = self.cube_matmul_right_mul(g, matrix_A, matrix_max)
                    new_grads = new_grads + (g, gradients[i * 3 + 1], gradients[i * 3 + 2])
            gradients = new_grads

        if self.weight_decay > 0:
            gradients = self.hyper_map(F.partial(apply_decay, self.weight_decay), self.decay_flags,
                                       params, gradients)
        gradients = self.scale_grad(gradients)
        lr = self.get_lr()
        success = self.hyper_map(F.partial(momentum_opt, self.opt, lr, self.momentum), gradients, params, moments)
        return success
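One reading of the hard-coded constants in the removed THOR optimizer (an inference from the numbers, not something the diff states): the `feature_map` entries are reciprocals of ResNet-50 feature-map areas on 224x224 inputs, which is the kind of model-specific hard-coding the "generalization" in this PR's title appears to replace.

```python
# 1/12544, 1/3136, 1/784, 1/196, 1/49 are 1/(side*side) for the ResNet-50 stage
# resolutions 112, 56, 28, 14, 7; the trailing 1.0 corresponds to the fully connected layer.
# This mapping is an inference from the constants above, not stated in the diff.
for side in (112, 56, 28, 14, 7):
    print(side, side * side, 1.0 / (side * side))
# -> 112 12544 ..., 56 3136 ..., 28 784 ..., 14 196 ..., 7 49 ...
```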
File diff suppressed because it is too large