!4460 update the codes of bert_thor

Merge pull request !4460 from zongha/master
5 years ago · eb4749642d
parent df46f27053 da142ccfd3
commit eb4749642d
13 changed files with 138 additions and 275 deletions
--- a/model_zoo/official/nlp/bert_thor/README.md
+++ b/model_zoo/official/nlp/bert_thor/README.md
@ -11,14 +11,14 @@ This is an example of training bert by second-order optimizer THOR. THOR is a no

 ## Running the Example
 ### Pre-Training
- Set options in `config.py`, including lossscale, optimizer and network. Click [here](https://www.mindspore.cn/tutorial/zh-CN/master/use/data_preparation/loading_the_datasets.html#tfrecord) for more information about dataset and the json schema file.
+- Set options in `config.py`, including optimizer and network. Click [here](https://www.mindspore.cn/tutorial/zh-CN/master/use/data_preparation/loading_the_datasets.html#tfrecord) for more information about dataset and the json schema file.

- Run `run_standalone_pretrain.sh` for non-distributed pre-training of BERT-base and BERT-NEZHA model.
+- Run `run_standalone_pretrain.sh` for non-distributed pre-training of BERT-base, BERT-NEZHA and BERT-large model.

    ``` bash   
    sh scripts/run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR
    ```
- Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.
+- Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base, BERT-NEZHA and BERT-large model.

    ``` bash   
    sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR RANK_TABLE_FILE
@ -30,7 +30,7 @@ This is an example of training bert by second-order optimizer THOR. THOR is a no
 usage: run_pretrain.py  [--distribute DISTRIBUTE] [--epoch_size N] [----device_num N] [--device_id N] 
                        [--enable_save_ckpt ENABLE_SAVE_CKPT]
                        [--enable_lossscale ENABLE_LOSSSCALE] [--do_shuffle DO_SHUFFLE]
-                        [--enable_data_sink ENABLE_DATA_SINK] [--data_sink_steps N] [--checkpoint_path CHECKPOINT_PATH]
+                        [--enable_data_sink ENABLE_DATA_SINK] [--data_sink_steps N] [--save_checkpoint_path CHECKPOINT_PATH]
                        [--save_checkpoint_steps N] [--save_checkpoint_num N] 
                        [--data_dir DATA_DIR] [--schema_dir SCHEMA_DIR]

@ -44,7 +44,7 @@ options:
    --do_shuffle               enable shuffle: "true" | "false", default is "true"
    --enable_data_sink         enable data sink: "true" | "false", default is "true"
    --data_sink_steps          set data sink steps: N, default is 1
-    --checkpoint_path          path to save checkpoint files: PATH, default is ""
+    --save_checkpoint_path     path to save checkpoint files: PATH, default is ""
    --save_checkpoint_steps    steps for saving checkpoint files: N, default is 1000
    --save_checkpoint_num      number for saving checkpoint files: N, default is 1
    --data_dir                 path to dataset directory: PATH, default is ""
@ -55,7 +55,7 @@ It contains of parameters of BERT model and options for training, which is set i
 ### Options:
 ```
 config.py:
-    bert_network                    version of BERT model: base | nezha, default is base
+    bert_network                    version of BERT model: base | nezha | large, default is large
    optimizer                       optimizer used in the network: AdamWerigtDecayDynamicLR | Lamb | Momentum | Thor, default is "Thor"

 ```
@ -63,7 +63,7 @@ config.py:
 ### Parameters:
 ```
 Parameters for dataset and network (Pre-Training/Evaluation):
-    batch_size                      batch size of input dataset: N, default is 8
+    batch_size                      batch size of input dataset: N, default is 12
    seq_length                      length of input sequence: N, default is 128
    vocab_size                      size of each embedding vector: N, must be consistant with the dataset you use. Default is 21136
    hidden_size                     size of bert encoder layers: N, default is 768
@ -87,7 +87,7 @@ Parameters for optimizer:
    momentum                        momentum for the moving average: Q
    weight_decay                    weight decay: Q
    loss_scale                      loss scale: N
-    frequency                       the step interval to update second-order information matrix: N, default is 10
-    batch_size                      batch size of input dataset: N, default is 8
+    frequency                       the step interval to update second-order information matrix: N, default is 100
+    batch_size                      batch size of input dataset: N, default is 12
 ```

--- a/model_zoo/official/nlp/bert_thor/run_pretrain.py
+++ b/model_zoo/official/nlp/bert_thor/run_pretrain.py
@ -19,7 +19,6 @@ python run_pretrain.py

 import argparse
 import os
-
 import numpy
 from src import BertNetworkWithLoss, BertTrainOneStepCell, BertTrainOneStepWithLossScaleCell
 from src.bert_net_config import bert_net_cfg
@ -27,10 +26,8 @@ from src.config import cfg
 from src.dataset import create_bert_dataset
 from src.lr_generator import get_bert_lr, get_bert_damping
 from src.model_thor import Model
-# from src.thor_for_bert import THOR
 from src.thor_for_bert_arg import THOR
 from src.utils import LossCallBack, BertLearningRate
-
 import mindspore.common.dtype as mstype
 import mindspore.communication.management as D
 from mindspore import context
@ -69,8 +66,8 @@ def run_pretrain():
    parser.add_argument("--schema_dir", type=str, default="", help="Schema path, it is better to use absolute path")

    args_opt = parser.parse_args()
-    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, device_id=args_opt.device_id,
-                        save_graphs=True)
+    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target,
+                        device_id=args_opt.device_id, save_graphs=False)
    context.set_context(reserve_class_name_in_scope=False)
    context.set_context(variable_memory_max_size="30GB")
    ckpt_save_dir = args_opt.save_checkpoint_path
@ -165,15 +162,13 @@ def run_pretrain():
        optimizer = THOR(filter(lambda x: x.requires_grad, net_with_loss.get_parameters()), lr, cfg.Thor.momentum,
                         filter(lambda x: 'matrix_A' in x.name, net_with_loss.get_parameters()),
                         filter(lambda x: 'matrix_G' in x.name, net_with_loss.get_parameters()),
-                         filter(lambda x: 'A_inv_max' in x.name, net_with_loss.get_parameters()),
-                         filter(lambda x: 'G_inv_max' in x.name, net_with_loss.get_parameters()),
                         cfg.Thor.weight_decay, cfg.Thor.loss_scale, bert_net_cfg.num_hidden_layers,
                         bert_net_cfg.batch_size, damping)
    else:
-        raise ValueError("Don't support optimizer {}, only support [Lamb, Momentum, AdamWeightDecay]".
+        raise ValueError("Don't support optimizer {}, only support [Lamb, Momentum, AdamWeightDecay, Thor]".
                         format(cfg.optimizer))
    callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack()]
-    if args_opt.enable_save_ckpt == "true":
+    if args_opt.enable_save_ckpt == "true" and rank == 0:
        config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
                                     keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert', directory=ckpt_save_dir, config=config_ck)
--- a/model_zoo/official/nlp/bert_thor/scripts/run_distribute_pretrain.sh
+++ b/model_zoo/official/nlp/bert_thor/scripts/run_distribute_pretrain.sh
@ -37,25 +37,26 @@ do

    rm -rf LOG$i
    mkdir ./LOG$i
-    cp  *.py ./LOG$i
-    cp -r src ./LOG$i
+    cp  ../*.py ./LOG$i
+    cp -r ../src ./LOG$i
    cd ./LOG$i || exit
-    echo "start training for rank $i, device $DEVICE_ID"
+    echo "start training for rank $RANK_ID, device $DEVICE_ID"
    env > env.log
-    python ../run_pretrain.py  \
+    python run_pretrain.py  \
    --distribute="true" \
    --epoch_size=$EPOCH_SIZE \
    --device_id=$DEVICE_ID \
    --device_num=$RANK_SIZE \
    --enable_save_ckpt="true" \
    --enable_lossscale="false" \
-    --do_shuffle="true" \
+    --do_shuffle="false" \
    --enable_data_sink="true" \
    --data_sink_steps=1000 \
    --load_checkpoint_path="" \
-    --save_checkpoint_steps=5000 \
+    --save_checkpoint_path='./' \
+    --save_checkpoint_steps=1000 \
    --save_checkpoint_num=30 \
    --data_dir=$DATA_DIR \
    --schema_dir=$SCHEMA_DIR > log.txt 2>&1 &
    cd ../
-done
+done
--- a/model_zoo/official/nlp/bert_thor/scripts/run_standalone_pretrain.sh
+++ b/model_zoo/official/nlp/bert_thor/scripts/run_standalone_pretrain.sh
@ -20,27 +20,39 @@ echo "bash run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR"
 echo "for example: bash run_standalone_pretrain.sh 0 40 /path/zh-wiki/ /path/Schema.json"
 echo "=============================================================================================================="

-DEVICE_ID=$1
 EPOCH_SIZE=$2
 DATA_DIR=$3
 SCHEMA_DIR=$4

-mkdir -p ms_log 
-PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
-CUR_DIR=`pwd`
-export GLOG_log_dir=${CUR_DIR}/ms_log
-export GLOG_logtostderr=0
-python ${PROJECT_DIR}/../run_pretrain.py  \
-    --distribute="false" \
-    --epoch_size=$EPOCH_SIZE \
-    --device_id=$DEVICE_ID \
-    --enable_save_ckpt="true" \
-    --enable_lossscale="true" \
-    --do_shuffle="true" \
-    --enable_data_sink="true" \
-    --data_sink_steps=1 \
-    --load_checkpoint_path="" \
-    --save_checkpoint_steps=10000 \
-    --save_checkpoint_num=1 \
-    --data_dir=$DATA_DIR \
-    --schema_dir=$SCHEMA_DIR > log.txt 2>&1 &
+ulimit -u unlimited
+export DEVICE_ID=$1
+export RANK_SIZE=1
+
+if [ -d "LOG" ];
+then
+    rm -rf ./LOG
+fi
+mkdir ./LOG
+cp  ../*.py ./LOG
+cp -r ../src ./LOG
+cd ./LOG || exit
+echo "start training for device $DEVICE_ID"
+env > env.log
+python run_pretrain.py  \
+--distribute="false" \
+--epoch_size=$EPOCH_SIZE \
+--device_id=$DEVICE_ID \
+--device_num=$RANK_SIZE \
+--enable_save_ckpt="true" \
+--enable_lossscale="false" \
+--do_shuffle="false" \
+--enable_data_sink="true" \
+--data_sink_steps=1000 \
+--load_checkpoint_path="" \
+--save_checkpoint_path='./' \
+--save_checkpoint_steps=5000 \
+--save_checkpoint_num=20 \
+--data_dir=$DATA_DIR \
+--schema_dir=$SCHEMA_DIR > log.txt 2>&1 &
+cd ../
+
--- a/model_zoo/official/nlp/bert_thor/src/bert_for_pre_training.py
+++ b/model_zoo/official/nlp/bert_thor/src/bert_for_pre_training.py
@ -35,6 +35,8 @@ from .thor_layer import Dense_Thor

 damping = get_bert_damping()
 loss_scale = cfg.Thor.loss_scale
+frequency = cfg.Thor.frequency
+batch_size = cfg.Thor.batch_size
 GRADIENT_CLIP_TYPE = 1
 GRADIENT_CLIP_VALUE = 1.0

@ -91,9 +93,9 @@ class GetMaskedLMOutput(nn.Cell):
                                bias_init='zeros',
                                damping=damping,
                                loss_scale=loss_scale,
-                                frequency=1,
+                                frequency=frequency,
                                activation=config.hidden_act,
-                                batch_size=config.batch_size).to_float(config.compute_type)
+                                batch_size=batch_size).to_float(config.compute_type)
        self.layernorm = nn.LayerNorm((config.hidden_size,)).to_float(config.compute_type)
        self.output_bias = Parameter(
            initializer(
--- a/model_zoo/official/nlp/bert_thor/src/bert_model.py
+++ b/model_zoo/official/nlp/bert_thor/src/bert_model.py
@ -34,6 +34,7 @@ from .thor_layer import Dense_Thor, Embedding_Thor

 damping = get_bert_damping()
 loss_scale = cfg.Thor.loss_scale
+frequency = cfg.Thor.frequency
 batch_size = cfg.Thor.batch_size


@ -200,11 +201,10 @@ class EmbeddingPostprocessor(nn.Cell):
            use_one_hot_embeddings=use_one_hot_embeddings,
            initializer_range=initializer_range,
            name='embedding_table',
-            is_expand=False,
            batch_size=batch_size,
            damping=damping,
            loss_scale=loss_scale,
-            frequency=1)
+            frequency=frequency)
        self.shape_flat = (-1,)
        self.one_hot = P.OneHot()
        self.on_value = Tensor(1.0, mstype.float32)
@ -225,11 +225,10 @@ class EmbeddingPostprocessor(nn.Cell):
            use_one_hot_embeddings=use_one_hot_embeddings,
            initializer_range=initializer_range,
            name='full_position_embeddings',
-            is_expand=False,
            batch_size=batch_size,
            damping=damping,
            loss_scale=loss_scale,
-            frequency=1)
+            frequency=frequency)
        self.position_ids = Tensor(np.arange(seq).reshape(-1, seq).astype(np.int32))
        self.layernorm = nn.LayerNorm((embedding_size,))

@ -274,7 +273,7 @@ class BertOutput(nn.Cell):
                                bias_init='zeros',
                                damping=damping,
                                loss_scale=loss_scale,
-                                frequency=1,
+                                frequency=frequency,
                                activation=None,
                                batch_size=batch_size).to_float(compute_type)
        self.dropout = nn.Dropout(1 - dropout_prob)
@ -488,7 +487,7 @@ class BertAttention(nn.Cell):
                                      bias_init='zeros',
                                      damping=damping,
                                      loss_scale=loss_scale,
-                                      frequency=1,
+                                      frequency=frequency,
                                      activation=query_act,
                                      batch_size=batch_size).to_float(compute_type)
        self.key_layer = Dense_Thor(in_channels=to_tensor_width,
@ -498,7 +497,7 @@ class BertAttention(nn.Cell):
                                    bias_init='zeros',
                                    damping=damping,
                                    loss_scale=loss_scale,
-                                    frequency=1,
+                                    frequency=frequency,
                                    activation=key_act,
                                    batch_size=batch_size).to_float(compute_type)
        self.value_layer = Dense_Thor(in_channels=to_tensor_width,
@ -508,7 +507,7 @@ class BertAttention(nn.Cell):
                                      bias_init='zeros',
                                      damping=damping,
                                      loss_scale=loss_scale,
-                                      frequency=1,
+                                      frequency=frequency,
                                      activation=value_act,
                                      batch_size=batch_size).to_float(compute_type)
        self.shape_from = (batch_size, from_seq_length, num_attention_heads, size_per_head)
@ -764,7 +763,7 @@ class BertEncoderCell(nn.Cell):
                                       bias_init='zeros',
                                       damping=damping,
                                       loss_scale=loss_scale,
-                                       frequency=1,
+                                       frequency=frequency,
                                       activation=hidden_act,
                                       batch_size=batch_size).to_float(compute_type)
        self.output = BertOutput(in_channels=intermediate_size,
@ -945,11 +944,10 @@ class BertModel(nn.Cell):
            use_one_hot_embeddings=use_one_hot_embeddings,
            initializer_range=config.initializer_range,
            name='embedding_table',
-            is_expand=True,
            batch_size=batch_size,
            damping=damping,
            loss_scale=loss_scale,
-            frequency=1)
+            frequency=frequency)
        self.bert_embedding_postprocessor = EmbeddingPostprocessor(
            embedding_size=self.embedding_size,
            embedding_shape=output_embedding_shape,
@ -991,7 +989,7 @@ class BertModel(nn.Cell):
                                bias_init='zeros',
                                damping=damping,
                                loss_scale=loss_scale,
-                                frequency=1,
+                                frequency=frequency,
                                activation="tanh",
                                batch_size=batch_size).to_float(config.compute_type)
        self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask(config)
--- a/model_zoo/official/nlp/bert_thor/src/config.py
+++ b/model_zoo/official/nlp/bert_thor/src/config.py
@ -19,9 +19,6 @@ from easydict import EasyDict as edict

 cfg = edict({
    'bert_network': 'large',
-    'loss_scale_value': 65536,
-    'scale_factor': 2,
-    'scale_window': 1000,
    'optimizer': 'Thor',
    'AdamWeightDecay': edict({
        'learning_rate': 3e-5,
@ -49,7 +46,7 @@ cfg = edict({
        'momentum': 0.9,
        'weight_decay': 5e-4,
        'loss_scale': 1,
-        'frequency': 10,
-        'batch_size': 8,
+        'frequency': 100,
+        'batch_size': 12,
    }),
 })
--- a/model_zoo/official/nlp/bert_thor/src/dataset.py
+++ b/model_zoo/official/nlp/bert_thor/src/dataset.py
@ -16,7 +16,6 @@
 Data operations, will be used in run_pretrain.py
 """
 import os
-
 import mindspore.common.dtype as mstype
 import mindspore.dataset.engine.datasets as de
 import mindspore.dataset.transforms.c_transforms as C
@ -37,7 +36,7 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None,
                            columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
                                          "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
                            shuffle=de.Shuffle.FILES if do_shuffle == "true" else False,
-                            num_shards=device_num, shard_id=rank, shard_equal_rows=True)
+                            num_shards=device_num, shard_id=rank, shard_equal_rows=False)
    ori_dataset_size = ds.get_dataset_size()
    print('origin dataset size: ', ori_dataset_size)
    type_cast_op = C.TypeCast(mstype.int32)
--- a/model_zoo/official/nlp/bert_thor/src/grad_reducer_thor1.py
+++ b/model_zoo/official/nlp/bert_thor/src/grad_reducer_thor1.py
@ -80,7 +80,7 @@ def _tensors_cast_datatype(datatype, grad):
    return F.cast(grad, datatype)


-class DistributedGradReducerThor1(Cell):
+class DistributedGradReducerThor(Cell):
    """
    A distributed optimizer.

@ -154,7 +154,7 @@ class DistributedGradReducerThor1(Cell):
    """

    def __init__(self, parameters, group, mean=True, degree=None):
-        super(DistributedGradReducerThor1, self).__init__(auto_prefix=False)
+        super(DistributedGradReducerThor, self).__init__(auto_prefix=False)
        self.hyper_map = C.HyperMap()
        self.mul = P.Mul()
        if degree is None:
@ -168,7 +168,7 @@ class DistributedGradReducerThor1(Cell):
        _init_optimizer_allreduce(group)

    def construct(self, grads):
-        """construct of DistributedGradReducerThor1"""
+        """construct of DistributedGradReducerThor"""
        # In some circumstances, the data precision of grads could be mixed with float16 and float32. Thus, the
        # result of AllReduce is unreliable. To solve the problem, grads should be cast to float32 before AllReduce,
        # and cast back after the operation.
--- a/model_zoo/official/nlp/bert_thor/src/lr_generator.py
+++ b/model_zoo/official/nlp/bert_thor/src/lr_generator.py
@ -58,7 +58,7 @@ def get_poly_lr(global_step, lr_init, lr_end, lr_max, warmup_steps, total_steps,
 # bert kfac hyperparam setting
 def get_bert_lr():
    learning_rate = Tensor(
-        get_poly_lr(global_step=0, lr_init=0.0, lr_end=1e-6, lr_max=4e-4, warmup_steps=0, total_steps=30000,
+        get_poly_lr(global_step=0, lr_init=0.0, lr_end=1e-6, lr_max=3.1e-3, warmup_steps=0, total_steps=30000,
                    poly_power=1))
    return learning_rate

--- a/model_zoo/official/nlp/bert_thor/src/thor_for_bert.py
+++ b/model_zoo/official/nlp/bert_thor/src/thor_for_bert.py
--- a/model_zoo/official/nlp/bert_thor/src/thor_for_bert_arg.py
+++ b/model_zoo/official/nlp/bert_thor/src/thor_for_bert_arg.py
--- a/model_zoo/official/nlp/bert_thor/src/thor_layer.py
+++ b/model_zoo/official/nlp/bert_thor/src/thor_layer.py
@ -14,7 +14,6 @@
 # ============================================================================
 """thor_layer"""
 import numpy as np
-
 import mindspore.common.dtype as mstype
 from mindspore._checkparam import check_bool, check_int_positive
 from mindspore.common.initializer import TruncatedNormal, initializer
@ -24,7 +23,6 @@ from mindspore.nn.cell import Cell
 from mindspore.nn.layer.activation import get_activation
 from mindspore.ops import operations as P

-
 class Embedding_Thor(Cell):
    """
    A embeddings lookup table with a fixed dictionary and size.
@ -37,7 +35,6 @@ class Embedding_Thor(Cell):
        use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False.
        initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.
    """
-
    def __init__(self,
                 vocab_size,
                 embedding_size,
@ -45,11 +42,10 @@ class Embedding_Thor(Cell):
                 use_one_hot_embeddings=False,
                 initializer_range=0.02,
                 name='embedding_table',
-                 is_expand=False,
                 batch_size=12,
                 damping=0.03,
                 loss_scale=1,
-                 frequency=10,
+                 frequency=100,
                 ):
        super(Embedding_Thor, self).__init__()
        self.vocab_size = vocab_size
@ -59,7 +55,6 @@ class Embedding_Thor(Cell):
                                          [vocab_size, embedding_size]),
                                         name=name)
        self.thor = True
-        self.is_expand = is_expand
        self.expand = P.ExpandDims()
        self.shape_flat = (-1,)
        self.gather = P.GatherV2()
@ -71,13 +66,11 @@ class Embedding_Thor(Cell):
        self.em_shape = tuple(embedding_shape)
        self.shape = P.Shape()
        self.loss_scale = Tensor(1 / loss_scale, mstype.float16)
-        self.matrix_A_inv = Parameter(Tensor(np.zeros([vocab_size]).astype(np.float32)), name='matrix_A_inv',
-                                      requires_grad=False)
+
+        self.matrix_A_inv = Parameter(Tensor(np.zeros([vocab_size]).astype(np.float16)),
+                                      name='matrix_A_inv', requires_grad=False)
        self.matrix_G_inv = Parameter(Tensor(np.zeros([embedding_size, embedding_size]).astype(np.float16)),
                                      name="matrix_G_inv", requires_grad=False)
-        self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), name="A_inv_max", requires_grad=False)
-        self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), name="G_inv_max", requires_grad=False)
-        self.fused_abs_max = P.CusFusedAbsMax1()
        self.fake_G = Tensor(np.zeros([embedding_size, embedding_size]).astype(np.float16))
        self.dampingA = Tensor(np.ones([vocab_size]).astype(np.float32))
        self.dampingG = Tensor(np.identity(embedding_size), mstype.float32)
@ -117,9 +110,6 @@ class Embedding_Thor(Cell):
        matrix_G = matrix_G + damping * dampingG
        matrix_G_inv = self.cholesky(matrix_G)
        matrix_G_inv = self.vector_matmul(matrix_G_inv, matrix_G_inv)
-        matrix_G_inv_max = self.fused_abs_max(matrix_G_inv)
-        matrix_G_inv_max = self.fused_abs_max(matrix_G_inv_max)
-        self.G_inv_max = matrix_G_inv_max
        matrix_G_inv = self.matrix_combine(matrix_G_inv)
        matrix_G_inv = self.cast(matrix_G_inv, mstype.float16)
        self.matrix_G_inv = matrix_G_inv
@ -127,8 +117,6 @@ class Embedding_Thor(Cell):

    def construct(self, input_ids):
        """construct of Embedding_Thor"""
-        if self.is_expand:
-            input_ids = self.expand(input_ids, -1)
        flat_ids = self.reshape(input_ids, self.shape_flat)
        if self.use_one_hot_embeddings:
            one_hot_ids = self.one_hot(flat_ids, self.vocab_size, self.on_value, self.off_value)
@ -146,6 +134,7 @@ class Embedding_Thor(Cell):
                dampingA = self.cast(self.dampingA, mstype.float32)
                matrix_A = matrix_A + damping * dampingA
                matrix_A_inv = self.inv(matrix_A)
+                matrix_A_inv = self.cast(matrix_A_inv, mstype.float16)
                self.matrix_A_inv = matrix_A_inv
                self.matrix_G_inv = self.fake_G
                output_for_reshape = self.gather(self.embedding_table, flat_ids, 0)
@ -156,11 +145,9 @@ class Embedding_Thor(Cell):
        output = self.reshape(output_for_reshape, self.em_shape)
        return output, self.embedding_table

-
 class Dense_Thor(Cell):
    """Dense_Thor"""

-    # @cell_attr_register(attrs=['has_bias', 'activation', 'in_channels', 'out_channels'])
    def __init__(self,
                 in_channels,
                 out_channels,
@ -168,7 +155,7 @@ class Dense_Thor(Cell):
                 bias_init='zeros',
                 damping=0.03,
                 loss_scale=1,
-                 frequency=10,
+                 frequency=100,
                 has_bias=False,
                 activation=None,
                 batch_size=12):
@ -200,9 +187,6 @@ class Dense_Thor(Cell):
                                      name='matrix_A_inv', requires_grad=False)
        self.matrix_G_inv = Parameter(Tensor(np.zeros([out_channels, out_channels]).astype(np.float16)),
                                      name="matrix_G_inv", requires_grad=False)
-        self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), name="A_inv_max", requires_grad=False)
-        self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), name="G_inv_max", requires_grad=False)
-        self.fused_abs_max = P.CusFusedAbsMax1()
        self.fake_G = Tensor(np.zeros([out_channels, out_channels]).astype(np.float16))

        self.matmul = P.MatMul(transpose_b=True)
@ -250,9 +234,6 @@ class Dense_Thor(Cell):
        matrix_G = matrix_G + damping * dampingG
        matrix_G_inv = self.cholesky(matrix_G)
        matrix_G_inv = self.vector_matmul(matrix_G_inv, matrix_G_inv)
-        matrix_G_inv_max = self.fused_abs_max(matrix_G_inv)
-        matrix_G_inv_max = self.fused_abs_max(matrix_G_inv_max)
-        self.G_inv_max = matrix_G_inv_max
        matrix_G_inv = self.matrix_combine(matrix_G_inv)
        matrix_G_inv = self.cast(matrix_G_inv, mstype.float16)
        self.matrix_G_inv = matrix_G_inv
@ -265,7 +246,6 @@ class Dense_Thor(Cell):
            shape = self.shape(x)
            normalizer = self.cast(shape[0], mstype.float32)
            matrix_A = self.mul(inputs, 1.0 / normalizer)
-
            damping_step = self.gather(self.damping, self.cov_step, self.axis)
            damping_step = self.cast(damping_step, mstype.float32)
            damping = self.sqrt(damping_step)
@ -273,9 +253,6 @@ class Dense_Thor(Cell):
            matrix_A = matrix_A + damping * dampingA
            matrix_A_inv = self.cholesky(matrix_A)
            matrix_A_inv = self.vector_matmul(matrix_A_inv, matrix_A_inv)
-            matrix_A_inv_max = self.fused_abs_max(matrix_A_inv)
-            matrix_A_inv_max = self.fused_abs_max(matrix_A_inv_max)
-            self.A_inv_max = matrix_A_inv_max
            matrix_A_inv = self.matrix_combine(matrix_A_inv)
            matrix_A_inv = self.cast(matrix_A_inv, mstype.float16)
            self.matrix_A_inv = matrix_A_inv