!1398 Update the bert scripts according to the rules of modelzoo
Merge pull request !1398 from chenhaozhe/update_bert_script
commit b46ad9a1bb
@@ -0,0 +1,121 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""fused layernorm"""
from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore.common.parameter import Parameter
from mindspore.common.initializer import initializer
from mindspore.ops.primitive import constexpr
import mindspore.common.dtype as mstype
from mindspore.nn.cell import Cell

import numpy as np


__all__ = ['FusedLayerNorm']


@constexpr
def get_shape_for_norm(x_shape, begin_norm_axis):
    """Compute the 4-D NCHW shape that lets BatchNorm normalize over the layernorm axes."""
    print("input_shape: ", x_shape)  # runs once, at graph-compile time (constexpr)
    norm_shape = x_shape[begin_norm_axis:]
    output_shape = (1, -1, 1, int(np.prod(norm_shape)))
    print("output_shape: ", output_shape)
    return output_shape


class FusedLayerNorm(Cell):
    r"""
    Applies Layer Normalization over a mini-batch of inputs.

    Layer normalization is widely used in recurrent neural networks. It applies
    normalization over a mini-batch of inputs for each single training case as described
    in the paper `Layer Normalization <https://arxiv.org/pdf/1607.06450.pdf>`_. Unlike batch
    normalization, layer normalization performs exactly the same computation at training and
    testing times. It can be described by the following formula, which is applied across all
    channels and pixels of each single sample.

    .. math::
        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    Args:
        normalized_shape (Union[tuple[int], list[int]]): The normalization is performed over axes
            `begin_norm_axis ... R - 1`.
        begin_norm_axis (int): The first normalization dimension: normalization will be performed along dimensions
            `begin_norm_axis: rank(inputs)`, the value should be in [-1, rank(input)). Default: -1.
        begin_params_axis (int): The first parameter (beta, gamma) dimension: scale and centering parameters
            will have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with
            the normalized inputs accordingly, the value should be in [-1, rank(input)). Default: -1.
        gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'ones'.
        beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'zeros'.
        use_batch_norm (bool): Whether to use batch normalization to perform the normalization. Default: False.

    Inputs:
        - **input_x** (Tensor) - The shape of `input_x` is :math:`(x_1, x_2, ..., x_R)`,
          and `input_shape[begin_norm_axis:]` is equal to `normalized_shape`.

    Outputs:
        Tensor, the normalized and scaled offset tensor, has the same shape and data type as the `input_x`.

    Examples:
        >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32)
        >>> shape1 = x.shape[1:]
        >>> m = FusedLayerNorm(shape1, begin_norm_axis=1, begin_params_axis=1)
        >>> m(x)
    """
    def __init__(self,
                 normalized_shape,
                 begin_norm_axis=-1,
                 begin_params_axis=-1,
                 gamma_init='ones',
                 beta_init='zeros',
                 use_batch_norm=False):
        super(FusedLayerNorm, self).__init__()
        if not isinstance(normalized_shape, (tuple, list)):
            raise TypeError("The type of 'normalized_shape' should be tuple[int] or list[int], but '{}' type is {}."
                            .format(normalized_shape, type(normalized_shape)))
        self.normalized_shape = normalized_shape
        self.begin_norm_axis = begin_norm_axis
        self.begin_params_axis = begin_params_axis
        self.gamma = Parameter(initializer(
            gamma_init, normalized_shape), name="gamma")
        self.beta = Parameter(initializer(
            beta_init, normalized_shape), name="beta")
        self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis)

        self.batch_norm = P.BatchNorm(is_training=True, epsilon=1e-5)
        self.use_batch_norm = use_batch_norm

    def construct(self, input_x):
        if self.use_batch_norm and self.training:
            # Emulate layer normalization with BatchNorm: fold the normalized axes into a
            # single "spatial" dimension so per-channel statistics equal the per-sample
            # layernorm statistics, then restore the original shape.
            ones = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 1.0)
            zeros = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 0.0)
            shape_x = F.shape(input_x)
            norm_shape = get_shape_for_norm(shape_x, self.begin_norm_axis)
            input_x = F.reshape(input_x, norm_shape)
            output, _, _, _, _, _ = self.batch_norm(input_x, ones, zeros, None, None)
            output = F.reshape(output, shape_x)
            y = output * self.gamma + self.beta
        else:
            y, _, _ = self.layer_norm(input_x, self.gamma, self.beta)
        return y

    def extend_repr(self):
        """Display instance object as string."""
        s = 'normalized_shape={}, begin_norm_axis={}, begin_params_axis={}, gamma={}, beta={}'.format(
            self.normalized_shape, self.begin_norm_axis, self.begin_params_axis, self.gamma, self.beta)
        return s
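A minimal usage sketch (not part of the diff), assuming the file above is saved as fused_layer_norm.py; the shapes and values are illustrative only.

import numpy as np
from mindspore import Tensor
from fused_layer_norm import FusedLayerNorm

x = Tensor(np.random.randn(2, 8, 16).astype(np.float32))
# Normalize over the last (hidden) axis; gamma and beta live on that axis too.
norm = FusedLayerNorm((16,), begin_norm_axis=-1, begin_params_axis=-1)
y = norm(x)  # same shape and dtype as x

Constructing the cell with use_batch_norm=True instead routes training-time normalization through P.BatchNorm on the reshaped (1, -1, 1, prod(normalized_shape)) view computed by get_shape_for_norm.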
@@ -0,0 +1,177 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

'''
CRF script.
'''

import numpy as np
import mindspore.nn as nn
from mindspore.ops import operations as P
from mindspore.common.tensor import Tensor
from mindspore.common.parameter import Parameter
import mindspore.common.dtype as mstype


class CRF(nn.Cell):
    '''
    Conditional Random Field
    Args:
        tag_to_index: The dict for tag-to-index mapping with extra "<START>" and "<STOP>" signs.
        batch_size: Batch size, i.e., the length of the first dimension.
        seq_length: Sequence length, i.e., the length of the second dimension.
        is_training: Specifies whether to use training mode.
    Returns:
        Training mode: Tensor, total loss.
        Evaluation mode: Tuple, the index for each step with the highest score; Tuple, the index for the last
        step with the highest score.
    '''
    def __init__(self, tag_to_index, batch_size=1, seq_length=128, is_training=True):
        super(CRF, self).__init__()
        self.target_size = len(tag_to_index)
        self.is_training = is_training
        self.tag_to_index = tag_to_index
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.START_TAG = "<START>"
        self.STOP_TAG = "<STOP>"
        self.START_VALUE = Tensor(self.target_size-2, dtype=mstype.int32)
        self.STOP_VALUE = Tensor(self.target_size-1, dtype=mstype.int32)
        # Random transition matrix; transitions into <START> and out of <STOP> are forbidden.
        transitions = np.random.normal(size=(self.target_size, self.target_size)).astype(np.float32)
        transitions[tag_to_index[self.START_TAG], :] = -10000
        transitions[:, tag_to_index[self.STOP_TAG]] = -10000
        self.transitions = Parameter(Tensor(transitions), name="transition_matrix")
        self.cat = P.Concat(axis=-1)
        self.argmax = P.ArgMaxWithValue(axis=-1)
        self.log = P.Log()
        self.exp = P.Exp()
        self.sum = P.ReduceSum()
        self.tile = P.Tile()
        self.reduce_sum = P.ReduceSum(keep_dims=True)
        self.reshape = P.Reshape()
        self.expand = P.ExpandDims()
        self.mean = P.ReduceMean()
        init_alphas = np.ones(shape=(self.batch_size, self.target_size)) * -10000.0
        init_alphas[:, self.tag_to_index[self.START_TAG]] = 0.
        self.init_alphas = Tensor(init_alphas, dtype=mstype.float32)
        self.cast = P.Cast()
        self.reduce_max = P.ReduceMax(keep_dims=True)
        self.on_value = Tensor(1.0, dtype=mstype.float32)
        self.off_value = Tensor(0.0, dtype=mstype.float32)
        self.onehot = P.OneHot()

    def log_sum_exp(self, logits):
        '''
        Compute the log_sum_exp score for normalization factor.
        '''
        max_score = self.reduce_max(logits, -1)  # subtract the max for numerical stability
        score = self.log(self.reduce_sum(self.exp(logits - max_score), -1))
        score = max_score + score
        return score

    def _realpath_score(self, features, label):
        '''
        Compute the emission and transition score for the real path.
        '''
        label = label * 1  # multiply by 1 to get a fresh tensor to operate on
        # Prepend the START tag so transitions cover <START> -> first label.
        concat_A = self.tile(self.reshape(self.START_VALUE, (1,)), (self.batch_size,))
        concat_A = self.reshape(concat_A, (self.batch_size, 1))
        labels = self.cat((concat_A, label))
        onehot_label = self.onehot(label, self.target_size, self.on_value, self.off_value)
        # Emission score of the gold labels.
        emits = features * onehot_label
        labels = self.onehot(labels, self.target_size, self.on_value, self.off_value)
        label1 = labels[:, 1:, :]
        label2 = labels[:, :self.seq_length, :]
        label1 = self.expand(label1, 3)
        label2 = self.expand(label2, 2)
        # Transition score between consecutive gold labels.
        label_trans = label1 * label2
        transitions = self.expand(self.expand(self.transitions, 0), 0)
        trans = transitions * label_trans
        score = self.sum(emits, (1, 2)) + self.sum(trans, (1, 2, 3))
        # Transition score from the last label into the <STOP> tag.
        stop_value_index = labels[:, (self.seq_length-1):self.seq_length, :]
        stop_value = self.transitions[(self.target_size-1):self.target_size, :]
        stop_score = stop_value * self.reshape(stop_value_index, (self.batch_size, self.target_size))
        score = score + self.sum(stop_score, 1)
        score = self.reshape(score, (self.batch_size, -1))
        return score

    def _normalization_factor(self, features):
        '''
        Compute the total score for all the paths.
        '''
        forward_var = self.init_alphas
        forward_var = self.expand(forward_var, 1)
        for idx in range(self.seq_length):
            feat = features[:, idx:(idx+1), :]
            emit_score = self.reshape(feat, (self.batch_size, self.target_size, 1))
            next_tag_var = emit_score + self.transitions + forward_var
            forward_var = self.log_sum_exp(next_tag_var)
            forward_var = self.reshape(forward_var, (self.batch_size, 1, self.target_size))
        terminal_var = forward_var + self.reshape(self.transitions[(self.target_size-1):self.target_size, :], (1, -1))
        alpha = self.log_sum_exp(terminal_var)
        alpha = self.reshape(alpha, (self.batch_size, -1))
        return alpha

    def _decoder(self, features):
        '''
        Viterbi decode for evaluation.
        '''
        backpointers = ()
        forward_var = self.init_alphas
        for idx in range(self.seq_length):
            feat = features[:, idx:(idx+1), :]
            feat = self.reshape(feat, (self.batch_size, self.target_size))
            bptrs_t = ()

            next_tag_var = self.expand(forward_var, 1) + self.transitions
            best_tag_id, best_tag_value = self.argmax(next_tag_var)
            bptrs_t += (best_tag_id,)
            forward_var = best_tag_value + feat

            backpointers += (bptrs_t,)
        terminal_var = forward_var + self.reshape(self.transitions[(self.target_size-1):self.target_size, :], (1, -1))
        best_tag_id, _ = self.argmax(terminal_var)
        return backpointers, best_tag_id

    def construct(self, features, label):
        if self.is_training:
            forward_score = self._normalization_factor(features)
            gold_score = self._realpath_score(features, label)
            return_value = self.mean(forward_score - gold_score)
        else:
            path_list, tag = self._decoder(features)
            return_value = path_list, tag
        return return_value


def postprocess(backpointers, best_tag_id):
    '''
    Do postprocessing: follow the backpointers from the best final tag to recover each path.
    '''
    best_tag_id = best_tag_id.asnumpy()
    batch_size = len(best_tag_id)
    best_path = []
    for i in range(batch_size):
        best_path.append([])
        best_local_id = best_tag_id[i]
        best_path[-1].append(best_local_id)
        for bptrs_t in reversed(backpointers):
            bptrs_t = bptrs_t[0].asnumpy()
            local_idx = bptrs_t[i]
            best_local_id = local_idx[best_local_id]
            best_path[-1].append(best_local_id)
        # Pop off the start tag (we don't want to return that to the caller)
        best_path[-1].pop()
        best_path[-1].reverse()
    return best_path
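A toy usage sketch (not part of the diff), assuming the file above is saved as CRF.py; the tag set, shapes, and random features are illustrative only.

import numpy as np
from mindspore import Tensor
from CRF import CRF, postprocess

tag_to_index = {"O": 0, "B": 1, "I": 2, "<START>": 3, "<STOP>": 4}
batch_size, seq_length = 2, 4
features = Tensor(np.random.randn(batch_size, seq_length, len(tag_to_index)).astype(np.float32))
labels = Tensor(np.zeros((batch_size, seq_length), np.int32))

# Training mode returns the mean negative log-likelihood over the batch.
loss = CRF(tag_to_index, batch_size, seq_length, is_training=True)(features, labels)
# Evaluation mode returns the Viterbi backpointers plus the best final tag;
# postprocess() turns them into one tag-index list per sample.
crf_eval = CRF(tag_to_index, batch_size, seq_length, is_training=False)
backpointers, best_tag_id = crf_eval(features, labels)
best_path = postprocess(backpointers, best_tag_id)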
@@ -0,0 +1,31 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Bert Init."""
from .bert_for_pre_training import BertNetworkWithLoss, BertPreTraining, \
    BertPretrainingLoss, GetMaskedLMOutput, GetNextSentenceOutput, \
    BertTrainOneStepCell, BertTrainOneStepWithLossScaleCell
from .bert_model import BertAttention, BertConfig, BertEncoderCell, BertModel, \
    BertOutput, BertSelfAttention, BertTransformer, EmbeddingLookup, \
    EmbeddingPostprocessor, RelaPosEmbeddingsGenerator, RelaPosMatrixGenerator, \
    SaturateCast, CreateAttentionMaskFromInputMask

__all__ = [
    "BertNetworkWithLoss", "BertPreTraining", "BertPretrainingLoss",
    "GetMaskedLMOutput", "GetNextSentenceOutput", "BertTrainOneStepCell", "BertTrainOneStepWithLossScaleCell",
    "BertAttention", "BertConfig", "BertEncoderCell", "BertModel", "BertOutput",
    "BertSelfAttention", "BertTransformer", "EmbeddingLookup",
    "EmbeddingPostprocessor", "RelaPosEmbeddingsGenerator",
    "RelaPosMatrixGenerator", "SaturateCast", "CreateAttentionMaskFromInputMask"
]
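With this __init__.py in place, callers import the public API from the package in one step instead of reaching into individual modules. A brief sketch, assuming the package directory is named src as in the modelzoo layout (the name is not shown in this diff):

# All re-exported symbols come from one place.
from src import BertConfig, BertModel, BertNetworkWithLoss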
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,73 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

'''bert clue evaluation'''

import json
import numpy as np
import mindspore.common.dtype as mstype
from mindspore.common.tensor import Tensor
import tokenization
from sample_process import label_generation, process_one_example_p
from .evaluation_config import cfg
from .CRF import postprocess

vocab_file = "./vocab.txt"
tokenizer_ = tokenization.FullTokenizer(vocab_file=vocab_file)


def process(model, text, sequence_length):
    """
    Process text.
    """
    data = [text]
    features = []
    res = []
    ids = []
    for i in data:
        feature = process_one_example_p(tokenizer_, i, max_seq_len=sequence_length)
        features.append(feature)
        input_ids, input_mask, token_type_id = feature
        input_ids = Tensor(np.array(input_ids), mstype.int32)
        input_mask = Tensor(np.array(input_mask), mstype.int32)
        token_type_id = Tensor(np.array(token_type_id), mstype.int32)
        if cfg.use_crf:
            # CRF head: Viterbi-decode the best tag sequence.
            backpointers, best_tag_id = model.predict(input_ids, input_mask, token_type_id, Tensor(1))
            best_path = postprocess(backpointers, best_tag_id)
            logits = []
            for ele in best_path:
                logits.extend(ele)
            ids = logits
        else:
            # Softmax head: take the argmax tag at every position.
            logits = model.predict(input_ids, input_mask, token_type_id, Tensor(1))
            ids = logits.asnumpy()
            ids = np.argmax(ids, axis=-1)
            ids = list(ids)
    res = label_generation(text, ids)
    return res


def submit(model, path, sequence_length):
    """
    Submit task.
    """
    data = []
    with open(path) as f:
        for line in f:
            if not line.strip():
                continue
            oneline = json.loads(line.strip())
            res = process(model, oneline["text"], sequence_length)
            print("text", oneline["text"])
            print("res:", res)
            data.append(json.dumps({"label": res}, ensure_ascii=False))
    with open("ner_predict.json", "w") as out:
        out.write("\n".join(data))
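A hedged invocation sketch (not part of the diff): submit() reads a CLUENER-style JSON-lines file, one {"text": ...} object per line, and writes one {"label": ...} object per line to ner_predict.json. The model object and file path below are placeholders, not names defined in this diff.

from cluener_evaluation import submit

# `ner_model` stands in for a trained prediction network built elsewhere in this PR.
submit(ner_model, "cluener_dev.json", sequence_length=128)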
Some files were not shown because too many files have changed in this diff