!1398 Update the bert scripts according to rules of modelzoo

Merge pull request !1398 from chenhaozhe/update_bert_script
5 years ago · b46ad9a1bb
parent 45484c690c b6aceddeab
commit b46ad9a1bb
35 changed files with 2736 additions and 619 deletions
--- a/mindspore/ops/_grad/grad_nn_ops.py
+++ b/mindspore/ops/_grad/grad_nn_ops.py
@ -308,7 +308,7 @@ def get_bprop_softmax(self):
    axis = self.axis

    def bprop(x, out, dout):
-        dx = mul(sub(dout, sum_func(mul(dout, out), axis)), out)
+        dx = mul(out, sub(dout, sum_func(mul(out, dout), axis)))
        return (dx,)

    return bprop
--- a/example/bert_clue/README.md
+++ b/example/bert_clue/README.md
@ -16,12 +16,12 @@ This example implements pre-training, fine-tuning and evaluation of [BERT-base](
 - Run `run_standalone_pretrain.sh` for non-distributed pre-training of BERT-base and BERT-NEZHA model.

    ``` bash   
-    sh run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR
+    sh scripts/run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR
    ```
 - Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.

    ``` bash   
-    sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH
+    sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH
    ```  

 ### Fine-Tuning
--- a/example/bert_clue/evaluation.py
+++ b/example/bert_clue/evaluation.py
@ -19,8 +19,6 @@ Bert evaluation script.

 import os
 import numpy as np
-from evaluation_config import cfg, bert_net_cfg
-from utils import BertNER, BertCLS
 import mindspore.common.dtype as mstype
 from mindspore import context
 from mindspore.common.tensor import Tensor
@ -28,9 +26,11 @@ import mindspore.dataset as de
 import mindspore.dataset.transforms.c_transforms as C
 from mindspore.train.model import Model
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
-from CRF import postprocess
-from cluener_evaluation import submit
-from finetune_config import tag_to_index
+from src.evaluation_config import cfg, bert_net_cfg
+from src.utils import BertNER, BertCLS
+from src.CRF import postprocess
+from src.cluener_evaluation import submit
+from src.finetune_config import tag_to_index

 class Accuracy():
    '''
--- a/example/bert_clue/finetune.py
+++ b/example/bert_clue/finetune.py
@ -18,8 +18,8 @@ Bert finetune script.
 '''

 import os
-from utils import BertFinetuneCell, BertCLS, BertNER
-from finetune_config import cfg, bert_net_cfg, tag_to_index
+from src.utils import BertFinetuneCell, BertCLS, BertNER
+from src.finetune_config import cfg, bert_net_cfg, tag_to_index
 import mindspore.common.dtype as mstype
 import mindspore.communication.management as D
 from mindspore import context
--- a/example/bert_clue/run_pretrain.py
+++ b/example/bert_clue/run_pretrain.py
@ -26,10 +26,10 @@ from mindspore.train.parallel_utils import ParallelMode
 from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
 from mindspore.train.callback import Callback, ModelCheckpoint, CheckpointConfig, TimeMonitor
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
-from mindspore.model_zoo.Bert_NEZHA import BertNetworkWithLoss, BertTrainOneStepCell, BertTrainOneStepWithLossScaleCell
 from mindspore.nn.optim import Lamb, Momentum, AdamWeightDecayDynamicLR
-from dataset import create_bert_dataset
-from config import cfg, bert_net_cfg
+from src import BertNetworkWithLoss, BertTrainOneStepCell, BertTrainOneStepWithLossScaleCell
+from src.dataset import create_bert_dataset
+from src.config import cfg, bert_net_cfg
 _current_dir = os.path.dirname(os.path.realpath(__file__))

 class LossCallBack(Callback):
@ -48,10 +48,8 @@ class LossCallBack(Callback):
        self._per_print_times = per_print_times
    def step_end(self, run_context):
        cb_params = run_context.original_args()
-        with open("./loss.log", "a+") as f:
-            f.write("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
-                                                                 str(cb_params.net_outputs)))
-            f.write('\n')
+        print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
+                                                           str(cb_params.net_outputs)))

 def run_pretrain():
    """pre-train bert_clue"""
@ -81,6 +79,11 @@ def run_pretrain():
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True,
                                          device_num=device_num)
+        from mindspore.parallel._auto_parallel_context import auto_parallel_context
+        if bert_net_cfg.num_hidden_layers == 12:
+            auto_parallel_context().set_all_reduce_fusion_split_indices([28, 55, 82, 109, 136, 163, 190, 205])
+        elif bert_net_cfg.num_hidden_layers == 24:
+            auto_parallel_context().set_all_reduce_fusion_split_indices([38, 93, 148, 203, 258, 313, 368, 397])
        D.init()
        rank = args_opt.device_id % device_num
    else:
--- a/model_zoo/bert/scripts/run_distribute_pretrain.sh
+++ b/model_zoo/bert/scripts/run_distribute_pretrain.sh
@ -16,8 +16,8 @@

 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH"
-echo "for example: sh run_distribute_pretrain.sh 8 40 /path/zh-wiki/ /path/Schema.json /path/hccl.json"
+echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH"
+echo "for example: bash run_distribute_pretrain.sh 8 40 /path/zh-wiki/ /path/Schema.json /path/hccl.json"
 echo "It is better to use absolute path."
 echo "=============================================================================================================="

@ -49,6 +49,10 @@ do
    cp  *.py ./LOG$i
    cd ./LOG$i || exit
    echo "start training for rank $i, device $DEVICE_ID"
+    mkdir -p ms_log
+    CUR_DIR=`pwd`
+    export GLOG_log_dir=${CUR_DIR}/ms_log
+    export GLOG_logtostderr=0
    env > env.log
    taskset -c $cmdopt python ../run_pretrain.py  \
    --distribute="true" \
@ -59,7 +63,7 @@ do
    --enable_lossscale="true" \
    --do_shuffle="true" \
    --enable_data_sink="true" \
-    --data_sink_steps=1 \
+    --data_sink_steps=100 \
    --checkpoint_path="" \
    --save_checkpoint_steps=10000 \
    --save_checkpoint_num=1 \
--- a/model_zoo/bert/scripts/run_standalone_pretrain.sh
+++ b/model_zoo/bert/scripts/run_standalone_pretrain.sh
@ -16,8 +16,8 @@

 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR"
-echo "for example: sh run_standalone_pretrain.sh 0 40 /path/zh-wiki/ /path/Schema.json"
+echo "bash run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR"
+echo "for example: bash run_standalone_pretrain.sh 0 40 /path/zh-wiki/ /path/Schema.json"
 echo "=============================================================================================================="

 DEVICE_ID=$1
@ -25,6 +25,10 @@ EPOCH_SIZE=$2
 DATA_DIR=$3
 SCHEMA_DIR=$4

+mkdir -p ms_log 
+CUR_DIR=`pwd`
+export GLOG_log_dir=${CUR_DIR}/ms_log
+export GLOG_logtostderr=0
 python run_pretrain.py  \
    --distribute="false" \
    --epoch_size=$EPOCH_SIZE \
@ -33,7 +37,7 @@ python run_pretrain.py  \
    --enable_lossscale="true" \
    --do_shuffle="true" \
    --enable_data_sink="true" \
-    --data_sink_steps=1 \
+    --data_sink_steps=100 \
    --checkpoint_path="" \
    --save_checkpoint_steps=10000 \
    --save_checkpoint_num=1 \
--- a/model_zoo/bert/src/CRF.py
+++ b/model_zoo/bert/src/CRF.py
--- a/mindspore/model_zoo/Bert_NEZHA/init.py
+++ b/mindspore/model_zoo/Bert_NEZHA/init.py
--- a/mindspore/model_zoo/Bert_NEZHA/bert_for_pre_training.py
+++ b/mindspore/model_zoo/Bert_NEZHA/bert_for_pre_training.py
@ -357,10 +357,10 @@ class BertTrainOneStepWithLossScaleCell(nn.Cell):
        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
            self.reducer_flag = True
        self.grad_reducer = F.identity
+        self.degree = 1
        if self.reducer_flag:
-            mean = context.get_auto_parallel_context("mirror_mean")
-            degree = get_group_size()
-            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
+            self.degree = get_group_size()
+            self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.cast = P.Cast()
        self.alloc_status = P.NPUAllocFloatStatus()
@ -411,10 +411,10 @@ class BertTrainOneStepWithLossScaleCell(nn.Cell):
                                                 masked_lm_weights,
                                                 self.cast(scaling_sens,
                                                           mstype.float32))
-        grads = self.hyper_map(F.partial(grad_scale, scaling_sens), grads)
-        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
        # apply grad reducer on grads
        grads = self.grad_reducer(grads)
+        grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads)
+        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
        self.get_status(init)
        flag_sum = self.reduce_sum(init, (0,))
        if self.is_distributed:
--- a/mindspore/model_zoo/Bert_NEZHA/bert_model.py
+++ b/mindspore/model_zoo/Bert_NEZHA/bert_model.py
@ -25,6 +25,7 @@ from mindspore.ops import operations as P
 from mindspore.ops import composite as C
 from mindspore.common.tensor import Tensor
 from mindspore.common.parameter import Parameter
+from .fused_layer_norm import FusedLayerNorm


 class BertConfig:
@ -77,7 +78,8 @@ class BertConfig:
                 input_mask_from_dataset=True,
                 token_type_ids_from_dataset=True,
                 dtype=mstype.float32,
-                 compute_type=mstype.float32):
+                 compute_type=mstype.float32,
+                 enable_fused_layernorm=False):
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.vocab_size = vocab_size
@ -96,6 +98,7 @@ class BertConfig:
        self.use_relative_positions = use_relative_positions
        self.dtype = dtype
        self.compute_type = compute_type
+        self.enable_fused_layernorm = enable_fused_layernorm


 class EmbeddingLookup(nn.Cell):
@ -240,13 +243,19 @@ class BertOutput(nn.Cell):
                 out_channels,
                 initializer_range=0.02,
                 dropout_prob=0.1,
-                 compute_type=mstype.float32):
+                 compute_type=mstype.float32,
+                 enable_fused_layernorm=False):
        super(BertOutput, self).__init__()
        self.dense = nn.Dense(in_channels, out_channels,
                              weight_init=TruncatedNormal(initializer_range)).to_float(compute_type)
        self.dropout = nn.Dropout(1 - dropout_prob)
+        self.dropout_prob = dropout_prob
        self.add = P.TensorAdd()
-        self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
+        if compute_type == mstype.float16:
+            self.layernorm = FusedLayerNorm((out_channels,),
+                                            use_batch_norm=enable_fused_layernorm).to_float(compute_type)
+        else:
+            self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
        self.cast = P.Cast()

    def construct(self, hidden_status, input_tensor):
@ -481,12 +490,13 @@ class BertAttention(nn.Cell):
            self.shape_return = (batch_size, from_seq_length, num_attention_heads * size_per_head)

        self.cast_compute_type = SaturateCast(dst_type=compute_type)
-        self._generate_relative_positions_embeddings = \
-            RelaPosEmbeddingsGenerator(length=to_seq_length,
-                                       depth=size_per_head,
-                                       max_relative_position=16,
-                                       initializer_range=initializer_range,
-                                       use_one_hot_embeddings=use_one_hot_embeddings)
+        if self.use_relative_positions:
+            self._generate_relative_positions_embeddings = \
+                RelaPosEmbeddingsGenerator(length=to_seq_length,
+                                           depth=size_per_head,
+                                           max_relative_position=16,
+                                           initializer_range=initializer_range,
+                                           use_one_hot_embeddings=use_one_hot_embeddings)

    def construct(self, from_tensor, to_tensor, attention_mask):
        # reshape 2d/3d input tensors to 2d
@ -529,7 +539,7 @@ class BertAttention(nn.Cell):
                                                     self.trans_shape_position)
            attention_scores = attention_scores + key_position_scores_r_t

-        attention_scores = self.multiply(attention_scores, self.scores_mul)
+        attention_scores = self.multiply(self.scores_mul, attention_scores)

        if self.has_attention_mask:
            attention_mask = self.expand_dims(attention_mask, 1)
@ -606,7 +616,8 @@ class BertSelfAttention(nn.Cell):
                 initializer_range=0.02,
                 hidden_dropout_prob=0.1,
                 use_relative_positions=False,
-                 compute_type=mstype.float32):
+                 compute_type=mstype.float32,
+                 enable_fused_layernorm=False):
        super(BertSelfAttention, self).__init__()
        if hidden_size % num_attention_heads != 0:
            raise ValueError("The hidden size (%d) is not a multiple of the number "
@ -634,7 +645,8 @@ class BertSelfAttention(nn.Cell):
                                 out_channels=hidden_size,
                                 initializer_range=initializer_range,
                                 dropout_prob=hidden_dropout_prob,
-                                 compute_type=compute_type)
+                                 compute_type=compute_type,
+                                 enable_fused_layernorm=enable_fused_layernorm)
        self.reshape = P.Reshape()
        self.shape = (-1, hidden_size)

@ -676,7 +688,8 @@ class BertEncoderCell(nn.Cell):
                 hidden_dropout_prob=0.1,
                 use_relative_positions=False,
                 hidden_act="gelu",
-                 compute_type=mstype.float32):
+                 compute_type=mstype.float32,
+                 enable_fused_layernorm=False):
        super(BertEncoderCell, self).__init__()
        self.attention = BertSelfAttention(
            batch_size=batch_size,
@ -688,7 +701,8 @@ class BertEncoderCell(nn.Cell):
            initializer_range=initializer_range,
            hidden_dropout_prob=hidden_dropout_prob,
            use_relative_positions=use_relative_positions,
-            compute_type=compute_type)
+            compute_type=compute_type,
+            enable_fused_layernorm=enable_fused_layernorm)
        self.intermediate = nn.Dense(in_channels=hidden_size,
                                     out_channels=intermediate_size,
                                     activation=hidden_act,
@ -697,7 +711,8 @@ class BertEncoderCell(nn.Cell):
                                 out_channels=hidden_size,
                                 initializer_range=initializer_range,
                                 dropout_prob=hidden_dropout_prob,
-                                 compute_type=compute_type)
+                                 compute_type=compute_type,
+                                 enable_fused_layernorm=enable_fused_layernorm)

    def construct(self, hidden_states, attention_mask):
        # self-attention
@ -744,7 +759,8 @@ class BertTransformer(nn.Cell):
                 use_relative_positions=False,
                 hidden_act="gelu",
                 compute_type=mstype.float32,
-                 return_all_encoders=False):
+                 return_all_encoders=False,
+                 enable_fused_layernorm=False):
        super(BertTransformer, self).__init__()
        self.return_all_encoders = return_all_encoders

@ -761,7 +777,8 @@ class BertTransformer(nn.Cell):
                                    hidden_dropout_prob=hidden_dropout_prob,
                                    use_relative_positions=use_relative_positions,
                                    hidden_act=hidden_act,
-                                    compute_type=compute_type)
+                                    compute_type=compute_type,
+                                    enable_fused_layernorm=enable_fused_layernorm)
            layers.append(layer)

        self.layers = nn.CellList(layers)
@ -888,7 +905,8 @@ class BertModel(nn.Cell):
            use_relative_positions=config.use_relative_positions,
            hidden_act=config.hidden_act,
            compute_type=config.compute_type,
-            return_all_encoders=True)
+            return_all_encoders=True,
+            enable_fused_layernorm=config.enable_fused_layernorm)

        self.cast = P.Cast()
        self.dtype = config.dtype
--- a/model_zoo/bert/src/cluener_evaluation.py
+++ b/model_zoo/bert/src/cluener_evaluation.py
@ -17,12 +17,12 @@

 import json
 import numpy as np
-from evaluation_config import cfg
 import mindspore.common.dtype as mstype
 from mindspore.common.tensor import Tensor
-from CRF import postprocess
 import tokenization
 from sample_process import label_generation, process_one_example_p
+from .evaluation_config import cfg
+from .CRF import postprocess

 vocab_file = "./vocab.txt"
 tokenizer_ = tokenization.FullTokenizer(vocab_file=vocab_file)
--- a/model_zoo/bert/src/config.py
+++ b/model_zoo/bert/src/config.py
@ -17,16 +17,16 @@ network config setting, will be used in dataset.py, run_pretrain.py
 """
 from easydict import EasyDict as edict
 import mindspore.common.dtype as mstype
-from mindspore.model_zoo.Bert_NEZHA import BertConfig
+from .bert_model import BertConfig
 cfg = edict({
    'bert_network': 'base',
-    'loss_scale_value': 2**32,
+    'loss_scale_value': 65536,
    'scale_factor': 2,
    'scale_window': 1000,
    'optimizer': 'Lamb',
    'AdamWeightDecayDynamicLR': edict({
        'learning_rate': 3e-5,
-        'end_learning_rate': 1e-7,
+        'end_learning_rate': 1e-10,
        'power': 5.0,
        'weight_decay': 1e-5,
        'eps': 1e-6,
@ -34,7 +34,7 @@ cfg = edict({
    }),
    'Lamb': edict({
        'start_learning_rate': 3e-5,
-        'end_learning_rate': 1e-7,
+        'end_learning_rate': 1e-10,
        'power': 10.0,
        'warmup_steps': 10000,
        'weight_decay': 0.01,
@ -56,7 +56,7 @@ if cfg.bert_network == 'base':
    bert_net_cfg = BertConfig(
        batch_size=32,
        seq_length=128,
-        vocab_size=21128,
+        vocab_size=21136,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
@ -71,13 +71,13 @@ if cfg.bert_network == 'base':
        input_mask_from_dataset=True,
        token_type_ids_from_dataset=True,
        dtype=mstype.float32,
-        compute_type=mstype.float16,
+        compute_type=mstype.float16
    )
 if cfg.bert_network == 'nezha':
    bert_net_cfg = BertConfig(
        batch_size=32,
        seq_length=128,
-        vocab_size=21128,
+        vocab_size=21136,
        hidden_size=1024,
        num_hidden_layers=24,
        num_attention_heads=16,
@ -92,5 +92,27 @@ if cfg.bert_network == 'nezha':
        input_mask_from_dataset=True,
        token_type_ids_from_dataset=True,
        dtype=mstype.float32,
+        compute_type=mstype.float16
+    )
+if cfg.bert_network == 'large':
+    bert_net_cfg = BertConfig(
+        batch_size=16,
+        seq_length=512,
+        vocab_size=30528,
+        hidden_size=1024,
+        num_hidden_layers=24,
+        num_attention_heads=16,
+        intermediate_size=4096,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        use_relative_positions=False,
+        input_mask_from_dataset=True,
+        token_type_ids_from_dataset=True,
+        dtype=mstype.float32,
        compute_type=mstype.float16,
+        enable_fused_layernorm=True
    )
--- a/model_zoo/bert/src/dataset.py
+++ b/model_zoo/bert/src/dataset.py
@ -20,7 +20,7 @@ import mindspore.common.dtype as mstype
 import mindspore.dataset.engine.datasets as de
 import mindspore.dataset.transforms.c_transforms as C
 from mindspore import log as logger
-from config import bert_net_cfg
+from .config import bert_net_cfg


 def create_bert_dataset(epoch_size=1, device_num=1, rank=0, do_shuffle="true", enable_data_sink="true",
@ -31,8 +31,9 @@ def create_bert_dataset(epoch_size=1, device_num=1, rank=0, do_shuffle="true", e
    files = os.listdir(data_dir)
    data_files = []
    for file_name in files:
-        data_files.append(os.path.join(data_dir, file_name))
-    ds = de.TFRecordDataset(data_files, schema_dir,
+        if "tfrecord" in file_name:
+            data_files.append(os.path.join(data_dir, file_name))
+    ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
                            columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
                                          "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
                            shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank,
--- a/model_zoo/bert/src/evaluation_config.py
+++ b/model_zoo/bert/src/evaluation_config.py
@ -19,7 +19,7 @@ config settings, will be used in finetune.py

 from easydict import EasyDict as edict
 import mindspore.common.dtype as mstype
-from mindspore.model_zoo.Bert_NEZHA import BertConfig
+from .bert_model import BertConfig

 cfg = edict({
    'task': 'NER',
--- a/model_zoo/bert/src/finetune_config.py
+++ b/model_zoo/bert/src/finetune_config.py
@ -19,7 +19,7 @@ config settings, will be used in finetune.py

 from easydict import EasyDict as edict
 import mindspore.common.dtype as mstype
-from mindspore.model_zoo.Bert_NEZHA import BertConfig
+from .bert_model import BertConfig

 cfg = edict({
    'task': 'NER',
--- a/model_zoo/bert/src/fused_layer_norm.py
+++ b/model_zoo/bert/src/fused_layer_norm.py
@ -0,0 +1,121 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""fused layernorm"""
+from mindspore.ops import operations as P
+from mindspore.ops import functional as F
+from mindspore.common.parameter import Parameter
+from mindspore.common.initializer import initializer
+from mindspore.ops.primitive import constexpr
+import mindspore.common.dtype as mstype
+from mindspore.nn.cell import Cell
+
+import numpy as np
+
+
+__all__ = ['FusedLayerNorm']
+
+@constexpr
+def get_shape_for_norm(x_shape, begin_norm_axis):
+    """Return the 4-D shape `(1, -1, 1, N)` used to reshape the input for normalization.
+
+    `N` is the product of the entries of `x_shape` from `begin_norm_axis` onwards.
+    The input shape and the resulting shape are printed each time this is evaluated.
+    """
+    print("input_shape: ", x_shape)
+    normalized_dims = x_shape[begin_norm_axis:]
+    flat_size = int(np.prod(normalized_dims))
+    output_shape = (1, -1, 1, flat_size)
+    print("output_shape: ", output_shape)
+    return output_shape
+
+class FusedLayerNorm(Cell):
+    r"""
+    Applies Layer Normalization over a mini-batch of inputs.
+
+    Layer normalization is widely used in recurrent neural networks. It applies
+    normalization over a mini-batch of inputs for each single training case as described
+    in the paper `Layer Normalization <https://arxiv.org/pdf/1607.06450.pdf>`_. Unlike batch
+    normalization, layer normalization performs exactly the same computation at training and
+    testing times. It can be described using the following formula. It is applied across all channels
+    and pixel but only one batch size.
+
+    .. math::
+        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
+
+    Args:
+        normalized_shape (Union[tuple[int], list[int]]): The normalization is performed over axis
+            `begin_norm_axis ... R - 1`.
+        begin_norm_axis (int): The first normalization dimension: normalization will be performed along dimensions
+            `begin_norm_axis: rank(inputs)`, the value should be in [-1, rank(input)). Default: -1.
+        begin_params_axis (int): The first parameter (beta, gamma) dimension: scale and centering parameters
+            will have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with
+            the normalized inputs accordingly, the value should be in [-1, rank(input)). Default: -1.
+        gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
+            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
+            'he_uniform', etc. Default: 'ones'.
+        beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
+            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
+            'he_uniform', etc. Default: 'zeros'.
+        use_batch_norm (bool): Whether to normalize with `P.BatchNorm` instead of `P.LayerNorm`.
+            The batch-norm path is only taken while the cell is in training mode. Default: False.
+
+    Raises:
+        TypeError: If `normalized_shape` is neither a tuple nor a list.
+
+    Inputs:
+        - **input_x** (Tensor) - The shape of 'input_x' is :math:`(x_1, x_2, ..., x_R)`,
+          and `input_shape[begin_norm_axis:]` is equal to `normalized_shape`.
+
+    Outputs:
+        Tensor, the normalized and scaled offset tensor, has the same shape and data type as the `input_x`.
+
+    Examples:
+        >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32)
+        >>> shape1 = x.shape()[1:]
+        >>> m = FusedLayerNorm(shape1, begin_norm_axis=1, begin_params_axis=1)
+        >>> m(x)
+    """
+    def __init__(self,
+                 normalized_shape,
+                 begin_norm_axis=-1,
+                 begin_params_axis=-1,
+                 gamma_init='ones',
+                 beta_init='zeros',
+                 use_batch_norm=False):
+        super(FusedLayerNorm, self).__init__()
+        if not isinstance(normalized_shape, (tuple, list)):
+            raise TypeError("The type of 'normalized_shape' should be tuple[int] or list[int], but '{}' type is {}."
+                            .format(normalized_shape, type(normalized_shape)))
+        self.normalized_shape = normalized_shape
+        self.begin_norm_axis = begin_norm_axis
+        self.begin_params_axis = begin_params_axis
+        self.gamma = Parameter(initializer(
+            gamma_init, normalized_shape), name="gamma")
+        self.beta = Parameter(initializer(
+            beta_init, normalized_shape), name="beta")
+        self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis)
+
+        # Both primitives are always created; `construct` picks one per call.
+        self.batch_norm = P.BatchNorm(is_training=True, epsilon=1e-5)
+        self.use_batch_norm = use_batch_norm
+
+    def construct(self, input_x):
+        """Normalize `input_x` and apply the gamma scale and beta offset."""
+        if self.use_batch_norm and self.training:
+            # BatchNorm receives a constant scale of 1 and offset of 0, so it only
+            # normalizes; gamma and beta are applied explicitly afterwards.
+            ones = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 1.0)
+            zeros = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 0.0)
+            shape_x = F.shape(input_x)
+            norm_shape = get_shape_for_norm(shape_x, self.begin_norm_axis)
+            input_x = F.reshape(input_x, norm_shape)
+            output, _, _, _, _, _ = self.batch_norm(input_x, ones, zeros, None, None)
+            # Restore the caller's original shape before scaling and shifting.
+            output = F.reshape(output, shape_x)
+            y = output * self.gamma + self.beta
+        else:
+            y, _, _ = self.layer_norm(input_x, self.gamma, self.beta)
+        return y
+
+    def extend_repr(self):
+        """Display instance object as string."""
+        s = 'normalized_shape={}, begin_norm_axis={}, begin_params_axis={}, gamma={}, beta={}'.format(
+            self.normalized_shape, self.begin_norm_axis, self.begin_params_axis, self.gamma, self.beta)
+        return s
--- a/model_zoo/bert/src/sample_process.py
+++ b/model_zoo/bert/src/sample_process.py
--- a/model_zoo/bert/src/utils.py
+++ b/model_zoo/bert/src/utils.py
@ -30,8 +30,8 @@ from mindspore.train.parallel_utils import ParallelMode
 from mindspore.communication.management import get_group_size
 from mindspore import context
 from mindspore.model_zoo.Bert_NEZHA.bert_model import BertModel
-from mindspore.model_zoo.Bert_NEZHA.bert_for_pre_training import clip_grad
-from CRF import CRF
+from .bert_for_pre_training import clip_grad
+from .CRF import CRF

 GRADIENT_CLIP_TYPE = 1
 GRADIENT_CLIP_VALUE = 1.0
--- a/tests/st/networks/models/bert/bert_tdt_lossscale.py
+++ b/tests/st/networks/models/bert/bert_tdt_lossscale.py
@ -25,7 +25,8 @@ import mindspore.dataset.transforms.c_transforms as C
 from mindspore import context
 from mindspore import log as logger
 from mindspore.common.tensor import Tensor
-from mindspore.model_zoo.Bert_NEZHA import BertConfig, BertNetworkWithLoss, BertTrainOneStepWithLossScaleCell
+from src.bert_model import BertConfig
+from src.bert_for_pre_training import BertNetworkWithLoss, BertTrainOneStepWithLossScaleCell
 from mindspore.nn.optim import Lamb
 from mindspore.train.callback import Callback
 from mindspore.train.loss_scale_manager import DynamicLossScaleManager
@ -77,7 +78,8 @@ def get_config(version='base', batch_size=1):
            input_mask_from_dataset=True,
            token_type_ids_from_dataset=True,
            dtype=mstype.float32,
-            compute_type=mstype.float16)
+            compute_type=mstype.float16,
+            enable_fused_layernorm=False)
    else:
        bert_config = BertConfig(batch_size=batch_size)
    return bert_config
--- a/tests/st/networks/models/bert/src/CRF.py
+++ b/tests/st/networks/models/bert/src/CRF.py
@ -0,0 +1,177 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+'''
+CRF script.
+'''
+
+import numpy as np
+import mindspore.nn as nn
+from mindspore.ops import operations as P
+from mindspore.common.tensor import Tensor
+from mindspore.common.parameter import Parameter
+import mindspore.common.dtype as mstype
+
+class CRF(nn.Cell):
+    '''
+    Conditional Random Field
+    Args:
+        tag_to_index: The dict for tag to index mapping with extra "<START>" and "<STOP>" sign.
+        batch_size: Batch size, i.e., the length of the first dimension.
+        seq_length: Sequence length, i.e., the length of the second dimension.
+        is_training: Specifies whether to use training mode.
+    Returns:
+        Training mode: Tensor, total loss.
+        Evaluation mode: Tuple, the index for each step with the highest score; Tuple, the index for the last
+        step with the highest score.
+    '''
+    def __init__(self, tag_to_index, batch_size=1, seq_length=128, is_training=True):
+
+        super(CRF, self).__init__()
+        # The number of tags includes the two extra "<START>" / "<STOP>" entries of tag_to_index.
+        self.target_size = len(tag_to_index)
+        self.is_training = is_training
+        self.tag_to_index = tag_to_index
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.START_TAG = "<START>"
+        self.STOP_TAG = "<STOP>"
+        # NOTE(review): these hard-code "<START>" / "<STOP>" as the last two indices, while the transition
+        # matrix below looks the indices up in tag_to_index -- confirm both always agree.
+        self.START_VALUE = Tensor(self.target_size-2, dtype=mstype.int32)
+        self.STOP_VALUE = Tensor(self.target_size-1, dtype=mstype.int32)
+        # Randomly initialised transition scores; the "<START>" row and the "<STOP>" column are pinned
+        # to -10000 so that those entries contribute (almost) nothing after exponentiation.
+        transitions = np.random.normal(size=(self.target_size, self.target_size)).astype(np.float32)
+        transitions[tag_to_index[self.START_TAG], :] = -10000
+        transitions[:, tag_to_index[self.STOP_TAG]] = -10000
+        self.transitions = Parameter(Tensor(transitions), name="transition_matrix")
+        self.cat = P.Concat(axis=-1)
+        self.argmax = P.ArgMaxWithValue(axis=-1)
+        self.log = P.Log()
+        self.exp = P.Exp()
+        # Two sum reducers: self.sum drops the reduced axes, self.reduce_sum keeps them.
+        self.sum = P.ReduceSum()
+        self.tile = P.Tile()
+        self.reduce_sum = P.ReduceSum(keep_dims=True)
+        self.reshape = P.Reshape()
+        self.expand = P.ExpandDims()
+        self.mean = P.ReduceMean()
+        # Initial forward scores: -10000 everywhere except 0 at the "<START>" tag.
+        init_alphas = np.ones(shape=(self.batch_size, self.target_size)) * -10000.0
+        init_alphas[:, self.tag_to_index[self.START_TAG]] = 0.
+        self.init_alphas = Tensor(init_alphas, dtype=mstype.float32)
+        self.cast = P.Cast()
+        self.reduce_max = P.ReduceMax(keep_dims=True)
+        # Values used by the one-hot encoding of labels in _realpath_score.
+        self.on_value = Tensor(1.0, dtype=mstype.float32)
+        self.off_value = Tensor(0.0, dtype=mstype.float32)
+        self.onehot = P.OneHot()
+
+    def log_sum_exp(self, logits):
+        '''
+        Compute the log_sum_exp score for normalization factor.
+
+        The reduction runs over the last axis and keeps that axis. The maximum is subtracted before
+        exponentiating and added back afterwards.
+        '''
+        # Original author note: "16 5 5" (presumably an example shape -- TODO confirm).
+        max_score = self.reduce_max(logits, -1)  #16 5 5
+        score = self.log(self.reduce_sum(self.exp(logits - max_score), -1))
+        score = max_score + score
+        return score
+
+    def _realpath_score(self, features, label):
+        '''
+        Compute the emission and transition score for the real path.
+
+        Returns the score reshaped to (batch_size, -1).
+        '''
+        # NOTE(review): multiplying by 1 looks like a way to obtain a new tensor from the input before
+        # it is concatenated -- confirm why this is required.
+        label = label * 1
+        # Build a (batch_size, 1) column filled with START_VALUE and prepend it to the labels.
+        concat_A = self.tile(self.reshape(self.START_VALUE, (1,)), (self.batch_size,))
+        concat_A = self.reshape(concat_A, (self.batch_size, 1))
+        labels = self.cat((concat_A, label))
+        # Emission part: keep only the feature entries selected by the one-hot gold labels.
+        onehot_label = self.onehot(label, self.target_size, self.on_value, self.off_value)
+        emits = features * onehot_label
+        # Transition part: pair each one-hot tag (label1) with the one-hot tag before it (label2); their
+        # broadcast product masks the transition matrix at exactly the gold transitions.
+        labels = self.onehot(labels, self.target_size, self.on_value, self.off_value)
+        label1 = labels[:, 1:, :]
+        label2 = labels[:, :self.seq_length, :]
+        label1 = self.expand(label1, 3)
+        label2 = self.expand(label2, 2)
+        label_trans = label1 * label2
+        transitions = self.expand(self.expand(self.transitions, 0), 0)
+        trans = transitions * label_trans
+        score = self.sum(emits, (1, 2)) + self.sum(trans, (1, 2, 3))
+        # Add the score taken from the last row of the transition matrix (index target_size-1).
+        # NOTE(review): `labels` carries the prepended start column, so if `label` has seq_length columns
+        # the slice at seq_length-1 is not the final tag -- verify that this index is intended.
+        stop_value_index = labels[:, (self.seq_length-1):self.seq_length, :]
+        stop_value = self.transitions[(self.target_size-1):self.target_size, :]
+        stop_score = stop_value * self.reshape(stop_value_index, (self.batch_size, self.target_size))
+        score = score + self.sum(stop_score, 1)
+        score = self.reshape(score, (self.batch_size, -1))
+        return score
+
+    def _normalization_factor(self, features):
+        '''
+        Compute the total score for all the paths.
+
+        Returns the score reshaped to (batch_size, -1).
+        '''
+        forward_var = self.init_alphas
+        forward_var = self.expand(forward_var, 1)
+        # The loop is unrolled over a fixed seq_length; each step folds one time step into forward_var.
+        for idx in range(self.seq_length):
+            feat = features[:, idx:(idx+1), :]
+            emit_score = self.reshape(feat, (self.batch_size, self.target_size, 1))
+            # Broadcast sum of the emission, transition and previous forward scores.
+            next_tag_var = emit_score + self.transitions + forward_var
+            forward_var = self.log_sum_exp(next_tag_var)
+            forward_var = self.reshape(forward_var, (self.batch_size, 1, self.target_size))
+        # Add the last row of the transition matrix before the final log-sum-exp.
+        terminal_var = forward_var + self.reshape(self.transitions[(self.target_size-1):self.target_size, :], (1, -1))
+        alpha = self.log_sum_exp(terminal_var)
+        alpha = self.reshape(alpha, (self.batch_size, -1))
+        return alpha
+
+    def _decoder(self, features):
+        '''
+        Viterbi decode for evaluation.
+
+        Returns a tuple holding one 1-element tuple of argmax indices per step, and the argmax
+        indices of the terminal scores.
+        '''
+        backpointers = ()
+        forward_var = self.init_alphas
+        for idx in range(self.seq_length):
+            feat = features[:, idx:(idx+1), :]
+            feat = self.reshape(feat, (self.batch_size, self.target_size))
+            bptrs_t = ()
+
+            # Keep both the best index (stored as a back pointer) and its score for every tag.
+            next_tag_var = self.expand(forward_var, 1) + self.transitions
+            best_tag_id, best_tag_value = self.argmax(next_tag_var)
+            bptrs_t += (best_tag_id,)
+            forward_var = best_tag_value + feat
+
+            # Each step is wrapped in its own tuple; postprocess() unwraps it with [0].
+            backpointers += (bptrs_t,)
+        terminal_var = forward_var + self.reshape(self.transitions[(self.target_size-1):self.target_size, :], (1, -1))
+        best_tag_id, _ = self.argmax(terminal_var)
+        return backpointers, best_tag_id
+
+    def construct(self, features, label):
+        # Training: mean over the batch of (score of all paths - score of the labelled path).
+        # Evaluation: the back pointers and final best tags from _decoder; `label` is not used.
+        if self.is_training:
+            forward_score = self._normalization_factor(features)
+            gold_score = self._realpath_score(features, label)
+            return_value = self.mean(forward_score - gold_score)
+        else:
+            path_list, tag = self._decoder(features)
+            return_value = path_list, tag
+        return return_value
+
+def postprocess(backpointers, best_tag_id):
+    '''
+    Do postprocess
+    '''
+    best_tag_id = best_tag_id.asnumpy()
+    batch_size = len(best_tag_id)
+    best_path = []
+    for i in range(batch_size):
+        best_path.append([])
+        best_local_id = best_tag_id[i]
+        best_path[-1].append(best_local_id)
+        for bptrs_t in reversed(backpointers):
+            bptrs_t = bptrs_t[0].asnumpy()
+            local_idx = bptrs_t[i]
+            best_local_id = local_idx[best_local_id]
+            best_path[-1].append(best_local_id)
+        # Pop off the start tag (we dont want to return that to the caller)
+        best_path[-1].pop()
+        best_path[-1].reverse()
+    return best_path
--- a/tests/st/networks/models/bert/src/__init__.py
+++ b/tests/st/networks/models/bert/src/__init__.py
@ -0,0 +1,31 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Bert Init."""
+from .bert_for_pre_training import BertNetworkWithLoss, BertPreTraining, \
+    BertPretrainingLoss, GetMaskedLMOutput, GetNextSentenceOutput, \
+    BertTrainOneStepCell, BertTrainOneStepWithLossScaleCell
+from .bert_model import BertAttention, BertConfig, BertEncoderCell, BertModel, \
+    BertOutput, BertSelfAttention, BertTransformer, EmbeddingLookup, \
+    EmbeddingPostprocessor, RelaPosEmbeddingsGenerator, RelaPosMatrixGenerator, \
+    SaturateCast, CreateAttentionMaskFromInputMask
+
+# Public API of the package: every name imported above from bert_for_pre_training and bert_model.
+__all__ = [
+    "BertNetworkWithLoss", "BertPreTraining", "BertPretrainingLoss",
+    "GetMaskedLMOutput", "GetNextSentenceOutput", "BertTrainOneStepCell", "BertTrainOneStepWithLossScaleCell",
+    "BertAttention", "BertConfig", "BertEncoderCell", "BertModel", "BertOutput",
+    "BertSelfAttention", "BertTransformer", "EmbeddingLookup",
+    "EmbeddingPostprocessor", "RelaPosEmbeddingsGenerator",
+    "RelaPosMatrixGenerator", "SaturateCast", "CreateAttentionMaskFromInputMask"
+]
--- a/tests/st/networks/models/bert/src/bert_for_pre_training.py
+++ b/tests/st/networks/models/bert/src/bert_for_pre_training.py
--- a/tests/st/networks/models/bert/src/bert_model.py
+++ b/tests/st/networks/models/bert/src/bert_model.py
--- a/tests/st/networks/models/bert/src/cluener_evaluation.py
+++ b/tests/st/networks/models/bert/src/cluener_evaluation.py
@ -0,0 +1,73 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+'''bert clue evaluation'''
+
+import json
+import numpy as np
+import mindspore.common.dtype as mstype
+from mindspore.common.tensor import Tensor
+import tokenization
+from sample_process import label_generation, process_one_example_p
+from .evaluation_config import cfg
+from .CRF import postprocess
+
+# Vocabulary path, relative to the current working directory of the running script.
+vocab_file = "./vocab.txt"
+# Module-level tokenizer created at import time and reused by process().
+# NOTE(review): presumably FullTokenizer loads the vocabulary here, so importing this module
+# would require ./vocab.txt to exist -- confirm.
+tokenizer_ = tokenization.FullTokenizer(vocab_file=vocab_file)
+
+def process(model, text, sequence_length):
+    """
+    process text.
+    """
+    data = [text]
+    features = []
+    res = []
+    ids = []
+    for i in data:
+        feature = process_one_example_p(tokenizer_, i, max_seq_len=sequence_length)
+        features.append(feature)
+        input_ids, input_mask, token_type_id = feature
+        input_ids = Tensor(np.array(input_ids), mstype.int32)
+        input_mask = Tensor(np.array(input_mask), mstype.int32)
+        token_type_id = Tensor(np.array(token_type_id), mstype.int32)
+        if cfg.use_crf:
+            backpointers, best_tag_id = model.predict(input_ids, input_mask, token_type_id, Tensor(1))
+            best_path = postprocess(backpointers, best_tag_id)
+            logits = []
+            for ele in best_path:
+                logits.extend(ele)
+            ids = logits
+        else:
+            logits = model.predict(input_ids, input_mask, token_type_id, Tensor(1))
+            ids = logits.asnumpy()
+            ids = np.argmax(ids, axis=-1)
+            ids = list(ids)
+    res = label_generation(text, ids)
+    return res
+
+def submit(model, path, sequence_length):
+    """
+    submit task
+    """
+    data = []
+    for line in open(path):
+        if not line.strip():
+            continue
+        oneline = json.loads(line.strip())
+        res = process(model, oneline["text"], sequence_length)
+        print("text", oneline["text"])
+        print("res:", res)
+        data.append(json.dumps({"label": res}, ensure_ascii=False))
+    open("ner_predict.json", "w").write("\n".join(data))
--- a/Show More
+++ b/Show More