!1398 Update the bert scripts according to rules of modelzoo
Merge pull request !1398 from chenhaozhe/update_bert_script
pull/1398/MERGE
commit
b46ad9a1bb
@ -0,0 +1,121 @@
|
||||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""fused layernorm"""
|
||||
from mindspore.ops import operations as P
|
||||
from mindspore.ops import functional as F
|
||||
from mindspore.common.parameter import Parameter
|
||||
from mindspore.common.initializer import initializer
|
||||
from mindspore.ops.primitive import constexpr
|
||||
import mindspore.common.dtype as mstype
|
||||
from mindspore.nn.cell import Cell
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
# Public API of this module: only the FusedLayerNorm cell is exported.
__all__ = ['FusedLayerNorm']
|
||||
|
||||
@constexpr
def get_shape_for_norm(x_shape, begin_norm_axis):
    """
    Build the 4-D reshape target used by the batch-norm based normalization path.

    Args:
        x_shape (tuple[int]): Shape of the input tensor.
        begin_norm_axis (int): First axis to normalize; every axis from this one on is
            folded into the last dimension of the result.

    Returns:
        tuple[int], `(1, -1, 1, K)` where K is the product of `x_shape[begin_norm_axis:]`.
    """
    print("input_shape: ", x_shape)
    # Number of elements covered by the normalized axes.
    flattened_size = int(np.prod(x_shape[begin_norm_axis:]))
    output_shape = (1, -1, 1, flattened_size)
    print("output_shape: ", output_shape)
    return output_shape
|
||||
|
||||
class FusedLayerNorm(Cell):
    r"""
    Applies Layer Normalization over a mini-batch of inputs.

    Layer normalization is widely used in recurrent neural networks. It applies
    normalization over a mini-batch of inputs for each single training case as described
    in the paper `Layer Normalization <https://arxiv.org/pdf/1607.06450.pdf>`_. Unlike batch
    normalization, layer normalization performs exactly the same computation at training and
    testing times. It can be described using the following formula. It is applied across all channels
    and pixel but only one batch size.

    .. math::
        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    Args:
        normalized_shape (Union[tuple[int], list[int]]): The normalization is performed over axis
            `begin_norm_axis ... R - 1`.
        begin_norm_axis (int): The first normalization dimension: normalization will be performed along dimensions
            `begin_norm_axis: rank(inputs)`, the value should be in [-1, rank(input)). Default: -1.
        begin_params_axis (int): The first parameter(beta, gamma) dimension: scale and centering parameters
            will have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with
            the normalized inputs accordingly, the value should be in [-1, rank(input)). Default: -1.
        gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'ones'.
        beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'zeros'.
        use_batch_norm (bool): Whether to compute the normalization with the BatchNorm operator
            while the cell is in training mode. Default: False.

    Inputs:
        - **input_x** (Tensor) - The shape of 'input_x' is :math:`(x_1, x_2, ..., x_R)`,
          and `input_shape[begin_norm_axis:]` is equal to `normalized_shape`.

    Outputs:
        Tensor, the normalized and scaled offset tensor, has the same shape and data type as the `input_x`.

    Examples:
        >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32)
        >>> shape1 = x.shape()[1:]
        >>> m = FusedLayerNorm(shape1, begin_norm_axis=1, begin_params_axis=1)
        >>> m(x)
    """
    def __init__(self,
                 normalized_shape,
                 begin_norm_axis=-1,
                 begin_params_axis=-1,
                 gamma_init='ones',
                 beta_init='zeros',
                 use_batch_norm=False):
        super(FusedLayerNorm, self).__init__()
        if not isinstance(normalized_shape, (tuple, list)):
            raise TypeError("The type of 'normalized_shape' should be tuple[int] or list[int], but '{}' type is {}."
                            .format(normalized_shape, type(normalized_shape)))
        self.normalized_shape = normalized_shape
        self.begin_norm_axis = begin_norm_axis
        self.begin_params_axis = begin_params_axis
        self.gamma = Parameter(initializer(
            gamma_init, normalized_shape), name="gamma")
        self.beta = Parameter(initializer(
            beta_init, normalized_shape), name="beta")
        self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis)

        # Both operators are always created; construct() picks one of them per call.
        self.batch_norm = P.BatchNorm(is_training=True, epsilon=1e-5)
        self.use_batch_norm = use_batch_norm

    def construct(self, input_x):
        """Normalize `input_x` and apply the learned gamma / beta."""
        if self.use_batch_norm and self.training:
            # Query the shape once; it is needed for the scale/bias shape and for both reshapes.
            shape_x = F.shape(input_x)
            # Constant scale (1.0) and bias (0.0) handed to BatchNorm; gamma / beta are applied
            # explicitly after the result is reshaped back.
            ones = P.Fill()(mstype.float32, shape_x[:self.begin_norm_axis], 1.0)
            zeros = P.Fill()(mstype.float32, shape_x[:self.begin_norm_axis], 0.0)
            # Fold the input into the 4-D layout (1, -1, 1, K) computed by get_shape_for_norm.
            norm_shape = get_shape_for_norm(shape_x, self.begin_norm_axis)
            input_x = F.reshape(input_x, norm_shape)
            output, _, _, _, _, _ = self.batch_norm(input_x, ones, zeros, None, None)
            output = F.reshape(output, shape_x)
            y = output * self.gamma + self.beta
        else:
            y, _, _ = self.layer_norm(input_x, self.gamma, self.beta)
        return y

    def extend_repr(self):
        """Display instance object as string."""
        # The gamma field previously lacked '=', unlike every other field in this string.
        s = 'normalized_shape={}, begin_norm_axis={}, begin_params_axis={}, gamma={}, beta={}'.format(
            self.normalized_shape, self.begin_norm_axis, self.begin_params_axis, self.gamma, self.beta)
        return s
|
@ -0,0 +1,177 @@
|
||||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
'''
|
||||
CRF script.
|
||||
'''
|
||||
|
||||
import numpy as np
|
||||
import mindspore.nn as nn
|
||||
from mindspore.ops import operations as P
|
||||
from mindspore.common.tensor import Tensor
|
||||
from mindspore.common.parameter import Parameter
|
||||
import mindspore.common.dtype as mstype
|
||||
|
||||
class CRF(nn.Cell):
    '''
    Conditional Random Field

    In training mode the cell returns the mean over the batch of
    (score of all paths - score of the labelled path); otherwise it runs a
    Viterbi-style decode and returns the backpointers together with the best final tag.

    Args:
        tag_to_index: The dict for tag to index mapping with extra "<START>" and "<STOP>" sign.
        batch_size: Batch size, i.e., the length of the first dimension.
        seq_length: Sequence length, i.e., the length of the second dimension.
        is_training: Specifies whether to use training mode.

    Returns:
        Training mode: Tensor, total loss.
        Evaluation mode: Tuple, the index for each step with the highest score; Tuple, the index for the last
        step with the highest score.
    '''
    def __init__(self, tag_to_index, batch_size=1, seq_length=128, is_training=True):

        super(CRF, self).__init__()
        self.target_size = len(tag_to_index)
        self.is_training = is_training
        self.tag_to_index = tag_to_index
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.START_TAG = "<START>"
        self.STOP_TAG = "<STOP>"
        # NOTE(review): these hard-code "<START>"/"<STOP>" as the last two tag indices, while the
        # masking of `transitions` and `init_alphas` below looks them up in tag_to_index by name.
        # Confirm the mapping always places them at target_size-2 / target_size-1.
        self.START_VALUE = Tensor(self.target_size-2, dtype=mstype.int32)
        self.STOP_VALUE = Tensor(self.target_size-1, dtype=mstype.int32)
        # Randomly initialised (target_size, target_size) transition scores.
        transitions = np.random.normal(size=(self.target_size, self.target_size)).astype(np.float32)
        # The START row and the STOP column get a large negative score - presumably so that
        # moving into START or out of STOP is effectively never chosen (the decode step reduces
        # over the last axis, which lines up with the previous tag).
        transitions[tag_to_index[self.START_TAG], :] = -10000
        transitions[:, tag_to_index[self.STOP_TAG]] = -10000
        self.transitions = Parameter(Tensor(transitions), name="transition_matrix")
        self.cat = P.Concat(axis=-1)
        self.argmax = P.ArgMaxWithValue(axis=-1)
        self.log = P.Log()
        self.exp = P.Exp()
        self.sum = P.ReduceSum()
        self.tile = P.Tile()
        self.reduce_sum = P.ReduceSum(keep_dims=True)
        self.reshape = P.Reshape()
        self.expand = P.ExpandDims()
        self.mean = P.ReduceMean()
        # Initial forward scores of shape (batch_size, target_size): 0 for START, -10000 elsewhere.
        init_alphas = np.ones(shape=(self.batch_size, self.target_size)) * -10000.0
        init_alphas[:, self.tag_to_index[self.START_TAG]] = 0.
        self.init_alphas = Tensor(init_alphas, dtype=mstype.float32)
        self.cast = P.Cast()
        self.reduce_max = P.ReduceMax(keep_dims=True)
        self.on_value = Tensor(1.0, dtype=mstype.float32)
        self.off_value = Tensor(0.0, dtype=mstype.float32)
        self.onehot = P.OneHot()

    def log_sum_exp(self, logits):
        '''
        Compute the log_sum_exp score for normalization factor.

        The reduction runs over the last axis and keeps that axis (both reduce ops are
        created with keep_dims=True). The maximum is subtracted before exponentiating
        and added back afterwards.
        '''
        max_score = self.reduce_max(logits, -1) # original note: "16 5 5" - presumably an example shape
        score = self.log(self.reduce_sum(self.exp(logits - max_score), -1))
        score = max_score + score
        return score

    def _realpath_score(self, features, label):
        '''
        Compute the emission and transition score for the real path.

        Returns a tensor reshaped to (batch_size, -1).
        '''
        # NOTE(review): multiplying by 1 has no arithmetic effect; its purpose is not evident
        # from this file - confirm before removing.
        label = label * 1
        # Column of START indices, shape (batch_size, 1), prepended to the labels along the last axis.
        concat_A = self.tile(self.reshape(self.START_VALUE, (1,)), (self.batch_size,))
        concat_A = self.reshape(concat_A, (self.batch_size, 1))
        labels = self.cat((concat_A, label))
        # One-hot mask of the labelled tags keeps only their emission scores.
        onehot_label = self.onehot(label, self.target_size, self.on_value, self.off_value)
        emits = features * onehot_label
        labels = self.onehot(labels, self.target_size, self.on_value, self.off_value)
        # label1: every position after the prepended START; label2: the first seq_length positions.
        label1 = labels[:, 1:, :]
        label2 = labels[:, :self.seq_length, :]
        # Outer product of the two one-hot sequences selects one transition entry per step.
        label1 = self.expand(label1, 3)
        label2 = self.expand(label2, 2)
        label_trans = label1 * label2
        transitions = self.expand(self.expand(self.transitions, 0), 0)
        trans = transitions * label_trans
        score = self.sum(emits, (1, 2)) + self.sum(trans, (1, 2, 3))
        # NOTE(review): `labels` carries the prepended START entry, so position seq_length-1 here is
        # not the final label position (that would be seq_length) - confirm this is intended.
        stop_value_index = labels[:, (self.seq_length-1):self.seq_length, :]
        # Row target_size-1 of the transition matrix (the index stored in STOP_VALUE).
        stop_value = self.transitions[(self.target_size-1):self.target_size, :]
        stop_score = stop_value * self.reshape(stop_value_index, (self.batch_size, self.target_size))
        score = score + self.sum(stop_score, 1)
        score = self.reshape(score, (self.batch_size, -1))
        return score

    def _normalization_factor(self, features):
        '''
        Compute the total score for all the paths.

        Returns a tensor reshaped to (batch_size, -1).
        '''
        forward_var = self.init_alphas
        # (batch_size, target_size) -> (batch_size, 1, target_size)
        forward_var = self.expand(forward_var, 1)
        for idx in range(self.seq_length):
            feat = features[:, idx:(idx+1), :]
            # Emission scores as a column, (batch_size, target_size, 1), to be summed
            # with the transition matrix and the running forward scores.
            emit_score = self.reshape(feat, (self.batch_size, self.target_size, 1))
            next_tag_var = emit_score + self.transitions + forward_var
            forward_var = self.log_sum_exp(next_tag_var)
            forward_var = self.reshape(forward_var, (self.batch_size, 1, self.target_size))
        # Add row target_size-1 of the transition matrix before the final reduction.
        terminal_var = forward_var + self.reshape(self.transitions[(self.target_size-1):self.target_size, :], (1, -1))
        alpha = self.log_sum_exp(terminal_var)
        alpha = self.reshape(alpha, (self.batch_size, -1))
        return alpha

    def _decoder(self, features):
        '''
        Viterbi decode for evaluation.

        Returns the tuple of per-step backpointers (each entry is a 1-tuple holding the
        argmax index tensor of that step) and the best tag index after the last step.
        '''
        backpointers = ()
        forward_var = self.init_alphas
        for idx in range(self.seq_length):
            feat = features[:, idx:(idx+1), :]
            feat = self.reshape(feat, (self.batch_size, self.target_size))
            bptrs_t = ()

            # Best score, and the index that achieves it, over the last axis for every tag.
            next_tag_var = self.expand(forward_var, 1) + self.transitions
            best_tag_id, best_tag_value = self.argmax(next_tag_var)
            bptrs_t += (best_tag_id,)
            forward_var = best_tag_value + feat

            backpointers += (bptrs_t,)
        terminal_var = forward_var + self.reshape(self.transitions[(self.target_size-1):self.target_size, :], (1, -1))
        best_tag_id, _ = self.argmax(terminal_var)
        return backpointers, best_tag_id

    def construct(self, features, label):
        '''
        Return the mean loss when `is_training` is set, otherwise the decode result
        `(backpointers, best_tag_id)`; `label` is only read in the training branch.
        '''
        if self.is_training:
            forward_score = self._normalization_factor(features)
            gold_score = self._realpath_score(features, label)
            return_value = self.mean(forward_score - gold_score)
        else:
            path_list, tag = self._decoder(features)
            return_value = path_list, tag
        return return_value
|
||||
|
||||
def postprocess(backpointers, best_tag_id):
    '''
    Rebuild the best tag path of every sample from the decoder output.

    Args:
        backpointers: Sequence with one entry per step; `entry[0].asnumpy()` must be indexable
            as `[sample][tag]` and yield the tag chosen at the previous step.
        best_tag_id: Object whose `asnumpy()` gives the best final tag of each sample.

    Returns:
        list[list], one path per sample, ordered from the first step to the last.
    '''
    best_tag_id = best_tag_id.asnumpy()
    # Convert every step exactly once (last step first) instead of once per sample.
    steps = [bptrs_t[0].asnumpy() for bptrs_t in reversed(backpointers)]
    best_path = []
    for i in range(len(best_tag_id)):
        best_local_id = best_tag_id[i]
        path = [best_local_id]
        # Walk backwards, following the pointer stored for the currently selected tag.
        for step in steps:
            best_local_id = step[i][best_local_id]
            path.append(best_local_id)
        # Pop off the start tag (we dont want to return that to the caller)
        path.pop()
        path.reverse()
        best_path.append(path)
    return best_path
|
@ -0,0 +1,31 @@
|
||||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""Bert Init."""
|
||||
from .bert_for_pre_training import BertNetworkWithLoss, BertPreTraining, \
|
||||
BertPretrainingLoss, GetMaskedLMOutput, GetNextSentenceOutput, \
|
||||
BertTrainOneStepCell, BertTrainOneStepWithLossScaleCell
|
||||
from .bert_model import BertAttention, BertConfig, BertEncoderCell, BertModel, \
|
||||
BertOutput, BertSelfAttention, BertTransformer, EmbeddingLookup, \
|
||||
EmbeddingPostprocessor, RelaPosEmbeddingsGenerator, RelaPosMatrixGenerator, \
|
||||
SaturateCast, CreateAttentionMaskFromInputMask
|
||||
|
||||
# Public API of the package: the names imported above from the
# bert_for_pre_training and bert_model submodules.
__all__ = [
    "BertNetworkWithLoss", "BertPreTraining", "BertPretrainingLoss",
    "GetMaskedLMOutput", "GetNextSentenceOutput", "BertTrainOneStepCell", "BertTrainOneStepWithLossScaleCell",
    "BertAttention", "BertConfig", "BertEncoderCell", "BertModel", "BertOutput",
    "BertSelfAttention", "BertTransformer", "EmbeddingLookup",
    "EmbeddingPostprocessor", "RelaPosEmbeddingsGenerator",
    "RelaPosMatrixGenerator", "SaturateCast", "CreateAttentionMaskFromInputMask"
]
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,73 @@
|
||||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
'''bert clue evaluation'''
|
||||
|
||||
import json
|
||||
import numpy as np
|
||||
import mindspore.common.dtype as mstype
|
||||
from mindspore.common.tensor import Tensor
|
||||
import tokenization
|
||||
from sample_process import label_generation, process_one_example_p
|
||||
from .evaluation_config import cfg
|
||||
from .CRF import postprocess
|
||||
|
||||
# Vocabulary path, relative to the current working directory.
vocab_file = "./vocab.txt"
# Module-level tokenizer used by process(); it is built when this module is imported.
tokenizer_ = tokenization.FullTokenizer(vocab_file=vocab_file)
|
||||
|
||||
def process(model, text, sequence_length):
    """
    Run the model on one piece of text and turn the predicted tag ids into labels.

    Args:
        model: Object exposing `predict(input_ids, input_mask, token_type_id, label)`.
        text: The raw text to tag.
        sequence_length: Passed to the example builder as `max_seq_len`.

    Returns:
        The value produced by `label_generation(text, ids)`.
    """
    input_ids, input_mask, token_type_id = process_one_example_p(
        tokenizer_, text, max_seq_len=sequence_length)
    input_ids = Tensor(np.array(input_ids), mstype.int32)
    input_mask = Tensor(np.array(input_mask), mstype.int32)
    token_type_id = Tensor(np.array(token_type_id), mstype.int32)

    # NOTE(review): the fourth predict() argument is a constant Tensor(1) - presumably a
    # placeholder for the label input; confirm against the model definition.
    if cfg.use_crf:
        # CRF head: predict() yields backpointers plus the best last tag; decode them
        # and flatten the per-sample paths into one id list.
        backpointers, best_tag_id = model.predict(input_ids, input_mask, token_type_id, Tensor(1))
        decoded_paths = postprocess(backpointers, best_tag_id)
        ids = [tag for path in decoded_paths for tag in path]
    else:
        # Plain head: take the arg-max over the last axis of the returned logits.
        logits = model.predict(input_ids, input_mask, token_type_id, Tensor(1))
        ids = list(np.argmax(logits.asnumpy(), axis=-1))

    return label_generation(text, ids)
|
||||
|
||||
def submit(model, path, sequence_length):
    """
    Tag every record of a JSON-lines file and write the predictions to "ner_predict.json".

    Each non-blank line of `path` must be a JSON object with a "text" field. One line
    `{"label": ...}` is written per record to "ner_predict.json" in the current working directory.

    Args:
        model: Model object forwarded to `process`.
        path: Path of the input JSON-lines file.
        sequence_length: Sequence length forwarded to `process`.
    """
    data = []
    # Context managers close both files even if a record fails; UTF-8 is set explicitly
    # because the labels are dumped with ensure_ascii=False and may contain non-ASCII text.
    with open(path, encoding="utf-8") as reader:
        for line in reader:
            stripped = line.strip()
            if not stripped:
                continue
            oneline = json.loads(stripped)
            res = process(model, oneline["text"], sequence_length)
            print("text", oneline["text"])
            print("res:", res)
            data.append(json.dumps({"label": res}, ensure_ascii=False))
    with open("ner_predict.json", "w", encoding="utf-8") as writer:
        writer.write("\n".join(data))
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue