@@ -30,7 +30,8 @@ from ..core.datatypes import mstype_to_detype
class Lookup(cde.LookupOp):
    """
    Lookup operator that looks up a word to an id.

    Args:
        vocab(Vocab): a Vocab object.
        unknown(int, optional): default id to lookup a word that is out of vocab. If no argument is passed, 1 will be
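A minimal usage sketch for the Lookup op above. The module path mindspore.dataset.text and the Vocab.from_list constructor are assumptions about how this package is exposed; check the installed release before copying.

    import mindspore.dataset.text as text

    # build a vocabulary and map out-of-vocabulary words to the id of "<unk>"
    vocab = text.Vocab.from_list(["home", "behind", "the", "world", "<unk>"])
    lookup = text.Lookup(vocab, unknown=4)
    # dataset = dataset.map(operations=lookup, input_columns=["text"])  # keyword order varies by release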
@@ -48,21 +49,22 @@ class Lookup(cde.LookupOp):
class Ngram(cde.NgramOp):
    """
    TensorOp to generate n-gram from a 1-D string Tensor.

    Refer to https://en.wikipedia.org/wiki/N-gram#Examples for an overview of what n-gram is and how it works.

    Args:
        n (list of int): n in n-gram, n >= 1. n is a list of positive integers, e.g. n=[4, 3], the result
            would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to
            make up an n-gram, an empty string will be returned. For example, 3-grams on ["mindspore", "best"]
            would result in an empty string being produced.
        left_pad (tuple, optional): ("pad_token", pad_width). Padding performed on the left side of the sequence.
            pad_width will be capped at n-1. left_pad=("_", 2) would pad the left side of the sequence with "__"
            (default=None).
        right_pad (tuple, optional): ("pad_token", pad_width). Padding performed on the right side of the sequence.
            pad_width will be capped at n-1. right_pad=("-", 2) would pad the right side of the sequence with "--"
            (default=None).
        separator (str, optional): symbol used to join strings together, e.g. 2-grams on ["mindspore", "amazing"]
            with separator="-" would produce ["mindspore-amazing"] (default=None, which means whitespace is
            used).
    """
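The padding, separator and not-enough-words rules above are easy to misread, so here is a pure-Python sketch of the documented semantics; it is not the C++ implementation behind cde.NgramOp.

    def ngram(words, n, left_pad=None, right_pad=None, separator=" "):
        # pad_width is capped at n-1 on either side, as the docstring states
        if left_pad:
            words = [left_pad[0]] * min(left_pad[1], n - 1) + list(words)
        if right_pad:
            words = list(words) + [right_pad[0]] * min(right_pad[1], n - 1)
        if len(words) < n:
            return [""]                       # not enough words: a single empty string
        return [separator.join(words[i:i + n]) for i in range(len(words) - n + 1)]

    print(ngram(["mindspore", "best"], 3))                      # ['']
    print(ngram(["mindspore", "amazing"], 2, separator="-"))    # ['mindspore-amazing']
    print(ngram(["a", "b"], 2, left_pad=("_", 2)))              # ['_ a', 'a b']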
@@ -86,11 +88,12 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
    Args:
        hmm_path (str): the dictionary file used by the HMMSegment algorithm;
            the dictionary can be obtained from the official website of cppjieba.
        mp_path (str): the dictionary file used by the MPSegment algorithm;
            the dictionary can be obtained from the official website of cppjieba.
        mode (JiebaMode, optional): "MP" mode will tokenize with the MPSegment algorithm,
            "HMM" mode will tokenize with the Hidden Markov Model Segment algorithm,
            "MIX" mode will tokenize with a mix of the MPSegment and HMMSegment algorithms
            (default="MIX").
    """

    @check_jieba_init
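A hypothetical construction of the tokenizer described above. The dictionary paths are placeholders (both files ship with cppjieba), and the assumption that JiebaMode is importable from mindspore.dataset.text should be verified against the installed release.

    import mindspore.dataset.text as text
    from mindspore.dataset.text import JiebaMode

    HMM_FILE = "/path/to/hmm_model.utf8"    # placeholder path to the cppjieba HMM dictionary
    MP_FILE = "/path/to/jieba.dict.utf8"    # placeholder path to the cppjieba MP dictionary
    jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX)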
@@ -104,13 +107,15 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
    @check_jieba_add_word
    def add_word(self, word, freq=None):
        """
        Add a user defined word to JiebaTokenizer's dictionary.

        Args:
            word (str): The word to be added to the JiebaTokenizer instance.
                The added word will not be written into the built-in dictionary on disk.
            freq (int, optional): The frequency of the word to be added. The higher the frequency,
                the better chance the word will be tokenized (default=None, use default frequency).
        """
        if freq is None:
            super().add_word(word, 0)
        else:
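Continuing the sketch above, add_word only affects the running JiebaTokenizer instance, never the dictionary files on disk:

    jieba_op.add_word("MindSpore")              # use the default frequency
    jieba_op.add_word("深度学习", freq=20000)    # a larger freq makes the word more likely to survive segmentation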
@@ -119,15 +124,20 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
    @check_jieba_add_dict
    def add_dict(self, user_dict):
        """
        Add user defined words to JiebaTokenizer's dictionary.

        Args:
            user_dict (str or dict): Dictionary to be added, file path or Python dictionary.
                Python dict format: {word1: freq1, word2: freq2, ...}.
                Jieba dictionary format: word (required), freq (optional), such as:

                .. code-block::

                    word1 freq1
                    word2
                    word3 freq3
        """
        if isinstance(user_dict, str):
            self.__add_dict_py_file(user_dict)
        elif isinstance(user_dict, dict):
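Both accepted forms of user_dict, continuing the same sketch; the file path is a placeholder for a text file with one "word [freq]" entry per line:

    jieba_op.add_dict({"自然语言处理": 10, "图神经网络": 5})   # Python dict: {word: freq}
    jieba_op.add_dict("/path/to/user_dict.txt")                # Jieba dictionary file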
@@ -190,12 +200,12 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
    """
    Tokenize a scalar token or 1-D tokens to 1-D subword tokens.

    Args:
        vocab (Vocab): a Vocab object.
        suffix_indicator (str, optional): Used to show that the subword is the last part of a word (default='##').
        max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100).
        unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is an empty string,
            return the token directly, else return 'unknown_token' (default='[UNK]').
    """

    def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]'):
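For readers unfamiliar with wordpiece, this pure-Python sketch shows the greedy longest-match-first split the docstring describes, including the unknown_token rule; the real work is done in cde.WordpieceTokenizerOp.

    def wordpiece(token, vocab, suffix_indicator="##", max_bytes_per_token=100, unknown_token="[UNK]"):
        if len(token.encode("utf-8")) > max_bytes_per_token:
            return [token]                       # too long: do not split further
        subwords, start = [], 0
        while start < len(token):
            end = len(token)
            while end > start:                   # try the longest candidate first
                piece = token[start:end]
                if start > 0:
                    piece = suffix_indicator + piece
                if piece in vocab:
                    subwords.append(piece)
                    break
                end -= 1
            else:                                # nothing matched: the whole token is unknown
                return [token] if unknown_token == "" else [unknown_token]
            start = end
        return subwords

    print(wordpiece("unaffable", {"un", "##aff", "##able"}))    # ['un', '##aff', '##able']
    print(wordpiece("wombat", {"un", "##aff", "##able"}))       # ['[UNK]']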
@@ -209,7 +219,7 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
if platform.system().lower() != 'windows':
    class WhitespaceTokenizer(cde.WhitespaceTokenizerOp):
        """
        Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces (such as: ' ', '\\t', '\\r', '\\n').
        """
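A minimal pipeline sketch for the ICU-backed tokenizers in this block, assuming a text file read through TextFileDataset (its column is named "text"); the keyword order of map() has shifted between releases, so treat this as a shape rather than a recipe.

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    dataset = ds.TextFileDataset("corpus.txt")          # placeholder file, one sentence per line
    dataset = dataset.map(operations=text.WhitespaceTokenizer(), input_columns=["text"])
    for row in dataset.create_dict_iterator():
        print(row["text"])                               # a 1-D array of tokens per line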
@@ -218,7 +228,7 @@ if platform.system().lower() != 'windows':
        Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.

        Args:
            keep_whitespace (bool, optional): Whether or not to emit whitespace tokens (default=False).
        """

        def __init__(self, keep_whitespace=False):
@@ -246,9 +256,9 @@ if platform.system().lower() != 'windows':
        Apply normalize operation on UTF-8 string tensor.

        Args:
            normalize_form (NormalizeForm, optional): Valid values are "NONE", "NFC", "NFKC", "NFD", "NFKD".
                If set to "NONE", nothing is done to the input string tensor.
                If set to any of "NFC", "NFKC", "NFD", "NFKD", the normalize operation is applied (default="NFKC").
                See http://unicode.org/reports/tr15/ for details.
        """
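To make the four normalize forms concrete, here is what they do to one string, shown with Python's own unicodedata module rather than the ICU-backed op:

    import unicodedata

    s = "ﬁancée"                                                     # starts with the 'ﬁ' ligature
    print(unicodedata.normalize("NFC", s))                            # canonical composition, ligature kept
    print(unicodedata.normalize("NFKC", s))                           # compatibility composition, 'ﬁ' -> 'fi'
    print([hex(ord(c)) for c in unicodedata.normalize("NFD", s)])     # decomposed, accent split into a combining mark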
@@ -260,13 +270,14 @@ if platform.system().lower() != 'windows':
    class RegexReplace(cde.RegexReplaceOp):
        """
        Replace a UTF-8 string tensor with 'replace' according to regular expression 'pattern'.

        See http://userguide.icu-project.org/strings/regexp for supported regex patterns.

        Args:
            pattern (str): the regex expression pattern.
            replace (str): the string to replace matched elements with.
            replace_all (bool, optional): If False, only replace the first matched element;
                if True, replace all matched elements (default=True).
        """

        def __init__(self, pattern, replace, replace_all=True):
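The replace_all flag behaves like the count argument of Python's re.sub, illustrated below; note the op itself uses ICU regular expressions (see the URL above), not the re module.

    import re

    s = "onetwoonetwoone"
    print(re.sub("two", "_", s))              # 'one_one_one'   -- replace_all=True
    print(re.sub("two", "_", s, count=1))     # 'one_onetwoone' -- replace_all=False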
@@ -279,13 +290,14 @@ if platform.system().lower() != 'windows':
    class RegexTokenizer(cde.RegexTokenizerOp):
        """
        Tokenize a scalar tensor of UTF-8 string by regex expression pattern.

        See http://userguide.icu-project.org/strings/regexp for supported regex patterns.

        Args:
            delim_pattern (str): The pattern of regex delimiters.
                The original string will be split by matched elements.
            keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token
                if it can be matched by 'keep_delim_pattern'. The default value is an empty string ('');
                in this situation, delimiters will not be kept as output tokens.
        """
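A rough picture of how the two patterns interact, using Python's re module purely as an illustration of the semantics (the op itself runs on ICU regex): delimiters split the string, and a delimiter is only emitted as a token when it also matches keep_delim_pattern.

    import re

    def regex_tokenize(s, delim_pattern, keep_delim_pattern=""):
        tokens = []
        for part in re.split("(" + delim_pattern + ")", s):       # capture delimiters so they can be inspected
            if part == "":
                continue
            if re.fullmatch(delim_pattern, part):                 # this piece is a delimiter
                if keep_delim_pattern and re.fullmatch(keep_delim_pattern, part):
                    tokens.append(part)
            else:
                tokens.append(part)
        return tokens

    print(regex_tokenize("Welcome to Beijing!", r"\s+"))             # ['Welcome', 'to', 'Beijing!']
    print(regex_tokenize("Welcome to Beijing!", r"\s+", r"\s+"))     # ['Welcome', ' ', 'to', ' ', 'Beijing!']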
@@ -302,12 +314,12 @@ if platform.system().lower() != 'windows':
        Args:
            lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations
                on the input text to make it lower case and strip accent characters; if False, only apply
                the NormalizeUTF8 ('normalization_form' mode) operation on the input text (default=False).
            keep_whitespace (bool, optional): If True, the whitespace will be kept in the output tokens (default=False).
            normalization_form (NormalizeForm, optional): Used to specify a specific normalize mode,
                only effective when 'lower_case' is False. See NormalizeUTF8 for details (default='NONE').
            preserve_unused_token (bool, optional): If True, do not split special tokens like
                '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
        """

        def __init__(self, lower_case=False, keep_whitespace=False,
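What the lower_case=True path amounts to can be approximated on the host with the standard library: case folding, NFD decomposition, then dropping the combining marks that NFD splits off (the RegexReplace step). This is only an approximation of the documented behaviour, not the op itself.

    import unicodedata

    def lower_and_strip_accents(s):
        s = unicodedata.normalize("NFD", s.casefold())
        return "".join(c for c in s if not unicodedata.combining(c))

    print(lower_and_strip_accents("Crème BRÛLÉE"))    # 'creme brulee'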
@@ -326,18 +338,18 @@ if platform.system().lower() != 'windows':
        Args:
            vocab (Vocab): a Vocab object.
            suffix_indicator (str, optional): Used to show that the subword is the last part of a word (default='##').
            max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100).
            unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is an empty string,
                return the token directly, else return 'unknown_token' (default='[UNK]').
            lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations
                on the input text to make it lower case and strip accent characters; if False, only apply
                the NormalizeUTF8 ('normalization_form' mode) operation on the input text (default=False).
            keep_whitespace (bool, optional): If True, the whitespace will be kept in the output tokens (default=False).
            normalization_form (NormalizeForm, optional): Used to specify a specific normalize mode,
                only effective when 'lower_case' is False. See NormalizeUTF8 for details (default='NONE').
            preserve_unused_token (bool, optional): If True, do not split special tokens like
                '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
        """

        def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100,