!2419 Rectification and modification of dataset api documentation comments

Merge pull request !2419 from qianlong21st/temp
pull/2419/MERGE
mindspore-ci-bot 5 years ago committed by Gitee
commit 886dfe6fd7

@@ -30,7 +30,8 @@ from ..core.datatypes import mstype_to_detype
class Lookup(cde.LookupOp):
"""
Lookup operator that looks up a word to an id.
Args:
vocab(Vocab): a Vocab object.
unknown(int, optional): default id to lookup a word that is out of vocab. If no argument is passed, 1 will be
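
A usage sketch for the Lookup operator documented in this hunk. The `mindspore.dataset.text` import path, the existing dataset `data`, and the column name "text" are assumptions, not shown in this diff; the vocab is built with the `Vocab.from_list` classmethod that appears later in the same change.

    import mindspore.dataset.text as text

    # Build a small vocab; with no special tokens, ids follow list order starting at 0,
    # so "<unk>" gets id 3 and is used for out-of-vocab words.
    vocab = text.Vocab.from_list(["home", "is", "behind", "<unk>"])
    lookup = text.Lookup(vocab, unknown=3)

    # Apply the op to a string column of an existing dataset `data`.
    data = data.map(input_columns=["text"], operations=lookup)
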
@@ -48,21 +49,22 @@ class Lookup(cde.LookupOp):
class Ngram(cde.NgramOp):
"""
-TensorOp to generate n-gram from a 1-D string Tensor
+TensorOp to generate n-gram from a 1-D string Tensor.
Refer to https://en.wikipedia.org/wiki/N-gram#Examples for an overview of what n-gram is and how it works.
Args:
-n([int, list]): n in n-gram, n >= 1. n is a list of positive integers, for e.g. n=[4,3], The result
+n (list of int): n in n-gram, n >= 1. n is a list of positive integers, for e.g. n=[4,3], The result
would be a 4-gram followed by a 3-gram in the same tensor. If number of words is not enough to make up for
a n-gram, an empty string would be returned. For e.g. 3 grams on ["mindspore","best"] would result in an
empty string be produced.
-left_pad(tuple, optional): ("pad_token", pad_width). Padding performed on left side of the sequence. pad_width
-will be capped at n-1. left_pad=("_",2) would pad left side of the sequence with "__" (Default is None).
-right_pad(tuple, optional): ("pad_token", pad_width). Padding performed on right side of the sequence.
+left_pad (tuple, optional): ("pad_token", pad_width). Padding performed on left side of the sequence. pad_width
+will be capped at n-1. left_pad=("_",2) would pad left side of the sequence with "__" (default=None).
+right_pad (tuple, optional): ("pad_token", pad_width). Padding performed on right side of the sequence.
pad_width will be capped at n-1. right_pad=("-":2) would pad right side of the sequence with "--"
-(Default is None).
-separator(str,optional): symbol used to join strings together. for e.g. if 2-gram the ["mindspore", "amazing"]
-with separator="-" the result would be ["mindspore-amazing"] (Default is None which means whitespace is
+(default=None).
+separator (str, optional): symbol used to join strings together. for e.g. if 2-gram the ["mindspore", "amazing"]
+with separator="-" the result would be ["mindspore-amazing"] (default=None, which means whitespace is
used).
"""
@@ -86,11 +88,12 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
Args:
hmm_path (str): the dictionary file is used by HMMSegment algorithm,
the dictionary can be obtained on the official website of cppjieba.
-mp_path(str): the dictionary file is used by MPSegment algorithm,
+mp_path (str): the dictionary file is used by MPSegment algorithm,
the dictionary can be obtained on the official website of cppjieba.
-mode (Enum): [Default "MIX"], "MP" model will tokenize with MPSegment algorithm,
+mode (JiebaMode, optional): "MP" model will tokenize with MPSegment algorithm,
"HMM" mode will tokenize with Hiddel Markov Model Segment algorithm,
-"MIX" model will tokenize with a mix of MPSegment and HMMSegment algorithm.
+"MIX" model will tokenize with a mix of MPSegment and HMMSegment algorithm
+(default="MIX").
"""
@check_jieba_init
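
A sketch of constructing the tokenizer documented above. The `mindspore.dataset.text` import path, the `JiebaMode` import location, and the dictionary file paths (HMM_FILE, MP_FILE) are assumptions; the cppjieba dictionaries must be downloaded as the docstring notes.

    import mindspore.dataset.text as text
    from mindspore.dataset.text import JiebaMode

    # Hypothetical paths to the cppjieba dictionaries mentioned in the docstring.
    HMM_FILE = "./cppjieba/hmm_model.utf8"
    MP_FILE = "./cppjieba/jieba.dict.utf8"

    # Tokenize with the MPSegment algorithm only; the default mode is "MIX".
    jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    data = data.map(input_columns=["text"], operations=jieba_op)
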
@@ -104,13 +107,15 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
@check_jieba_add_word
def add_word(self, word, freq=None):
"""
-Add user defined word to JiebaTokenizer's dictionary
+Add user defined word to JiebaTokenizer's dictionary.
Args:
-word(required, string): The word to be added to the JiebaTokenizer instance.
+word (str): The word to be added to the JiebaTokenizer instance.
The added word will not be written into the built-in dictionary on disk.
-freq(optional, int): The frequency of the word to be added, The higher the frequency,
-the better change the word will be tokenized(default None, use default frequency).
+freq (int, optional): The frequency of the word to be added, The higher the frequency,
+the better change the word will be tokenized(default=None, use default frequency).
"""
if freq is None:
super().add_word(word, 0)
else:
@@ -119,15 +124,20 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
@check_jieba_add_dict
def add_dict(self, user_dict):
"""
-Add user defined word to JiebaTokenizer's dictionary
+Add user defined word to JiebaTokenizer's dictionary.
Args:
-user_dict(path/dict):Dictionary to be added, file path or Python dictionary,
-Python Dict format: {word1:freq1, word2:freq2,...}
+user_dict (str or dict): Dictionary to be added, file path or Python dictionary,
+Python Dict format: {word1:freq1, word2:freq2,...}.
Jieba dictionary format : word(required), freq(optional), such as:
-word1 freq1
-word2
-word3 freq3
+.. code-block::
+
+    word1 freq1
+    word2
+    word3 freq3
"""
if isinstance(user_dict, str):
self.__add_dict_py_file(user_dict)
elif isinstance(user_dict, dict):
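
A brief sketch of the two dictionary-extension methods above, continuing the `jieba_op` instance from the earlier JiebaTokenizer sketch; the words and frequencies are illustrative only.

    # Add a single user-defined word; freq=None falls back to the default frequency.
    jieba_op.add_word("mindspore")
    jieba_op.add_word("tokenizer", freq=10)

    # Add several words at once from a Python dict ({word: freq, ...}),
    # or pass a file path in the Jieba dictionary format shown above.
    jieba_op.add_dict({"mindspore": 10, "tokenizer": 20})
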
@@ -190,12 +200,12 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
"""
Tokenize scalar token or 1-D tokens to 1-D subword tokens.
-Args
-vocab(Vocab): a Vocab object.
-suffix_indicator(string, optional): Used to show that the subword is the last part of a word(default '##').
-max_bytes_per_token(int, optional): Tokens exceeding this length will not be further split(default 100).
-unknown_token(string, optional): When we can not found the token: if 'unknown_token' is empty string,
-return the token directly, else return 'unknown_token'(default '[UNK]').
+Args:
+vocab (Vocab): a Vocab object.
+suffix_indicator (str, optional): Used to show that the subword is the last part of a word(default '##').
+max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split(default 100).
+unknown_token (str, optional): When we can not found the token: if 'unknown_token' is empty string,
+return the token directly, else return 'unknown_token'(default='[UNK]').
"""
def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]'):
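
A usage sketch matching the constructor signature shown above; the import path is assumed and the subword vocabulary is a toy example, not a real WordPiece vocab.

    import mindspore.dataset.text as text

    vocab = text.Vocab.from_list(["my", "favor", "##ite", "book", "[UNK]"])
    wordpiece_op = text.WordpieceTokenizer(vocab, suffix_indicator='##',
                                           max_bytes_per_token=100,
                                           unknown_token='[UNK]')
    data = data.map(input_columns=["text"], operations=wordpiece_op)
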
@@ -209,7 +219,7 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
if platform.system().lower() != 'windows':
class WhitespaceTokenizer(cde.WhitespaceTokenizerOp):
"""
-Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces(such as: ' ', '\t', '\r', '\n').
+Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces(such as: ' ', '\\t', '\\r', '\\n').
"""
@@ -218,7 +228,7 @@ if platform.system().lower() != 'windows':
Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
Args:
-keep_whitespace(bool, optional): If or not emit whitespace tokens (default False)
+keep_whitespace (bool, optional): If or not emit whitespace tokens (default=False).
"""
def __init__(self, keep_whitespace=False):
@@ -246,9 +256,9 @@ if platform.system().lower() != 'windows':
Apply normalize operation on utf-8 string tensor.
Args:
-normalize_form(Enum, optional): Valid values are "NONE", "NFC", "NFKC", "NFD", "NFKD".
+normalize_form (NormalizeForm, optional): Valid values are "NONE", "NFC", "NFKC", "NFD", "NFKD".
If set "NONE", will do nothing for input string tensor.
-If set to any of "NFC", "NFKC", "NFD", "NFKD", will apply normalize operation(default "NFKC").
+If set to any of "NFC", "NFKC", "NFD", "NFKD", will apply normalize operation(default="NFKC").
See http://unicode.org/reports/tr15/ for details.
"""
@@ -260,13 +270,14 @@ if platform.system().lower() != 'windows':
class RegexReplace(cde.RegexReplaceOp):
"""
Replace utf-8 string tensor with 'replace' according to regular expression 'pattern'.
See http://userguide.icu-project.org/strings/regexp for support regex pattern.
Args:
-pattern(string): the regex expression patterns.
-replace(string): the string to replace matched element.
+pattern(str): the regex expression patterns.
+replace(str): the string to replace matched element.
replace_all(bool, optional): If False, only replace first matched element;
-if True, replace all matched elements(default True).
+if True, replace all matched elements(default=True).
"""
def __init__(self, pattern, replace, replace_all=True):
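
A small sketch of the constructor above; the import path and the pattern are illustrative assumptions.

    import mindspore.dataset.text as text

    # Collapse runs of whitespace into a single space in every string element.
    replace_op = text.RegexReplace(pattern=r"\s+", replace=" ", replace_all=True)
    data = data.map(input_columns=["text"], operations=replace_op)
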
@@ -279,13 +290,14 @@ if platform.system().lower() != 'windows':
class RegexTokenizer(cde.RegexTokenizerOp):
"""
Tokenize a scalar tensor of UTF-8 string by regex expression pattern.
See http://userguide.icu-project.org/strings/regexp for support regex pattern.
Args:
-delim_pattern(string): The pattern of regex delimiters.
+delim_pattern(str): The pattern of regex delimiters.
The original string will be split by matched elements.
-keep_delim_pattern(string, optional): The string matched by 'delim_pattern' can be kept as a token
-if it can be matched by 'keep_delim_pattern'. And the default value is empty string(''),
+keep_delim_pattern(str, optional): The string matched by 'delim_pattern' can be kept as a token
+if it can be matched by 'keep_delim_pattern'. And the default value is empty str(''),
in this situation, delimiters will not kept as a output token.
"""
@@ -302,12 +314,12 @@ if platform.system().lower() != 'windows':
Args:
lower_case(bool, optional): If True, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation
on input text to make the text to lower case and strip accents characters; If False, only apply
-NormalizeUTF8('normalization_form' mode) operation on input text(default False).
-keep_whitespace(bool, optional), If True, the whitespace will be kept in out tokens(default False).
-normalization_form(Enum, optional), Used to specify a specific normlaize mode,
-only effective when 'lower_case' is False. See NormalizeUTF8 for details(default 'NONE').
-preserve_unused_token(bool, optional), If True, do not split special tokens like
-'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default True).
+NormalizeUTF8('normalization_form' mode) operation on input text(default=False).
+keep_whitespace(bool, optional): If True, the whitespace will be kept in out tokens(default=False).
+normalization_form(NormalizeForm, optional): Used to specify a specific normlaize mode,
+only effective when 'lower_case' is False. See NormalizeUTF8 for details(default='NONE').
+preserve_unused_token(bool, optional): If True, do not split special tokens like
+'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default=True).
"""
def __init__(self, lower_case=False, keep_whitespace=False,
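
The class name is outside this hunk; assuming it is the BasicTokenizer from mindspore.dataset.text (an assumption based on the parameter set), a minimal sketch:

    import mindspore.dataset.text as text

    # Lower-case and strip accents, while keeping BERT-style special tokens intact.
    basic_op = text.BasicTokenizer(lower_case=True, preserve_unused_token=True)
    data = data.map(input_columns=["text"], operations=basic_op)
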
@@ -326,18 +338,18 @@ if platform.system().lower() != 'windows':
Args:
vocab(Vocab): a Vocab object.
-suffix_indicator(string, optional): Used to show that the subword is the last part of a word(default '##').
-max_bytes_per_token(int, optional): Tokens exceeding this length will not be further split(default 100).
-unknown_token(string, optional): When we can not found the token: if 'unknown_token' is empty string,
-return the token directly, else return 'unknown_token'(default '[UNK]').
+suffix_indicator(str, optional): Used to show that the subword is the last part of a word(default='##').
+max_bytes_per_token(int, optional): Tokens exceeding this length will not be further split(default=100).
+unknown_token(str, optional): When we can not found the token: if 'unknown_token' is empty string,
+return the token directly, else return 'unknown_token'(default='[UNK]').
lower_case(bool, optional): If True, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation
on input text to make the text to lower case and strip accents characters; If False, only apply
-NormalizeUTF8('normalization_form' mode) operation on input text(default False).
-keep_whitespace(bool, optional), If True, the whitespace will be kept in out tokens(default False).
-normalization_form(Enum, optional), Used to specify a specific normlaize mode,
-only effective when 'lower_case' is False. See NormalizeUTF8 for details(default 'NONE').
-preserve_unused_token(bool, optional), If True, do not split special tokens like
-'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default True).
+NormalizeUTF8('normalization_form' mode) operation on input text(default=False).
+keep_whitespace(bool, optional): If True, the whitespace will be kept in out tokens(default=False).
+normalization_form(NormalizeForm, optional): Used to specify a specific normlaize mode,
+only effective when 'lower_case' is False. See NormalizeUTF8 for details(default='NONE').
+preserve_unused_token(bool, optional): If True, do not split special tokens like
+'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default=True).
"""
def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100,
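
Similarly, the vocab-based tokenizer above is presumably BertTokenizer (again, the class name is not visible in the hunk); a sketch under that assumption, reusing a toy subword vocab:

    import mindspore.dataset.text as text

    vocab = text.Vocab.from_list(["[UNK]", "[CLS]", "[SEP]", "my", "favor", "##ite"])
    bert_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##',
                                 lower_case=True, preserve_unused_token=True)
    data = data.map(input_columns=["text"], operations=bert_op)
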

@@ -25,7 +25,9 @@ from .validators import check_from_file, check_from_list, check_from_dict, check
class Vocab(cde.Vocab):
"""
-Vocab object that is used to lookup a word. It contains a map that maps each word(str) to an id (int)
+Vocab object that is used to lookup a word.
+It contains a map that maps each word(str) to an id (int).
"""
@classmethod
@@ -33,29 +35,32 @@ class Vocab(cde.Vocab):
def from_dataset(cls, dataset, columns=None, freq_range=None, top_k=None, special_tokens=None,
special_first=None):
"""
-Build a vocab from a dataset. This would collect all unique words in a dataset and return a vocab within
+Build a vocab from a dataset.
+This would collect all unique words in a dataset and return a vocab within
the frequency range specified by user in freq_range. User would be warned if no words fall into the frequency.
Words in vocab are ordered from highest frequency to lowest frequency. Words with the same frequency would be
ordered lexicographically.
Args:
dataset(Dataset): dataset to build vocab from.
-columns([str, list], optional): column names to get words from. It can be a list of column names.
-(Default=None where all columns will be used. If any column isn't string type, will return error)
+columns(list of str, optional): column names to get words from. It can be a list of column names.
+(default=None, where all columns will be used. If any column isn't string type, will return error).
freq_range(tuple, optional): A tuple of integers (min_frequency, max_frequency). Words within the frequency
range would be kept. 0 <= min_frequency <= max_frequency <= total_words. min_frequency=0 is the same as
min_frequency=1. max_frequency > total_words is the same as max_frequency = total_words.
min_frequency/max_frequency can be None, which corresponds to 0/total_words separately
(default=None, all words are included).
top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are
-taken. top_k is taken after freq_range. If not enough top_k, all words will be taken. (default=None
+taken. top_k is taken after freq_range. If not enough top_k, all words will be taken. (default=None,
all words are included).
special_tokens(list, optional): a list of strings, each one is a special token. for example
special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens
is specified and special_first is set to None, special_tokens will be prepended. (default=None).
-return:
-text.Vocab: Vocab object built from dataset.
+Returns:
+Vocab, Vocab object built from dataset.
"""
vocab = Vocab()
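
A hedged sketch of from_dataset based on the signature above, feeding the result back into a Lookup op; the import path and the existence of a string column "text" in `data` are assumptions.

    import mindspore.dataset.text as text

    # Keep words appearing at least twice, capped at the 5000 most frequent,
    # with the special tokens prepended to the vocab.
    vocab = text.Vocab.from_dataset(data, columns=["text"],
                                    freq_range=(2, None), top_k=5000,
                                    special_tokens=["<pad>", "<unk>"],
                                    special_first=True)
    data = data.map(input_columns=["text"], operations=text.Lookup(vocab))
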
@@ -69,7 +74,8 @@ class Vocab(cde.Vocab):
@check_from_list
def from_list(cls, word_list, special_tokens=None, special_first=None):
"""
-build a vocab object from a list of word.
+Build a vocab object from a list of word.
Args:
word_list(list): a list of string where each element is a word of type string.
special_tokens(list, optional): a list of strings, each one is a special token. for example
@@ -77,34 +83,40 @@ class Vocab(cde.Vocab):
special_first(bool, optional): whether special_tokens will be prepended/appended to vocab, If special_tokens
is specified and special_first is set to None, special_tokens will be prepended. (default=None).
"""
return super().from_list(word_list, special_tokens, special_first)
@classmethod
@check_from_file
def from_file(cls, file_path, delimiter=None, vocab_size=None, special_tokens=None, special_first=None):
"""
-build a vocab object from a list of word.
+Build a vocab object from a list of word.
Args:
-file_path(str): path to the file which contains the vocab list.
-delimiter(str, optional): a delimiter to break up each line in file, the first element is taken to be
+file_path (str): path to the file which contains the vocab list.
+delimiter (str, optional): a delimiter to break up each line in file, the first element is taken to be
the word (default=None).
-vocab_size(int, optional): number of words to read from file_path (default=None, all words are taken).
-special_tokens(list, optional): a list of strings, each one is a special token. for example
+vocab_size (int, optional): number of words to read from file_path (default=None, all words are taken).
+special_tokens (list, optional): a list of strings, each one is a special token. for example
special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
-special_first(bool, optional): whether special_tokens will be prepended/appended to vocab, If special_tokens
-is specified and special_first is set to None, special_tokens will be prepended. (default=None).
+special_first (bool, optional): whether special_tokens will be prepended/appended to vocab,
+If special_tokens is specified and special_first is set to None,
+special_tokens will be prepended. (default=None).
"""
return super().from_file(file_path, delimiter, vocab_size, special_tokens, special_first)
@classmethod
@check_from_dict
def from_dict(cls, word_dict):
"""
-build a vocab object from a dict.
+Build a vocab object from a dict.
Args:
-word_dict(dict): dict contains word, id pairs where word should be str and id int. id is recommended to
+word_dict (dict): dict contains word, id pairs where word should be str and id int. id is recommended to
start from 0 and be continuous. ValueError will be raised if id is negative.
"""
return super().from_dict(word_dict)
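
A short sketch of the three class methods above; the import path is assumed and "vocab.txt" is a hypothetical file where each line starts with a word, optionally followed by delimiter-separated extra fields.

    import mindspore.dataset.text as text

    # From an in-memory list; "<pad>" and "<unk>" are prepended because special_first=True.
    vocab1 = text.Vocab.from_list(["home", "is", "behind"],
                                  special_tokens=["<pad>", "<unk>"], special_first=True)

    # From a file; the first space-delimited field of each line is taken as the word.
    vocab2 = text.Vocab.from_file("vocab.txt", delimiter=" ", vocab_size=10000)

    # From an explicit word -> id mapping; ids should start from 0 and be continuous.
    vocab3 = text.Vocab.from_dict({"home": 0, "is": 1, "behind": 2, "<unk>": 3})
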
@@ -113,11 +125,11 @@ def to_str(array, encoding='utf8'):
Convert numpy array of `bytes` to array of `str` by decoding each element based on charset `encoding`.
Args:
-array (numpy array): Array of type `bytes` representing strings.
+array (numpy.ndarray): Array of type `bytes` representing strings.
encoding (string): Indicating the charset for decoding.
-Returns:
-Numpy array of `str`.
+Returns:
+numpy.ndarray, numpy array of `str`.
"""
if not isinstance(array, np.ndarray):
@@ -131,11 +143,11 @@ def to_bytes(array, encoding='utf8'):
Convert numpy array of `str` to array of `bytes` by encoding each element based on charset `encoding`.
Args:
-array (numpy array): Array of type `str` representing strings.
-encoding (string): Indicating the charset for encoding.
-Returns:
-Numpy array of `bytes`.
+array (numpy.ndarray): Array of type `str` representing strings.
+encoding (str): Indicating the charset for encoding.
+Returns:
+numpy.ndarray, numpy array of `bytes`.
"""
if not isinstance(array, np.ndarray):
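
A usage sketch for the two helpers above, which convert between the byte arrays produced for string columns and arrays of Python str; the `mindspore.dataset.text` import path is assumed.

    import numpy as np
    import mindspore.dataset.text as text

    byte_array = np.array([b"welcome", b"to", b"mindspore"])
    str_array = text.to_str(byte_array, encoding='utf8')    # numpy array of str
    round_trip = text.to_bytes(str_array)                   # back to numpy array of bytes
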

@@ -75,7 +75,6 @@ class Slice(cde.SliceOp):
Slice operation to extract a tensor out using the given n slices.
The functionality of Slice is similar to NumPy indexing feature.
(Currently only rank 1 Tensors are supported)
Args:
@@ -87,17 +86,17 @@ class Slice(cde.SliceOp):
4. Ellipses ...: slice all dimensions between the two slices.
Examples:
>>> # Data before
>>> # | col |
>>> # +---------+
>>> # | [1,2,3] |
>>> # +---------|
>>> data = data.map(operations=Slice(slice(1,3))) # slice indices 1 and 2 only
>>> # Data after
>>> # | col |
>>> # +------------+
>>> # | [1,2] |
>>> # +------------|
"""
@check_slice_op
