!13004 [MD] fix examples in docs of text

From: @tiancixiao
Reviewed-by: @liucunwei,@heleiwang
Signed-off-by: @liucunwei
pull/13004/MERGE
Committed by mindspore-ci-bot (via Gitee), 4 years ago
commit df66ed0fc9

@@ -63,6 +63,7 @@ class TextTensorOperation(TensorOperation):
 """
 Base class of Text Tensor Ops
 """
+
 def parse(self):
 raise NotImplementedError("TextTensorOperation has to implement parse() method.")
@@ -112,7 +113,7 @@ class JiebaTokenizer(TextTensorOperation):
 >>> tokenizer_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP, with_offsets=False)
 >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
 >>> # If with_offsets=False, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
-... # ["offsets_limit", dtype=uint32]}
+>>> # ["offsets_limit", dtype=uint32]}
 >>> tokenizer_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP, with_offsets=True)
 >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
 ... output_columns=["token", "offsets_start", "offsets_limit"],
@@ -155,10 +156,10 @@ class JiebaTokenizer(TextTensorOperation):
 >>> from mindspore.dataset.text import JiebaMode
 >>> jieba_hmm_file = "/path/to/jieba/hmm/file"
 >>> jieba_mp_file = "/path/to/jieba/mp/file"
->>> jieba_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=text.JiebaMode.MP)
+>>> jieba_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP)
 >>> sentence_piece_vocab_file = "/path/to/sentence/piece/vocab/file"
 >>> with open(sentence_piece_vocab_file, 'r') as f:
->>> for line in f:
+... for line in f:
 ... word = line.split(',')[0]
 ... jieba_op.add_word(word)
 >>> text_file_dataset = text_file_dataset.map(operations=jieba_op, input_columns=["text"])
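For context, the corrected add_word example above can be exercised roughly as follows. This is a minimal sketch that mirrors the docstring: it assumes valid jieba HMM/MP dictionary files and a comma-separated user vocab file at the placeholder paths, and the MindSpore 1.x-era mindspore.dataset.text API.

    import mindspore.dataset.text as text
    from mindspore.dataset.text import JiebaMode

    # Placeholder paths, as in the docstring; replace with real dictionary files.
    jieba_hmm_file = "/path/to/jieba/hmm/file"
    jieba_mp_file = "/path/to/jieba/mp/file"
    jieba_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP)

    # Register extra words from a comma-separated vocab file, one entry per line.
    sentence_piece_vocab_file = "/path/to/sentence/piece/vocab/file"
    with open(sentence_piece_vocab_file, 'r') as f:
        for line in f:
            word = line.split(',')[0]
            jieba_op.add_word(word)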
@@ -300,7 +301,8 @@ class Ngram(TextTensorOperation):
 (default=None, which will use whitespace as separator).
 Examples:
->>> text_file_dataset = text_file_dataset.map(operations=text.Ngram(3, separator=""))
+>>> ngram_op = text.Ngram(3, separator="")
+>>> text_file_dataset = text_file_dataset.map(operations=ngram_op)
 """
 @check_ngram
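The Ngram change only splits construction and mapping into two steps. As a rough, self-contained illustration of what the op does, here is a sketch that assumes an in-memory NumpySlicesDataset with a hypothetical "text" column of pre-tokenized strings:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    # One row whose "text" column is already tokenized into words.
    dataset = ds.NumpySlicesDataset(data={"text": [["this", "is", "a", "sample"]]})
    # 3-grams joined with a space: ["this is a", "is a sample"].
    ngram_op = text.Ngram(3, separator=" ")
    dataset = dataset.map(operations=ngram_op, input_columns=["text"])
    for row in dataset.create_dict_iterator(output_numpy=True):
        print(row["text"])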
@@ -350,20 +352,19 @@ class SlidingWindow(TextTensorOperation):
 axis (int, optional): The axis along which the sliding window is computed (default=0).
 Examples:
->>> import mindspore.dataset.text as text
->>>
+>>> dataset = ds.NumpySlicesDataset(data=[[1, 2, 3, 4, 5]], column_names="col1")
 >>> # Data before
 >>> # | col1 |
->>> # +-------------+
->>> # | [1,2,3,4,5] |
->>> # +-------------+
->>> data1 = data1.map(operations=text.SlidingWindow(3, 0))
+>>> # +--------------+
+>>> # | [[1, 2, 3, 4, 5]] |
+>>> # +--------------+
+>>> dataset = dataset.map(operations=text.SlidingWindow(3, 0))
 >>> # Data after
 >>> # | col1 |
->>> # +-------------+
->>> # | [[1,2,3], |
->>> # | [2,3,4], |
->>> # | [3,4,5]] |
+>>> # +--------------+
+>>> # | [[1, 2, 3], |
+>>> # | [2, 3, 4], |
+>>> # | [3, 4, 5]] |
 >>> # +--------------+
 """
@@ -420,19 +421,19 @@ class TruncateSequencePair(TextTensorOperation):
 max_length (int): Maximum length required.
 Examples:
->>> import mindspore.dataset.text as text
->>>
+>>> dataset = ds.NumpySlicesDataset(data={"col1": [[1, 2, 3]], "col2": [[4, 5]]})
 >>> # Data before
 >>> # | col1 | col2 |
->>> # +---------+---------|
->>> # | [1,2,3] | [4,5] |
->>> # +---------+---------+
->>> data1 = data1.map(operations=text.TruncateSequencePair(4))
+>>> # +-----------+-----------|
+>>> # | [1, 2, 3] | [4, 5] |
+>>> # +-----------+-----------+
+>>> truncate_sequence_pair_op = text.TruncateSequencePair(max_length=4)
+>>> dataset = dataset.map(operations=truncate_sequence_pair_op)
 >>> # Data after
 >>> # | col1 | col2 |
->>> # +---------+---------+
->>> # | [1,2] | [4,5] |
->>> # +---------+---------+
+>>> # +-----------+-----------+
+>>> # | [1, 2] | [4, 5] |
+>>> # +-----------+-----------+
 """
 @check_pair_truncate
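Likewise, the TruncateSequencePair example can be reproduced with a small in-memory dataset; a sketch under the same assumptions, with the two input columns named explicitly:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    dataset = ds.NumpySlicesDataset(data={"col1": [[1, 2, 3]], "col2": [[4, 5]]})
    # Truncate the pair so the combined length is at most 4: [1, 2] and [4, 5].
    truncate_sequence_pair_op = text.TruncateSequencePair(max_length=4)
    dataset = dataset.map(operations=truncate_sequence_pair_op, input_columns=["col1", "col2"])
    for row in dataset.create_dict_iterator(output_numpy=True):
        print(row["col1"], row["col2"])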
@@ -451,17 +452,15 @@ class UnicodeCharTokenizer(TextTensorOperation):
 with_offsets (bool, optional): If or not output offsets of tokens (default=False).
 Examples:
->>> import mindspore.dataset.text as text
->>>
 >>> # If with_offsets=False, default output one column {["text", dtype=str]}
->>> tokenizer_op = text.UnicodeCharTokenizer()
->>> data1 = data1.map(operations=tokenizer_op)
->>> # If with_offsets=False, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
+>>> tokenizer_op = text.UnicodeCharTokenizer(with_offsets=False)
+>>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
+>>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
 >>> # ["offsets_limit", dtype=uint32]}
->>> tokenizer_op = text.UnicodeCharTokenizer(True)
->>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
+>>> tokenizer_op = text.UnicodeCharTokenizer(with_offsets=True)
+>>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
 >>> output_columns=["token", "offsets_start", "offsets_limit"],
 >>> column_order=["token", "offsets_start", "offsets_limit"])
 """
 @check_with_offsets
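The with_offsets=True pattern used by the corrected examples is the same across the tokenizers: the single "text" column is replaced by three output columns. A hedged sketch, assuming a TextFileDataset at a placeholder path and the 1.x-era column_order argument of map:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    # Hypothetical text file; each line becomes one row in the "text" column.
    text_file_dataset = ds.TextFileDataset("/path/to/text_file")
    tokenizer_op = text.UnicodeCharTokenizer(with_offsets=True)
    # Replace "text" with per-token strings plus their start/limit byte offsets.
    text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
                                              output_columns=["token", "offsets_start", "offsets_limit"],
                                              column_order=["token", "offsets_start", "offsets_limit"])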
@@ -486,19 +485,19 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
 with_offsets (bool, optional): If or not output offsets of tokens (default=False).
 Examples:
->>> import mindspore.dataset.text as text
->>>
+>>> vocab_list = ["book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the"]
+>>> vocab = text.Vocab.from_list(vocab_list)
 >>> # If with_offsets=False, default output one column {["text", dtype=str]}
 >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
 ... max_bytes_per_token=100, with_offsets=False)
->>> data1 = data1.map(operations=tokenizer_op)
->>> # If with_offsets=False, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
+>>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
+>>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
 >>> # ["offsets_limit", dtype=uint32]}
 >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
 ... max_bytes_per_token=100, with_offsets=True)
->>> data2 = data2.map(operations=tokenizer_op,
-... input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
+>>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
+... output_columns=["token", "offsets_start", "offsets_limit"],
 ... column_order=["token", "offsets_start", "offsets_limit"])
 """
 @check_wordpiece_tokenizer
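For WordpieceTokenizer, the new example builds the vocab inline; end to end it would look roughly like this (again a sketch assuming a TextFileDataset at a placeholder path):

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    vocab_list = ["book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the"]
    vocab = text.Vocab.from_list(vocab_list)
    tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
                                           max_bytes_per_token=100, with_offsets=False)
    text_file_dataset = ds.TextFileDataset("/path/to/text_file")  # hypothetical path
    text_file_dataset = text_file_dataset.map(operations=tokenizer_op)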
@@ -566,6 +565,8 @@ if platform.system().lower() != 'windows':
 with_offsets (bool, optional): If or not output offsets of tokens (default=False).
 Examples:
+>>> from mindspore.dataset.text import NormalizeForm
+>>>
 >>> # If with_offsets=False, default output one column {["text", dtype=str]}
 >>> tokenizer_op = text.BasicTokenizer(lower_case=False,
 ... keep_whitespace=False,
@@ -631,6 +632,7 @@ if platform.system().lower() != 'windows':
 Examples:
 >>> from mindspore.dataset.text import NormalizeForm
+>>>
 >>> # If with_offsets=False, default output one column {["text", dtype=str]}
 >>> vocab_list = ["", "", "", "", "", "", "", "", "", "", "", "", "", "",
 ... "", "", "","", "", "", "", "", "", "", "", "i", "am", "mak",
@@ -828,11 +830,9 @@ if platform.system().lower() != 'windows':
 >>> # ["offsets_start", dtype=uint32],
 >>> # ["offsets_limit", dtype=uint32]}
 >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
->>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
-... output_columns=["token", "offsets_start",
-... "offsets_limit"],
-... column_order=["token", "offsets_start",
-... "offsets_limit"])
+>>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
+... output_columns=["token", "offsets_start", "offsets_limit"],
+... column_order=["token", "offsets_start", "offsets_limit"])
 """
@@ -859,15 +859,15 @@ if platform.system().lower() != 'windows':
 Examples:
 >>> # If with_offsets=False, default output one column {["text", dtype=str]}
->>> tokenizer_op = text.WhitespaceTokenizer()
->>> data1 = data1.map(operations=tokenizer_op)
->>> # If with_offsets=False, then output three columns {["token", dtype=str],
+>>> tokenizer_op = text.WhitespaceTokenizer(with_offsets=False)
+>>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
+>>> # If with_offsets=True, then output three columns {["token", dtype=str],
 >>> # ["offsets_start", dtype=uint32],
 >>> # ["offsets_limit", dtype=uint32]}
->>> tokenizer_op = text.WhitespaceTokenizer(True)
->>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
->>> output_columns=["token", "offsets_start", "offsets_limit"],
->>> column_order=["token", "offsets_start", "offsets_limit"])
+>>> tokenizer_op = text.WhitespaceTokenizer(with_offsets=True)
+>>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
+... output_columns=["token", "offsets_start", "offsets_limit"],
+... column_order=["token", "offsets_start", "offsets_limit"])
 """
 @check_with_offsets
