From 52167da6ae255332f3460179d30b29d7e0f0b0b4 Mon Sep 17 00:00:00 2001
From: Xiao Tianci
Date: Mon, 8 Mar 2021 21:07:54 +0800
Subject: [PATCH] fix errors in docs of text

---
 mindspore/dataset/text/transforms.py | 112 +++++++++++++--------------
 1 file changed, 56 insertions(+), 56 deletions(-)

diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py
index a295bbe305..25ba4c85c4 100644
--- a/mindspore/dataset/text/transforms.py
+++ b/mindspore/dataset/text/transforms.py
@@ -63,6 +63,7 @@ class TextTensorOperation(TensorOperation):
     """
     Base class of Text Tensor Ops
     """
+
     def parse(self):
         raise NotImplementedError("TextTensorOperation has to implement parse() method.")
 
@@ -112,7 +113,7 @@ class JiebaTokenizer(TextTensorOperation):
         >>> tokenizer_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP, with_offsets=False)
         >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
         >>> # If with_offsets=False, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
-        ... # ["offsets_limit", dtype=uint32]}
+        >>> # ["offsets_limit", dtype=uint32]}
         >>> tokenizer_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP, with_offsets=True)
         >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
         ...                                               output_columns=["token", "offsets_start", "offsets_limit"],
@@ -155,10 +156,10 @@
             >>> from mindspore.dataset.text import JiebaMode
             >>> jieba_hmm_file = "/path/to/jieba/hmm/file"
             >>> jieba_mp_file = "/path/to/jieba/mp/file"
-            >>> jieba_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=text.JiebaMode.MP)
+            >>> jieba_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP)
             >>> sentence_piece_vocab_file = "/path/to/sentence/piece/vocab/file"
             >>> with open(sentence_piece_vocab_file, 'r') as f:
-            >>>     for line in f:
+            ...     for line in f:
             ...         word = line.split(',')[0]
             ...         jieba_op.add_word(word)
             >>> text_file_dataset = text_file_dataset.map(operations=jieba_op, input_columns=["text"])
@@ -300,7 +301,8 @@ class Ngram(TextTensorOperation):
             (default=None, which will use whitespace as separator).
 
     Examples:
-        >>> text_file_dataset = text_file_dataset.map(operations=text.Ngram(3, separator=""))
+        >>> ngram_op = text.Ngram(3, separator="")
+        >>> text_file_dataset = text_file_dataset.map(operations=ngram_op)
     """
 
     @check_ngram
@@ -350,20 +352,19 @@ class SlidingWindow(TextTensorOperation):
         axis (int, optional): The axis along which the sliding window is computed (default=0).
 
     Examples:
-        >>> import mindspore.dataset.text as text
-        >>>
+        >>> dataset = ds.NumpySlicesDataset(data=[[1, 2, 3, 4, 5]], column_names="col1")
         >>> # Data before
-        >>> # | col1        |
-        >>> # +-------------+
-        >>> # | [1,2,3,4,5] |
-        >>> # +-------------+
-        >>> data1 = data1.map(operations=text.SlidingWindow(3, 0))
+        >>> # |     col1     |
+        >>> # +--------------+
+        >>> # | [[1, 2, 3, 4, 5]] |
+        >>> # +--------------+
+        >>> dataset = dataset.map(operations=text.SlidingWindow(3, 0))
         >>> # Data after
-        >>> # | col1        |
-        >>> # +-------------+
-        >>> # | [[1,2,3],   |
-        >>> # | [2,3,4],    |
-        >>> # | [3,4,5]]    |
+        >>> # |     col1     |
+        >>> # +--------------+
+        >>> # | [[1, 2, 3],  |
+        >>> # | [2, 3, 4],   |
+        >>> # | [3, 4, 5]]   |
         >>> # +--------------+
     """
@@ -420,19 +421,19 @@ class TruncateSequencePair(TextTensorOperation):
         max_length (int): Maximum length required.
 
     Examples:
-        >>> import mindspore.dataset.text as text
-        >>>
+        >>> dataset = ds.NumpySlicesDataset(data={"col1": [[1, 2, 3]], "col2": [[4, 5]]})
         >>> # Data before
-        >>> # | col1    | col2    |
-        >>> # +---------+---------|
-        >>> # | [1,2,3] | [4,5]   |
-        >>> # +---------+---------+
-        >>> data1 = data1.map(operations=text.TruncateSequencePair(4))
+        >>> # |   col1    |   col2    |
+        >>> # +-----------+-----------|
+        >>> # | [1, 2, 3] | [4, 5]    |
+        >>> # +-----------+-----------+
+        >>> truncate_sequence_pair_op = text.TruncateSequencePair(max_length=4)
+        >>> dataset = dataset.map(operations=truncate_sequence_pair_op)
         >>> # Data after
-        >>> # | col1    | col2    |
-        >>> # +---------+---------+
-        >>> # | [1,2]   | [4,5]   |
-        >>> # +---------+---------+
+        >>> # |   col1    |   col2    |
+        >>> # +-----------+-----------+
+        >>> # | [1, 2]    | [4, 5]    |
+        >>> # +-----------+-----------+
     """
 
     @check_pair_truncate
@@ -451,17 +452,15 @@ class UnicodeCharTokenizer(TextTensorOperation):
         with_offsets (bool, optional): If or not output offsets of tokens (default=False).
 
     Examples:
-        >>> import mindspore.dataset.text as text
-        >>>
         >>> # If with_offsets=False, default output one column {["text", dtype=str]}
-        >>> tokenizer_op = text.UnicodeCharTokenizer()
-        >>> data1 = data1.map(operations=tokenizer_op)
-        >>> # If with_offsets=False, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
+        >>> tokenizer_op = text.UnicodeCharTokenizer(with_offsets=False)
+        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
+        >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
         >>> # ["offsets_limit", dtype=uint32]}
-        >>> tokenizer_op = text.UnicodeCharTokenizer(True)
-        >>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
-        >>>                   output_columns=["token", "offsets_start", "offsets_limit"],
-        >>>                   column_order=["token", "offsets_start", "offsets_limit"])
+        >>> tokenizer_op = text.UnicodeCharTokenizer(with_offsets=True)
+        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
+        >>>                                           output_columns=["token", "offsets_start", "offsets_limit"],
+        >>>                                           column_order=["token", "offsets_start", "offsets_limit"])
     """
 
     @check_with_offsets
@@ -486,19 +485,19 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
         with_offsets (bool, optional): If or not output offsets of tokens (default=False).
 
     Examples:
-        >>> import mindspore.dataset.text as text
-        >>>
+        >>> vocab_list = ["book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the"]
+        >>> vocab = text.Vocab.from_list(vocab_list)
        >>> # If with_offsets=False, default output one column {["text", dtype=str]}
         >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
-        ...                                         max_bytes_per_token=100, with_offsets=False)
-        >>> data1 = data1.map(operations=tokenizer_op)
-        >>> # If with_offsets=False, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
+        ...                                        max_bytes_per_token=100, with_offsets=False)
+        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
+        >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
         >>> # ["offsets_limit", dtype=uint32]}
         >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
         ...                                        max_bytes_per_token=100, with_offsets=True)
-        >>> data2 = data2.map(operations=tokenizer_op,
-        ...                   input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
-        ...                   column_order=["token", "offsets_start", "offsets_limit"])
+        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
+        ...                                           output_columns=["token", "offsets_start", "offsets_limit"],
+        ...                                           column_order=["token", "offsets_start", "offsets_limit"])
     """
 
     @check_wordpiece_tokenizer
@@ -566,6 +565,8 @@ if platform.system().lower() != 'windows':
             with_offsets (bool, optional): If or not output offsets of tokens (default=False).
 
         Examples:
+            >>> from mindspore.dataset.text import NormalizeForm
+            >>>
             >>> # If with_offsets=False, default output one column {["text", dtype=str]}
             >>> tokenizer_op = text.BasicTokenizer(lower_case=False,
             ...                                    keep_whitespace=False,
@@ -631,6 +632,7 @@ if platform.system().lower() != 'windows':
 
         Examples:
             >>> from mindspore.dataset.text import NormalizeForm
+            >>>
             >>> # If with_offsets=False, default output one column {["text", dtype=str]}
             >>> vocab_list = ["床", "前", "明", "月", "光", "疑", "是", "地", "上", "霜", "举", "头", "望", "低",
             ...               "思", "故", "乡","繁", "體", "字", "嘿", "哈", "大", "笑", "嘻", "i", "am", "mak",
@@ -828,11 +830,9 @@ if platform.system().lower() != 'windows':
            >>> #                                                  ["offsets_start", dtype=uint32],
             >>> #                                                  ["offsets_limit", dtype=uint32]}
             >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
-            >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
-            ...                                               output_columns=["token", "offsets_start",
-            ...                                                               "offsets_limit"],
-            ...                                               column_order=["token", "offsets_start",
-            ...                                                             "offsets_limit"])
+            >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
+            ...                                           output_columns=["token", "offsets_start", "offsets_limit"],
+            ...                                           column_order=["token", "offsets_start", "offsets_limit"])
         """
 
@@ -859,15 +859,15 @@ if platform.system().lower() != 'windows':
 
         Examples:
             >>> # If with_offsets=False, default output one column {["text", dtype=str]}
-            >>> tokenizer_op = text.WhitespaceTokenizer()
-            >>> data1 = data1.map(operations=tokenizer_op)
-            >>> # If with_offsets=False, then output three columns {["token", dtype=str],
+            >>> tokenizer_op = text.WhitespaceTokenizer(with_offsets=False)
+            >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
+            >>> # If with_offsets=True, then output three columns {["token", dtype=str],
             >>> #                                                  ["offsets_start", dtype=uint32],
             >>> #                                                  ["offsets_limit", dtype=uint32]}
-            >>> tokenizer_op = text.WhitespaceTokenizer(True)
-            >>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
-            >>>                   output_columns=["token", "offsets_start", "offsets_limit"],
-            >>>                   column_order=["token", "offsets_start", "offsets_limit"])
+            >>> tokenizer_op = text.WhitespaceTokenizer(with_offsets=True)
+            >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
+            ...                                           output_columns=["token", "offsets_start", "offsets_limit"],
+            ...                                           column_order=["token", "offsets_start", "offsets_limit"])
         """
 
         @check_with_offsets