# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing tokenizer ops (UnicodeCharTokenizer, WhitespaceTokenizer, UnicodeScriptTokenizer,
CaseFold, NormalizeUTF8, RegexReplace and RegexTokenizer) in DE
"""
import numpy as np

import mindspore.dataset as ds
import mindspore.dataset.text as text
from mindspore import log as logger

DATA_FILE = "../data/dataset/testTokenizerData/1.txt"
NORMALIZE_FILE = "../data/dataset/testTokenizerData/normalize.txt"
REGEX_REPLACE_FILE = "../data/dataset/testTokenizerData/regex_replace.txt"
REGEX_TOKENIZER_FILE = "../data/dataset/testTokenizerData/regex_tokenizer.txt"


def split_by_unicode_char(input_strs):
    """
    Split utf-8 strings to unicode characters
    """
    out = []
    for s in input_strs:
        out.append([c for c in s])
    return out


def test_unicode_char_tokenizer_default():
    """
    Test UnicodeCharTokenizer with default parameters
    """
    input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", "  ")
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeCharTokenizer()
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The output tokens are: {}".format(tokens))
    assert split_by_unicode_char(input_strs) == tokens


def test_unicode_char_tokenizer_with_offsets():
    """
    Test UnicodeCharTokenizer with with_offsets=True
    """
    input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", "  ")
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeCharTokenizer(with_offsets=True)
    dataset = dataset.map(operations=tokenizer, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    tokens = []
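    # The offsets below are byte positions in the UTF-8 encoded input: ASCII characters
    # occupy 1 byte, while each Chinese character (and the fullwidth '!') occupies 3 bytes.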
    expected_offsets_start = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],
                              [0, 3, 6, 9, 12, 15], [0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16], [0, 1]]
    expected_offsets_limit = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
                              [3, 6, 9, 12, 15, 18], [3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17], [1, 2]]
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The output tokens are: {}".format(tokens))
    assert split_by_unicode_char(input_strs) == tokens


def test_whitespace_tokenizer_default():
    """
    Test WhitespaceTokenizer with default parameters
    """
    whitespace_strs = [["Welcome", "to", "Beijing!"],
                       ["北京欢迎您!"],
                       ["我喜欢English!"],
                       [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.WhitespaceTokenizer()
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The output tokens are: {}".format(tokens))
    assert whitespace_strs == tokens


def test_whitespace_tokenizer_with_offsets():
    """
    Test WhitespaceTokenizer with with_offsets=True
    """
    whitespace_strs = [["Welcome", "to", "Beijing!"],
                       ["北京欢迎您!"],
                       ["我喜欢English!"],
                       [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.WhitespaceTokenizer(with_offsets=True)
    dataset = dataset.map(operations=tokenizer, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    tokens = []
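    # Offsets are byte spans of each whitespace-delimited token; the whitespace-only line
    # produces a single empty token, so its start and limit offsets are both 0.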
    expected_offsets_start = [[0, 8, 11], [0], [0], [0]]
    expected_offsets_limit = [[7, 10, 19], [18], [17], [0]]
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The output tokens are: {}".format(tokens))
    assert whitespace_strs == tokens


def test_unicode_script_tokenizer_default():
    """
    Test UnicodeScriptTokenizer when param keep_whitespace=False
    """
    unicode_script_strs = [["Welcome", "to", "Beijing", "!"],
                           ["北京欢迎您", "!"],
                           ["我喜欢", "English", "!"],
                           [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=False)
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The output tokens are: {}".format(tokens))
    assert unicode_script_strs == tokens


def test_unicode_script_tokenizer_default2():
    """
    Test UnicodeScriptTokenizer when param keep_whitespace=True
    """
    unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
                            ["北京欢迎您", "!"],
                            ["我喜欢", "English", "!"],
                            ["  "]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True)
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The output tokens are: {}".format(tokens))
    assert unicode_script_strs2 == tokens


def test_unicode_script_tokenizer_with_offsets():
    """
    Test UnicodeScriptTokenizer when param keep_whitespace=False and with_offsets=True
    """
    unicode_script_strs = [["Welcome", "to", "Beijing", "!"],
                           ["北京欢迎您", "!"],
                           ["我喜欢", "English", "!"],
                           [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=False, with_offsets=True)
    dataset = dataset.map(operations=tokenizer, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    tokens = []
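    # Tokens are split on Unicode script boundaries and offsets are byte positions,
    # e.g. "北京欢迎您" covers bytes [0, 15) and the fullwidth "!" covers bytes [15, 18).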
    expected_offsets_start = [[0, 8, 11, 18], [0, 15], [0, 9, 16], [0]]
    expected_offsets_limit = [[7, 10, 18, 19], [15, 18], [9, 16, 17], [0]]
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The output tokens are: {}".format(tokens))
    assert unicode_script_strs == tokens


def test_unicode_script_tokenizer_with_offsets2():
    """
    Test UnicodeScriptTokenizer when param keep_whitespace=True and with_offsets=True
    """
    unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
                            ["北京欢迎您", "!"],
                            ["我喜欢", "English", "!"],
                            ["  "]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
    dataset = dataset.map(operations=tokenizer, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    tokens = []
    expected_offsets_start = [[0, 7, 8, 10, 11, 18], [0, 15], [0, 9, 16], [0]]
    expected_offsets_limit = [[7, 8, 10, 11, 18, 19], [15, 18], [9, 16, 17], [2]]
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The output tokens are: {}".format(tokens))
    assert unicode_script_strs2 == tokens


def test_case_fold():
    """
    Test CaseFold
    """
    expect_strs = ["welcome to beijing!", "北京欢迎您!", "我喜欢english!", "  "]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    op = text.CaseFold()
    dataset = dataset.map(operations=op)
    lower_strs = []
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['text']).tolist()
        lower_strs.append(token)
    assert lower_strs == expect_strs


def test_normalize_utf8():
    """
    Test NormalizeUTF8
    """

    def normalize(normalize_form):
        dataset = ds.TextFileDataset(NORMALIZE_FILE, shuffle=False)
        normalize_op = text.NormalizeUTF8(normalize_form=normalize_form)
        dataset = dataset.map(operations=normalize_op)
        out_bytes = []
        out_texts = []
        for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
            out_bytes.append(i['text'])
            out_texts.append(text.to_str(i['text']).tolist())
        logger.info("The output bytes are: {}".format(out_bytes))
        logger.info("The output texts are: {}".format(out_texts))
        return out_bytes

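    # Expected bytes per normalization form: NFC/NFD keep compatibility characters such as
    # the 'ﬁ' ligature and superscript '⁵', while NFKC/NFKD map them to plain "fi" and "5";
    # NFD/NFKD additionally decompose precomposed characters into base + combining marks.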
    expect_normalize_data = [
        # NFC
        [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xe1\xba\x9b\xcc\xa3'],
        # NFKC
        [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'fi', b'25', b'\xe1\xb9\xa9'],
        # NFD
        [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xc5\xbf\xcc\xa3\xcc\x87'],
        # NFKD
        [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'fi', b'25', b's\xcc\xa3\xcc\x87']
    ]
    assert normalize(text.utils.NormalizeForm.NFC) == expect_normalize_data[0]
    assert normalize(text.utils.NormalizeForm.NFKC) == expect_normalize_data[1]
    assert normalize(text.utils.NormalizeForm.NFD) == expect_normalize_data[2]
    assert normalize(text.utils.NormalizeForm.NFKD) == expect_normalize_data[3]


def test_regex_replace():
    """
    Test RegexReplace
    """

    def regex_replace(first, last, expect_str, pattern, replace):
        dataset = ds.TextFileDataset(REGEX_REPLACE_FILE, shuffle=False)
        if first > 1:
            dataset = dataset.skip(first - 1)
        if last >= first:
            dataset = dataset.take(last - first + 1)
        replace_op = text.RegexReplace(pattern, replace)
        dataset = dataset.map(operations=replace_op)
        out_text = []
        for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
            token = text.to_str(i['text']).tolist()
            out_text.append(token)
        logger.info("Out: {}".format(out_text))
        logger.info("Exp: {}".format(expect_str))
        assert expect_str == out_text

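    # Each call below applies RegexReplace(pattern, replace) to lines [first, last] of the
    # input file and checks the result; the patterns use Unicode property classes
    # (\p{Ll}: lowercase letters, \p{Cc}/\p{Cf}: control/format characters, \s: whitespace).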
    regex_replace(1, 2, ['H____ W____', "L__'_ G_"], "\\p{Ll}", '_')
    regex_replace(3, 5, ['hello', 'world', '31:beijing'], "^(\\d:|b:)", "")
    regex_replace(6, 6, ["WelcometoChina!"], "\\s+", "")
    regex_replace(7, 8, ['我不想长大', 'WelcometoShenzhen!'], "\\p{Cc}|\\p{Cf}|\\s+", "")


def test_regex_tokenizer_default():
    """
    Test RegexTokenizer
    """

    def regex_tokenizer(first, last, expect_str, delim_pattern, keep_delim_pattern):
        dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
        if first > 1:
            dataset = dataset.skip(first - 1)
        if last >= first:
            dataset = dataset.take(last - first + 1)
        tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern)
        dataset = dataset.map(operations=tokenizer_op)
        out_text = []
        count = 0
        for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
            token = text.to_str(i['text']).tolist()
            np.testing.assert_array_equal(token, expect_str[count])
            count += 1
            out_text.append(token)
        logger.info("Out: {}".format(out_text))
        logger.info("Exp: {}".format(expect_str))

    regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], "\\s+", "")
    regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], "\\s+", "\\s+")
    regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], r"\p{Han}", r"\p{Han}")
    regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+")
    regex_tokenizer(3, 3, [['12', '36']], r"[\p{P}|\p{S}]+", "")
    regex_tokenizer(3, 3, [['¥+', '¥=?']], r"[\p{N}]+", "")


def test_regex_tokenizer_with_offsets():
    """
    Test RegexTokenizer with with_offsets=True
    """

    def regex_tokenizer(first, last, expect_str, expected_offsets_start, expected_offsets_limit, delim_pattern,
                        keep_delim_pattern):
        dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
        if first > 1:
            dataset = dataset.skip(first - 1)
        if last >= first:
            dataset = dataset.take(last - first + 1)
        tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True)
        dataset = dataset.map(operations=tokenizer_op, input_columns=['text'],
                              output_columns=['token', 'offsets_start', 'offsets_limit'],
                              column_order=['token', 'offsets_start', 'offsets_limit'])
        out_text = []
        count = 0
        for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
            token = text.to_str(i['token']).tolist()
            np.testing.assert_array_equal(token, expect_str[count])
            np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
            np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
            count += 1
            out_text.append(token)
        logger.info("Out: {}".format(out_text))
        logger.info("Exp: {}".format(expect_str))

    regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], [[0, 8, 11]], [[7, 10, 20]], "\\s+", "")
    regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], [[0, 7, 8, 10, 11]], [[7, 8, 10, 11, 20]],
                    "\\s+", "\\s+")
    regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], [[0, 3, 6, 9, 12, 15]],
                    [[3, 6, 9, 12, 15, 35]], r"\p{Han}", r"\p{Han}")
    regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], [[0, 2, 6, 8]], [[2, 6, 8, 13]],
                    r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+")
    regex_tokenizer(3, 3, [['12', '36']], [[0, 6]], [[2, 8]], r"[\p{P}|\p{S}]+", "")
    regex_tokenizer(3, 3, [['¥+', '¥=?']], [[2, 8]], [[6, 13]], r"[\p{N}]+", "")


if __name__ == '__main__':
    test_unicode_char_tokenizer_default()
    test_unicode_char_tokenizer_with_offsets()
    test_whitespace_tokenizer_default()
    test_whitespace_tokenizer_with_offsets()
    test_unicode_script_tokenizer_default()
    test_unicode_script_tokenizer_default2()
    test_unicode_script_tokenizer_with_offsets()
    test_unicode_script_tokenizer_with_offsets2()
    test_case_fold()
    test_normalize_utf8()
    test_regex_replace()
    test_regex_tokenizer_default()
    test_regex_tokenizer_with_offsets()