fix python tokenizer

pull/13533/head
YangLuo 4 years ago
parent 34b16e6a64
commit f99204b292

@ -533,7 +533,9 @@ class PythonTokenizer:
self.random = False
# Tokenize `in_array` by passing it to `self.tokenizer`.
# NOTE(review): indentation was lost in this rendering, so the nesting below is
# ambiguous; presumably the ndarray check and the bytes conversion both sit
# under the `not isinstance(in_array, str)` guard -- confirm against the real file.
def __call__(self, in_array):
# Guard on the input not being a plain Python str.
if not isinstance(in_array, str):
# Raise TypeError (reporting the actual type) when the input is not a NumPy array.
if not isinstance(in_array, np.ndarray):
raise TypeError("input should be a NumPy array. Got {}.".format(type(in_array)))
# Arrays whose dtype is np.bytes_ are converted with to_str before tokenizing.
if in_array.dtype.type is np.bytes_:
in_array = to_str(in_array)
# Hand the input to the wrapped tokenizer and return its result unchanged.
tokens = self.tokenizer(in_array)
return tokens

@ -216,7 +216,7 @@ def to_str(array, encoding='utf8'):
"""
if not isinstance(array, np.ndarray):
raise ValueError('input should be a NumPy array.')
raise TypeError('input should be a NumPy array.')
return np.char.decode(array, encoding)

@ -52,12 +52,17 @@ def test_python_tokenizer():
if not words:
return [""]
return words
txt = "Welcome to Beijing !"
txt = T.PythonTokenizer(my_tokenizer)(txt)
logger.info("Tokenize result: {}".format(txt))
txt1 = np.array("Welcome to Beijing !".encode())
txt1 = T.PythonTokenizer(my_tokenizer)(txt1)
logger.info("Tokenize result: {}".format(txt1))
txt2 = np.array("Welcome to Beijing !")
txt2 = T.PythonTokenizer(my_tokenizer)(txt2)
logger.info("Tokenize result: {}".format(txt2))
expected = ['Welcome', 'to', 'Beijing', '!']
np.testing.assert_equal(txt, expected)
np.testing.assert_equal(txt1, expected)
np.testing.assert_equal(txt2, expected)
if __name__ == '__main__':

Loading…
Cancel
Save