From 2457f52596e31296be391afae3b906d6d6d6fd7b Mon Sep 17 00:00:00 2001
From: alex-yuyue
Date: Wed, 10 Mar 2021 10:36:53 -0500
Subject: [PATCH] Fix CI warning and test team issues

Signed-off-by: alex-yuyue
---
 .../dataset/kernels/ir/image/bindings.cc         |  1 -
 .../bindings/dataset/text/kernels/ir/bindings.cc |  2 --
 mindspore/dataset/engine/samplers.py             | 16 ++++++++--------
 mindspore/dataset/transforms/c_transforms.py     |  8 +++++++-
 mindspore/dataset/vision/c_transforms.py         |  9 +++++----
 .../dataset/test_sentencepiece_tokenizer.py      |  9 +++++++++
 .../python/dataset/test_text_bert_tokenizer.py   | 16 +++++++++++++++-
 tests/ut/python/dataset/test_to_number_op.py     |  5 +++--
 8 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/kernels/ir/image/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/kernels/ir/image/bindings.cc
index e33d1b1d8d..01d2d77f0a 100644
--- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/kernels/ir/image/bindings.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/kernels/ir/image/bindings.cc
@@ -492,6 +492,5 @@ PYBIND_REGISTER(
                     return uniform_aug;
                   }));
                 }));
-
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc
index 5b5c082718..f3a17f1e6c 100644
--- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc
@@ -25,7 +25,6 @@
 
 namespace mindspore {
 namespace dataset {
-
 #ifdef ENABLE_ICU4C
 
 PYBIND_REGISTER(
@@ -262,6 +261,5 @@ PYBIND_REGISTER(SPieceTokenizerOutType, 0, ([](const py::module *m) {
                       .value("DE_SPIECE_TOKENIZER_OUTTYPE_KINT", SPieceTokenizerOutType::kInt)
                       .export_values();
                   }));
-
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/dataset/engine/samplers.py b/mindspore/dataset/engine/samplers.py
index b61f5cffbe..46a63099f9 100644
--- a/mindspore/dataset/engine/samplers.py
+++ b/mindspore/dataset/engine/samplers.py
@@ -605,7 +605,7 @@ class SubsetSampler(BuiltinSampler):
     Samples the elements from a sequence of indices.
 
     Args:
-        indices (list[int]): A sequence of indices.
+        indices (Any iterable Python object but string): A sequence of indices.
         num_samples (int, optional): Number of elements to sample (default=None, all elements).
 
     Examples:
@@ -633,6 +633,13 @@ class SubsetSampler(BuiltinSampler):
 
             return [sample_id for sample_id, _ in zip(sampler, range(number_of_samples))]
 
+        if num_samples is not None:
+            if not isinstance(num_samples, int):
+                raise TypeError("num_samples must be integer but was: {}.".format(num_samples))
+            if num_samples < 0 or num_samples > validator.INT64_MAX:
+                raise ValueError("num_samples exceeds the boundary between {} and {}(INT64_MAX)!"
+                                 .format(0, validator.INT64_MAX))
+
         if not isinstance(indices, str) and validator.is_iterable(indices):
             indices = _get_sample_ids_as_list(indices, num_samples)
         elif isinstance(indices, int):
@@ -645,13 +652,6 @@ class SubsetSampler(BuiltinSampler):
                 raise TypeError("SubsetSampler: Type of indices element must be int, "
                                 "but got list[{}]: {}, type: {}.".format(i, item, type(item)))
 
-        if num_samples is not None:
-            if not isinstance(num_samples, int):
-                raise TypeError("num_samples must be integer but was: {}.".format(num_samples))
-            if num_samples < 0 or num_samples > validator.INT64_MAX:
-                raise ValueError("num_samples exceeds the boundary between {} and {}(INT64_MAX)!"
-                                 .format(0, validator.INT64_MAX))
-
         self.indices = indices
         super().__init__(num_samples)
 
diff --git a/mindspore/dataset/transforms/c_transforms.py b/mindspore/dataset/transforms/c_transforms.py
index 8fe20363f4..20dd3fa583 100644
--- a/mindspore/dataset/transforms/c_transforms.py
+++ b/mindspore/dataset/transforms/c_transforms.py
@@ -31,7 +31,13 @@ class TensorOperation:
     Base class Tensor Ops
     """
     def __call__(self, *input_tensor_list):
-        tensor_row = [cde.Tensor(np.asarray(tensor)) for tensor in input_tensor_list]
+        tensor_row = []
+        for tensor in input_tensor_list:
+            try:
+                tensor_row.append(cde.Tensor(np.asarray(tensor)))
+            except RuntimeError:
+                raise TypeError("Invalid user input. Got {}: {}, cannot be converted into tensor." \
+                                .format(type(tensor), tensor))
         callable_op = cde.Execute(self.parse())
         output_tensor_list = callable_op(tensor_row)
         for i, element in enumerate(output_tensor_list):
diff --git a/mindspore/dataset/vision/c_transforms.py b/mindspore/dataset/vision/c_transforms.py
index aa8786e90e..a7a26ea095 100644
--- a/mindspore/dataset/vision/c_transforms.py
+++ b/mindspore/dataset/vision/c_transforms.py
@@ -1197,12 +1197,13 @@ class RandomSharpness(ImageTensorOperation):
 
 class RandomSolarize(ImageTensorOperation):
     """
-    Invert all pixel values with given range.
+    Randomly invert the pixel values of the input image within the given range.
 
     Args:
-        threshold (tuple, optional): Range of random solarize threshold. Threshold values should always be
-            in the range (0, 255), include at least one integer value in the given range and be in
-            (min, max) format. If min=max, then invert all pixel values above min(max) (default=(0, 255)).
+        threshold (tuple, optional): Range of random solarize threshold (default=(0, 255)).
+            Threshold values should always be in (min, max) format,
+            where min <= max and both min and max are integers in the range (0, 255).
+            If min=max, then invert all pixel values above min(max).
 
     Examples:
         >>> transforms_list = [c_vision.Decode(), c_vision.RandomSolarize(threshold=(10,100))]
diff --git a/tests/ut/python/dataset/test_sentencepiece_tokenizer.py b/tests/ut/python/dataset/test_sentencepiece_tokenizer.py
index d7705c8379..db805cbe93 100644
--- a/tests/ut/python/dataset/test_sentencepiece_tokenizer.py
+++ b/tests/ut/python/dataset/test_sentencepiece_tokenizer.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 import copy
+import numpy as np
 import mindspore.dataset.text as text
 import mindspore.dataset as ds
 from mindspore.dataset.text import SentencePieceModel, to_str, SPieceTokenizerOutType
@@ -21,6 +22,13 @@
 VOCAB_FILE = "../data/dataset/test_sentencepiece/botchan.txt"
 DATA_FILE = "../data/dataset/testTokenizerData/sentencepiece_tokenizer.txt"
 
 
+def test_sentence_piece_tokenizer_callable():
+    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
+    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
+    data = '123'
+    assert np.array_equal(tokenizer(data), ['▁', '12', '3'])
+
+
 def test_from_vocab_to_str_UNIGRAM():
     vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
@@ -160,6 +168,7 @@ def test_with_zip_concat():
 
 
 if __name__ == "__main__":
+    test_sentence_piece_tokenizer_callable()
     test_from_vocab_to_str_UNIGRAM()
     test_from_vocab_to_str_BPE()
     test_from_vocab_to_str_CHAR()
diff --git a/tests/ut/python/dataset/test_text_bert_tokenizer.py b/tests/ut/python/dataset/test_text_bert_tokenizer.py
index 51c4ab6ebb..2cc57da49b 100644
--- a/tests/ut/python/dataset/test_text_bert_tokenizer.py
+++ b/tests/ut/python/dataset/test_text_bert_tokenizer.py
@@ -16,6 +16,7 @@
 Testing BertTokenizer op in DE
 """
 import numpy as np
+import pytest
 import mindspore.dataset as ds
 from mindspore import log as logger
 import mindspore.dataset.text as text
@@ -127,7 +128,7 @@
         preserve_unused_token=True,
         vocab_list=vocab_bert
     ),
-    # test non-default parms
+    # test non-default params
    dict(
         first=8,
         last=8,
@@ -242,6 +243,19 @@ def test_bert_tokenizer_with_offsets():
         check_bert_tokenizer_with_offsets(**paras)
 
 
+def test_bert_tokenizer_callable_invalid_input():
+    """
+    Test BertTokenizer in eager mode with invalid input
+    """
+    data = {'张三': 18, '王五': 20}
+    vocab = text.Vocab.from_list(vocab_bert)
+    tokenizer_op = text.BertTokenizer(vocab=vocab)
+
+    with pytest.raises(TypeError) as info:
+        _ = tokenizer_op(data)
+    assert "Invalid user input. Got <class 'dict'>: {'张三': 18, '王五': 20}, cannot be converted into tensor." in str(info.value)
+
 if __name__ == '__main__':
+    test_bert_tokenizer_callable_invalid_input()
     test_bert_tokenizer_default()
     test_bert_tokenizer_with_offsets()
diff --git a/tests/ut/python/dataset/test_to_number_op.py b/tests/ut/python/dataset/test_to_number_op.py
index 4fa37e90e1..6a351cd589 100644
--- a/tests/ut/python/dataset/test_to_number_op.py
+++ b/tests/ut/python/dataset/test_to_number_op.py
@@ -52,9 +52,10 @@ def test_to_number_eager():
 
     # test input invalid tensor
     invalid_input = [["1", "2", "3"], ["4", "5"]]
-    with pytest.raises(RuntimeError) as info:
+    with pytest.raises(TypeError) as info:
         _ = op(invalid_input)
-    assert "Invalid data type." in str(info.value)
+    assert "Invalid user input. Got <class 'list'>: [['1', '2', '3'], ['4', '5']], cannot be converted into tensor" in \
+        str(info.value)
 
 
 def test_to_number_typical_case_integral():
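
The TensorOperation.__call__ hunk above is the change the updated tests exercise: an
input that cannot be converted into a tensor now surfaces as a TypeError in eager mode
instead of a bare RuntimeError from cde.Tensor. A minimal sketch of the resulting
behavior, assuming a build with this patch applied and ICU-enabled text kernels
(BertTokenizer requires them); the vocab list below is illustrative only, not taken
from the test data:

    import mindspore.dataset.text as text

    # Illustrative vocab covering every word in the sample sentence.
    vocab = text.Vocab.from_list(["my", "favorite", "book"])
    tokenizer_op = text.BertTokenizer(vocab=vocab)

    # A string converts cleanly to a tensor, so the eager call succeeds.
    print(tokenizer_op("my favorite book"))  # -> ['my' 'favorite' 'book']

    # A dict cannot be converted, so the patched __call__ raises TypeError
    # (previously a RuntimeError escaped from cde.Tensor).
    try:
        tokenizer_op({"key": 1})
    except TypeError as err:
        print(err)  # Invalid user input. Got <class 'dict'>: {'key': 1}, ...

The same TypeError path applies to any C transform invoked eagerly, since they all go
through TensorOperation.__call__.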