From 1f4251a440bafdc7781ce254ee5e3905b6a9b6ed Mon Sep 17 00:00:00 2001 From: tony_liu2 Date: Fri, 17 Jul 2020 14:06:14 -0400 Subject: [PATCH] add try to minddataset python tests to remove files on fail delete unrelated file change finally to except else fix spacing alignment fix indentation fix indentation fix indentation add fix to new test case add if main for pytest fix spacing --- tests/ut/python/dataset/test_minddataset.py | 2032 +++++++++-------- .../dataset/test_minddataset_exception.py | 86 +- ...st_minddataset_multi_images_and_ndarray.py | 108 +- .../python/dataset/test_minddataset_padded.py | 114 +- .../dataset/test_minddataset_sampler.py | 63 +- 5 files changed, 1313 insertions(+), 1090 deletions(-) diff --git a/tests/ut/python/dataset/test_minddataset.py b/tests/ut/python/dataset/test_minddataset.py index 5791ea9618..24e6595233 100644 --- a/tests/ut/python/dataset/test_minddataset.py +++ b/tests/ut/python/dataset/test_minddataset.py @@ -46,58 +46,71 @@ def add_and_remove_cv_file(): """add/remove cv file""" paths = ["{}{}".format(CV_FILE_NAME, str(x).rjust(1, '0')) for x in range(FILES_NUM)] - for x in paths: - if os.path.exists("{}".format(x)): + try: + for x in paths: + if os.path.exists("{}".format(x)): + os.remove("{}".format(x)) + if os.path.exists("{}.db".format(x)): + os.remove("{}.db".format(x)) + writer = FileWriter(CV_FILE_NAME, FILES_NUM) + data = get_data(CV_DIR_NAME) + cv_schema_json = {"id": {"type": "int32"}, + "file_name": {"type": "string"}, + "label": {"type": "int32"}, + "data": {"type": "bytes"}} + writer.add_schema(cv_schema_json, "img_schema") + writer.add_index(["file_name", "label"]) + writer.write_raw_data(data) + writer.commit() + yield "yield_cv_data" + except Exception as error: + for x in paths: + os.remove("{}".format(x)) + os.remove("{}.db".format(x)) + raise error + else: + for x in paths: os.remove("{}".format(x)) - if os.path.exists("{}.db".format(x)): os.remove("{}.db".format(x)) - writer = FileWriter(CV_FILE_NAME, FILES_NUM) - data = get_data(CV_DIR_NAME) - cv_schema_json = {"id": {"type": "int32"}, - "file_name": {"type": "string"}, - "label": {"type": "int32"}, - "data": {"type": "bytes"}} - writer.add_schema(cv_schema_json, "img_schema") - writer.add_index(["file_name", "label"]) - writer.write_raw_data(data) - writer.commit() - yield "yield_cv_data" - for x in paths: - os.remove("{}".format(x)) - os.remove("{}.db".format(x)) - @pytest.fixture def add_and_remove_nlp_file(): """add/remove nlp file""" paths = ["{}{}".format(NLP_FILE_NAME, str(x).rjust(1, '0')) for x in range(FILES_NUM)] - for x in paths: - if os.path.exists("{}".format(x)): + try: + for x in paths: + if os.path.exists("{}".format(x)): + os.remove("{}".format(x)) + if os.path.exists("{}.db".format(x)): + os.remove("{}.db".format(x)) + writer = FileWriter(NLP_FILE_NAME, FILES_NUM) + data = [x for x in get_nlp_data(NLP_FILE_POS, NLP_FILE_VOCAB, 10)] + nlp_schema_json = {"id": {"type": "string"}, "label": {"type": "int32"}, + "rating": {"type": "float32"}, + "input_ids": {"type": "int64", + "shape": [-1]}, + "input_mask": {"type": "int64", + "shape": [1, -1]}, + "segment_ids": {"type": "int64", + "shape": [2, -1]} + } + writer.set_header_size(1 << 14) + writer.set_page_size(1 << 15) + writer.add_schema(nlp_schema_json, "nlp_schema") + writer.add_index(["id", "rating"]) + writer.write_raw_data(data) + writer.commit() + yield "yield_nlp_data" + except Exception as error: + for x in paths: + os.remove("{}".format(x)) + os.remove("{}.db".format(x)) + raise error + else: + for x in 
paths: os.remove("{}".format(x)) - if os.path.exists("{}.db".format(x)): os.remove("{}.db".format(x)) - writer = FileWriter(NLP_FILE_NAME, FILES_NUM) - data = [x for x in get_nlp_data(NLP_FILE_POS, NLP_FILE_VOCAB, 10)] - nlp_schema_json = {"id": {"type": "string"}, "label": {"type": "int32"}, - "rating": {"type": "float32"}, - "input_ids": {"type": "int64", - "shape": [-1]}, - "input_mask": {"type": "int64", - "shape": [1, -1]}, - "segment_ids": {"type": "int64", - "shape": [2, -1]} - } - writer.set_header_size(1 << 14) - writer.set_page_size(1 << 15) - writer.add_schema(nlp_schema_json, "nlp_schema") - writer.add_index(["id", "rating"]) - writer.write_raw_data(data) - writer.commit() - yield "yield_nlp_data" - for x in paths: - os.remove("{}".format(x)) - os.remove("{}.db".format(x)) @pytest.fixture @@ -105,44 +118,51 @@ def add_and_remove_nlp_compress_file(): """add/remove nlp file""" paths = ["{}{}".format(NLP_FILE_NAME, str(x).rjust(1, '0')) for x in range(FILES_NUM)] - for x in paths: - if os.path.exists("{}".format(x)): + try: + for x in paths: + if os.path.exists("{}".format(x)): + os.remove("{}".format(x)) + if os.path.exists("{}.db".format(x)): + os.remove("{}.db".format(x)) + writer = FileWriter(NLP_FILE_NAME, FILES_NUM) + data = [] + for row_id in range(16): + data.append({ + "label": row_id, + "array_a": np.reshape(np.array([0, 1, -1, 127, -128, 128, -129, + 255, 256, -32768, 32767, -32769, 32768, -2147483648, + 2147483647], dtype=np.int32), [-1]), + "array_b": np.reshape(np.array([0, 1, -1, 127, -128, 128, -129, 255, + 256, -32768, 32767, -32769, 32768, + -2147483648, 2147483647, -2147483649, 2147483649, + -922337036854775808, 9223372036854775807]), [1, -1]), + "array_c": str.encode("nlp data"), + "array_d": np.reshape(np.array([[-10, -127], [10, 127]]), [2, -1]) + }) + nlp_schema_json = {"label": {"type": "int32"}, + "array_a": {"type": "int32", + "shape": [-1]}, + "array_b": {"type": "int64", + "shape": [1, -1]}, + "array_c": {"type": "bytes"}, + "array_d": {"type": "int64", + "shape": [2, -1]} + } + writer.set_header_size(1 << 14) + writer.set_page_size(1 << 15) + writer.add_schema(nlp_schema_json, "nlp_schema") + writer.write_raw_data(data) + writer.commit() + yield "yield_nlp_data" + except Exception as error: + for x in paths: + os.remove("{}".format(x)) + os.remove("{}.db".format(x)) + raise error + else: + for x in paths: os.remove("{}".format(x)) - if os.path.exists("{}.db".format(x)): os.remove("{}.db".format(x)) - writer = FileWriter(NLP_FILE_NAME, FILES_NUM) - data = [] - for row_id in range(16): - data.append({ - "label": row_id, - "array_a": np.reshape(np.array([0, 1, -1, 127, -128, 128, -129, - 255, 256, -32768, 32767, -32769, 32768, -2147483648, - 2147483647], dtype=np.int32), [-1]), - "array_b": np.reshape(np.array([0, 1, -1, 127, -128, 128, -129, 255, - 256, -32768, 32767, -32769, 32768, - -2147483648, 2147483647, -2147483649, 2147483649, - -922337036854775808, 9223372036854775807]), [1, -1]), - "array_c": str.encode("nlp data"), - "array_d": np.reshape(np.array([[-10, -127], [10, 127]]), [2, -1]) - }) - nlp_schema_json = {"label": {"type": "int32"}, - "array_a": {"type": "int32", - "shape": [-1]}, - "array_b": {"type": "int64", - "shape": [1, -1]}, - "array_c": {"type": "bytes"}, - "array_d": {"type": "int64", - "shape": [2, -1]} - } - writer.set_header_size(1 << 14) - writer.set_page_size(1 << 15) - writer.add_schema(nlp_schema_json, "nlp_schema") - writer.write_raw_data(data) - writer.commit() - yield "yield_nlp_data" - for x in paths: - 
os.remove("{}".format(x)) - os.remove("{}.db".format(x)) def test_nlp_compress_data(add_and_remove_nlp_compress_file): @@ -199,22 +219,29 @@ def test_cv_minddataset_writer_tutorial(): """tutorial for cv dataset writer.""" paths = ["{}{}".format(CV_FILE_NAME, str(x).rjust(1, '0')) for x in range(FILES_NUM)] - for x in paths: - if os.path.exists("{}".format(x)): + try: + for x in paths: + if os.path.exists("{}".format(x)): + os.remove("{}".format(x)) + if os.path.exists("{}.db".format(x)): + os.remove("{}.db".format(x)) + writer = FileWriter(CV_FILE_NAME, FILES_NUM) + data = get_data(CV_DIR_NAME) + cv_schema_json = {"file_name": {"type": "string"}, "label": {"type": "int32"}, + "data": {"type": "bytes"}} + writer.add_schema(cv_schema_json, "img_schema") + writer.add_index(["file_name", "label"]) + writer.write_raw_data(data) + writer.commit() + except Exception as error: + for x in paths: + os.remove("{}".format(x)) + os.remove("{}.db".format(x)) + raise error + else: + for x in paths: os.remove("{}".format(x)) - if os.path.exists("{}.db".format(x)): os.remove("{}.db".format(x)) - writer = FileWriter(CV_FILE_NAME, FILES_NUM) - data = get_data(CV_DIR_NAME) - cv_schema_json = {"file_name": {"type": "string"}, "label": {"type": "int32"}, - "data": {"type": "bytes"}} - writer.add_schema(cv_schema_json, "img_schema") - writer.add_index(["file_name", "label"]) - writer.write_raw_data(data) - writer.commit() - for x in paths: - os.remove("{}".format(x)) - os.remove("{}.db".format(x)) def test_cv_minddataset_partition_tutorial(add_and_remove_cv_file): @@ -654,106 +681,124 @@ def test_cv_minddataset_reader_one_partition(add_and_remove_cv_file): def test_cv_minddataset_reader_two_dataset(add_and_remove_cv_file): """tutorial for cv minderdataset.""" - if os.path.exists(CV1_FILE_NAME): - os.remove(CV1_FILE_NAME) - if os.path.exists("{}.db".format(CV1_FILE_NAME)): - os.remove("{}.db".format(CV1_FILE_NAME)) - if os.path.exists(CV2_FILE_NAME): - os.remove(CV2_FILE_NAME) - if os.path.exists("{}.db".format(CV2_FILE_NAME)): - os.remove("{}.db".format(CV2_FILE_NAME)) - writer = FileWriter(CV1_FILE_NAME, 1) - data = get_data(CV_DIR_NAME) - cv_schema_json = {"id": {"type": "int32"}, - "file_name": {"type": "string"}, - "label": {"type": "int32"}, - "data": {"type": "bytes"}} - writer.add_schema(cv_schema_json, "CV1_schema") - writer.add_index(["file_name", "label"]) - writer.write_raw_data(data) - writer.commit() - - writer = FileWriter(CV2_FILE_NAME, 1) - data = get_data(CV_DIR_NAME) - cv_schema_json = {"id": {"type": "int32"}, - "file_name": {"type": "string"}, - "label": {"type": "int32"}, - "data": {"type": "bytes"}} - writer.add_schema(cv_schema_json, "CV2_schema") - writer.add_index(["file_name", "label"]) - writer.write_raw_data(data) - writer.commit() - columns_list = ["data", "file_name", "label"] - num_readers = 4 - data_set = ds.MindDataset([CV_FILE_NAME + str(x) for x in range(FILES_NUM)] + [CV1_FILE_NAME, CV2_FILE_NAME], - columns_list, num_readers) - assert data_set.get_dataset_size() == 30 - num_iter = 0 - for item in data_set.create_dict_iterator(): - logger.info( - "-------------- cv reader basic: {} ------------------------".format(num_iter)) - logger.info( - "-------------- len(item[data]): {} ------------------------".format(len(item["data"]))) - logger.info( - "-------------- item[data]: {} -----------------------------".format(item["data"])) - logger.info( - "-------------- item[file_name]: {} ------------------------".format(item["file_name"])) - logger.info( - "-------------- item[label]: 
{} ----------------------------".format(item["label"])) - num_iter += 1 - assert num_iter == 30 - if os.path.exists(CV1_FILE_NAME): - os.remove(CV1_FILE_NAME) - if os.path.exists("{}.db".format(CV1_FILE_NAME)): - os.remove("{}.db".format(CV1_FILE_NAME)) - if os.path.exists(CV2_FILE_NAME): - os.remove(CV2_FILE_NAME) - if os.path.exists("{}.db".format(CV2_FILE_NAME)): - os.remove("{}.db".format(CV2_FILE_NAME)) - + try: + if os.path.exists(CV1_FILE_NAME): + os.remove(CV1_FILE_NAME) + if os.path.exists("{}.db".format(CV1_FILE_NAME)): + os.remove("{}.db".format(CV1_FILE_NAME)) + if os.path.exists(CV2_FILE_NAME): + os.remove(CV2_FILE_NAME) + if os.path.exists("{}.db".format(CV2_FILE_NAME)): + os.remove("{}.db".format(CV2_FILE_NAME)) + writer = FileWriter(CV1_FILE_NAME, 1) + data = get_data(CV_DIR_NAME) + cv_schema_json = {"id": {"type": "int32"}, + "file_name": {"type": "string"}, + "label": {"type": "int32"}, + "data": {"type": "bytes"}} + writer.add_schema(cv_schema_json, "CV1_schema") + writer.add_index(["file_name", "label"]) + writer.write_raw_data(data) + writer.commit() + + writer = FileWriter(CV2_FILE_NAME, 1) + data = get_data(CV_DIR_NAME) + cv_schema_json = {"id": {"type": "int32"}, + "file_name": {"type": "string"}, + "label": {"type": "int32"}, + "data": {"type": "bytes"}} + writer.add_schema(cv_schema_json, "CV2_schema") + writer.add_index(["file_name", "label"]) + writer.write_raw_data(data) + writer.commit() + columns_list = ["data", "file_name", "label"] + num_readers = 4 + data_set = ds.MindDataset([CV_FILE_NAME + str(x) for x in range(FILES_NUM)] + [CV1_FILE_NAME, CV2_FILE_NAME], + columns_list, num_readers) + assert data_set.get_dataset_size() == 30 + num_iter = 0 + for item in data_set.create_dict_iterator(): + logger.info( + "-------------- cv reader basic: {} ------------------------".format(num_iter)) + logger.info( + "-------------- len(item[data]): {} ------------------------".format(len(item["data"]))) + logger.info( + "-------------- item[data]: {} -----------------------------".format(item["data"])) + logger.info( + "-------------- item[file_name]: {} ------------------------".format(item["file_name"])) + logger.info( + "-------------- item[label]: {} ----------------------------".format(item["label"])) + num_iter += 1 + assert num_iter == 30 + except Exception as error: + if os.path.exists(CV1_FILE_NAME): + os.remove(CV1_FILE_NAME) + if os.path.exists("{}.db".format(CV1_FILE_NAME)): + os.remove("{}.db".format(CV1_FILE_NAME)) + if os.path.exists(CV2_FILE_NAME): + os.remove(CV2_FILE_NAME) + if os.path.exists("{}.db".format(CV2_FILE_NAME)): + os.remove("{}.db".format(CV2_FILE_NAME)) + raise error + else: + if os.path.exists(CV1_FILE_NAME): + os.remove(CV1_FILE_NAME) + if os.path.exists("{}.db".format(CV1_FILE_NAME)): + os.remove("{}.db".format(CV1_FILE_NAME)) + if os.path.exists(CV2_FILE_NAME): + os.remove(CV2_FILE_NAME) + if os.path.exists("{}.db".format(CV2_FILE_NAME)): + os.remove("{}.db".format(CV2_FILE_NAME)) def test_cv_minddataset_reader_two_dataset_partition(add_and_remove_cv_file): paths = ["{}{}".format(CV1_FILE_NAME, str(x).rjust(1, '0')) for x in range(FILES_NUM)] - for x in paths: - if os.path.exists("{}".format(x)): + try: + for x in paths: + if os.path.exists("{}".format(x)): + os.remove("{}".format(x)) + if os.path.exists("{}.db".format(x)): + os.remove("{}.db".format(x)) + writer = FileWriter(CV1_FILE_NAME, FILES_NUM) + data = get_data(CV_DIR_NAME) + cv_schema_json = {"id": {"type": "int32"}, + "file_name": {"type": "string"}, + "label": {"type": 
"int32"}, + "data": {"type": "bytes"}} + writer.add_schema(cv_schema_json, "CV1_schema") + writer.add_index(["file_name", "label"]) + writer.write_raw_data(data) + writer.commit() + + columns_list = ["data", "file_name", "label"] + num_readers = 4 + data_set = ds.MindDataset([CV_FILE_NAME + str(x) for x in range(2)] + + [CV1_FILE_NAME + str(x) for x in range(2, 4)], + columns_list, num_readers) + assert data_set.get_dataset_size() < 20 + num_iter = 0 + for item in data_set.create_dict_iterator(): + logger.info( + "-------------- cv reader basic: {} ------------------------".format(num_iter)) + logger.info( + "-------------- len(item[data]): {} ------------------------".format(len(item["data"]))) + logger.info( + "-------------- item[data]: {} -----------------------------".format(item["data"])) + logger.info( + "-------------- item[file_name]: {} ------------------------".format(item["file_name"])) + logger.info( + "-------------- item[label]: {} ----------------------------".format(item["label"])) + num_iter += 1 + assert num_iter < 20 + except Exception as error: + for x in paths: + os.remove("{}".format(x)) + os.remove("{}.db".format(x)) + raise error + else: + for x in paths: os.remove("{}".format(x)) - if os.path.exists("{}.db".format(x)): os.remove("{}.db".format(x)) - writer = FileWriter(CV1_FILE_NAME, FILES_NUM) - data = get_data(CV_DIR_NAME) - cv_schema_json = {"id": {"type": "int32"}, - "file_name": {"type": "string"}, - "label": {"type": "int32"}, - "data": {"type": "bytes"}} - writer.add_schema(cv_schema_json, "CV1_schema") - writer.add_index(["file_name", "label"]) - writer.write_raw_data(data) - writer.commit() - - columns_list = ["data", "file_name", "label"] - num_readers = 4 - data_set = ds.MindDataset([CV_FILE_NAME + str(x) for x in range(2)] + [CV1_FILE_NAME + str(x) for x in range(2, 4)], - columns_list, num_readers) - assert data_set.get_dataset_size() < 20 - num_iter = 0 - for item in data_set.create_dict_iterator(): - logger.info( - "-------------- cv reader basic: {} ------------------------".format(num_iter)) - logger.info( - "-------------- len(item[data]): {} ------------------------".format(len(item["data"]))) - logger.info( - "-------------- item[data]: {} -----------------------------".format(item["data"])) - logger.info( - "-------------- item[file_name]: {} ------------------------".format(item["file_name"])) - logger.info( - "-------------- item[label]: {} ----------------------------".format(item["label"])) - num_iter += 1 - assert num_iter < 20 - for x in paths: - os.remove("{}".format(x)) - os.remove("{}.db".format(x)) - def test_cv_minddataset_reader_basic_tutorial(add_and_remove_cv_file): """tutorial for cv minderdataset.""" @@ -1086,809 +1131,870 @@ def inputs(vectors, maxlen=50): def test_write_with_multi_bytes_and_array_and_read_by_MindDataset(): mindrecord_file_name = "test.mindrecord" - if os.path.exists("{}".format(mindrecord_file_name)): + try: + if os.path.exists("{}".format(mindrecord_file_name)): + os.remove("{}".format(mindrecord_file_name)) + if os.path.exists("{}.db".format(mindrecord_file_name)): + os.remove("{}.db".format(mindrecord_file_name)) + data = [{"file_name": "001.jpg", "label": 4, + "image1": bytes("image1 bytes abc", encoding='UTF-8'), + "image2": bytes("image1 bytes def", encoding='UTF-8'), + "source_sos_ids": np.array([1, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "image3": bytes("image1 bytes ghi", encoding='UTF-8'), + "image4": bytes("image1 bytes jkl", 
encoding='UTF-8'), + "image5": bytes("image1 bytes mno", encoding='UTF-8'), + "target_sos_ids": np.array([28, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([33, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([39, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([48, 49, 50, 51], dtype=np.int64)}, + {"file_name": "002.jpg", "label": 5, + "image1": bytes("image2 bytes abc", encoding='UTF-8'), + "image2": bytes("image2 bytes def", encoding='UTF-8'), + "image3": bytes("image2 bytes ghi", encoding='UTF-8'), + "image4": bytes("image2 bytes jkl", encoding='UTF-8'), + "image5": bytes("image2 bytes mno", encoding='UTF-8'), + "source_sos_ids": np.array([11, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([16, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "target_sos_ids": np.array([128, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([133, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([139, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([148, 49, 50, 51], dtype=np.int64)}, + {"file_name": "003.jpg", "label": 6, + "source_sos_ids": np.array([21, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "target_sos_ids": np.array([228, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([233, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([239, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "image1": bytes("image3 bytes abc", encoding='UTF-8'), + "image2": bytes("image3 bytes def", encoding='UTF-8'), + "image3": bytes("image3 bytes ghi", encoding='UTF-8'), + "image4": bytes("image3 bytes jkl", encoding='UTF-8'), + "image5": bytes("image3 bytes mno", encoding='UTF-8'), + "target_eos_mask": np.array([248, 49, 50, 51], dtype=np.int64)}, + {"file_name": "004.jpg", "label": 7, + "source_sos_ids": np.array([31, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "image1": bytes("image4 bytes abc", encoding='UTF-8'), + "image2": bytes("image4 bytes def", encoding='UTF-8'), + "image3": bytes("image4 bytes ghi", encoding='UTF-8'), + "image4": bytes("image4 bytes jkl", encoding='UTF-8'), + "image5": bytes("image4 bytes mno", encoding='UTF-8'), + "target_sos_ids": np.array([328, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([333, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([339, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([348, 49, 50, 51], dtype=np.int64)}, + {"file_name": "005.jpg", "label": 8, + "source_sos_ids": np.array([41, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "target_sos_ids": np.array([428, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([433, 34, 35, 36, 37, 38], dtype=np.int64), + "image1": bytes("image5 bytes abc", encoding='UTF-8'), + "image2": bytes("image5 bytes def", encoding='UTF-8'), + "image3": bytes("image5 bytes ghi", encoding='UTF-8'), + "image4": bytes("image5 bytes jkl", encoding='UTF-8'), + "image5": bytes("image5 bytes mno", encoding='UTF-8'), + "target_eos_ids": np.array([439, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([448, 49, 50, 51], dtype=np.int64)}, + {"file_name": "006.jpg", "label": 9, + "source_sos_ids": np.array([51, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([56, 7, 8, 9, 10, 11, 12], 
dtype=np.int64), + "target_sos_ids": np.array([528, 29, 30, 31, 32], dtype=np.int64), + "image1": bytes("image6 bytes abc", encoding='UTF-8'), + "image2": bytes("image6 bytes def", encoding='UTF-8'), + "image3": bytes("image6 bytes ghi", encoding='UTF-8'), + "image4": bytes("image6 bytes jkl", encoding='UTF-8'), + "image5": bytes("image6 bytes mno", encoding='UTF-8'), + "target_sos_mask": np.array([533, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([539, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([548, 49, 50, 51], dtype=np.int64)} + ] + + writer = FileWriter(mindrecord_file_name) + schema = {"file_name": {"type": "string"}, + "image1": {"type": "bytes"}, + "image2": {"type": "bytes"}, + "source_sos_ids": {"type": "int64", "shape": [-1]}, + "source_sos_mask": {"type": "int64", "shape": [-1]}, + "image3": {"type": "bytes"}, + "image4": {"type": "bytes"}, + "image5": {"type": "bytes"}, + "target_sos_ids": {"type": "int64", "shape": [-1]}, + "target_sos_mask": {"type": "int64", "shape": [-1]}, + "target_eos_ids": {"type": "int64", "shape": [-1]}, + "target_eos_mask": {"type": "int64", "shape": [-1]}, + "label": {"type": "int32"}} + writer.add_schema(schema, "data is so cool") + writer.write_raw_data(data) + writer.commit() + + # change data value to list + data_value_to_list = [] + for item in data: + new_data = {} + new_data['file_name'] = np.asarray(item["file_name"], dtype='S') + new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32) + new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8) + new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8) + new_data['image3'] = np.asarray(list(item["image3"]), dtype=np.uint8) + new_data['image4'] = np.asarray(list(item["image4"]), dtype=np.uint8) + new_data['image5'] = np.asarray(list(item["image5"]), dtype=np.uint8) + new_data['source_sos_ids'] = item["source_sos_ids"] + new_data['source_sos_mask'] = item["source_sos_mask"] + new_data['target_sos_ids'] = item["target_sos_ids"] + new_data['target_sos_mask'] = item["target_sos_mask"] + new_data['target_eos_ids'] = item["target_eos_ids"] + new_data['target_eos_mask'] = item["target_eos_mask"] + data_value_to_list.append(new_data) + + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 13 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["source_sos_ids", + "source_sos_mask", "target_sos_ids"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 3 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == data[num_iter][field]).all() + else: + assert item[field] == data[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 1 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["image2", "source_sos_mask", "image3", "target_sos_ids"], + num_parallel_workers=num_readers, + shuffle=False) + assert 
data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 4 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 3 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["target_sos_ids", + "image4", "source_sos_ids"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 3 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 3 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["target_sos_ids", "image5", + "image4", "image3", "source_sos_ids"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 5 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 1 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["target_eos_mask", "image5", + "image2", "source_sos_mask", "label"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 5 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["label", "target_eos_mask", "image1", "target_eos_ids", + "source_sos_mask", "image2", "image4", "image3", + "source_sos_ids", "image5", "file_name"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 11 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + except Exception as error: + os.remove("{}".format(mindrecord_file_name)) + os.remove("{}.db".format(mindrecord_file_name)) + raise error + else: os.remove("{}".format(mindrecord_file_name)) - if os.path.exists("{}.db".format(mindrecord_file_name)): os.remove("{}.db".format(mindrecord_file_name)) - data = [{"file_name": "001.jpg", "label": 4, - "image1": bytes("image1 bytes abc", encoding='UTF-8'), - "image2": bytes("image1 bytes def", encoding='UTF-8'), - "source_sos_ids": np.array([1, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "image3": bytes("image1 bytes ghi", encoding='UTF-8'), - "image4": bytes("image1 bytes jkl", encoding='UTF-8'), - "image5": bytes("image1 bytes mno", encoding='UTF-8'), - 
"target_sos_ids": np.array([28, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([33, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([39, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([48, 49, 50, 51], dtype=np.int64)}, - {"file_name": "002.jpg", "label": 5, - "image1": bytes("image2 bytes abc", encoding='UTF-8'), - "image2": bytes("image2 bytes def", encoding='UTF-8'), - "image3": bytes("image2 bytes ghi", encoding='UTF-8'), - "image4": bytes("image2 bytes jkl", encoding='UTF-8'), - "image5": bytes("image2 bytes mno", encoding='UTF-8'), - "source_sos_ids": np.array([11, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([16, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "target_sos_ids": np.array([128, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([133, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([139, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([148, 49, 50, 51], dtype=np.int64)}, - {"file_name": "003.jpg", "label": 6, - "source_sos_ids": np.array([21, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "target_sos_ids": np.array([228, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([233, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([239, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "image1": bytes("image3 bytes abc", encoding='UTF-8'), - "image2": bytes("image3 bytes def", encoding='UTF-8'), - "image3": bytes("image3 bytes ghi", encoding='UTF-8'), - "image4": bytes("image3 bytes jkl", encoding='UTF-8'), - "image5": bytes("image3 bytes mno", encoding='UTF-8'), - "target_eos_mask": np.array([248, 49, 50, 51], dtype=np.int64)}, - {"file_name": "004.jpg", "label": 7, - "source_sos_ids": np.array([31, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "image1": bytes("image4 bytes abc", encoding='UTF-8'), - "image2": bytes("image4 bytes def", encoding='UTF-8'), - "image3": bytes("image4 bytes ghi", encoding='UTF-8'), - "image4": bytes("image4 bytes jkl", encoding='UTF-8'), - "image5": bytes("image4 bytes mno", encoding='UTF-8'), - "target_sos_ids": np.array([328, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([333, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([339, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([348, 49, 50, 51], dtype=np.int64)}, - {"file_name": "005.jpg", "label": 8, - "source_sos_ids": np.array([41, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "target_sos_ids": np.array([428, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([433, 34, 35, 36, 37, 38], dtype=np.int64), - "image1": bytes("image5 bytes abc", encoding='UTF-8'), - "image2": bytes("image5 bytes def", encoding='UTF-8'), - "image3": bytes("image5 bytes ghi", encoding='UTF-8'), - "image4": bytes("image5 bytes jkl", encoding='UTF-8'), - "image5": bytes("image5 bytes mno", encoding='UTF-8'), - "target_eos_ids": np.array([439, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([448, 49, 50, 51], dtype=np.int64)}, - {"file_name": "006.jpg", "label": 9, - "source_sos_ids": np.array([51, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([56, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "target_sos_ids": np.array([528, 29, 30, 31, 32], 
dtype=np.int64), - "image1": bytes("image6 bytes abc", encoding='UTF-8'), - "image2": bytes("image6 bytes def", encoding='UTF-8'), - "image3": bytes("image6 bytes ghi", encoding='UTF-8'), - "image4": bytes("image6 bytes jkl", encoding='UTF-8'), - "image5": bytes("image6 bytes mno", encoding='UTF-8'), - "target_sos_mask": np.array([533, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([539, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([548, 49, 50, 51], dtype=np.int64)} - ] - - writer = FileWriter(mindrecord_file_name) - schema = {"file_name": {"type": "string"}, - "image1": {"type": "bytes"}, - "image2": {"type": "bytes"}, - "source_sos_ids": {"type": "int64", "shape": [-1]}, - "source_sos_mask": {"type": "int64", "shape": [-1]}, - "image3": {"type": "bytes"}, - "image4": {"type": "bytes"}, - "image5": {"type": "bytes"}, - "target_sos_ids": {"type": "int64", "shape": [-1]}, - "target_sos_mask": {"type": "int64", "shape": [-1]}, - "target_eos_ids": {"type": "int64", "shape": [-1]}, - "target_eos_mask": {"type": "int64", "shape": [-1]}, - "label": {"type": "int32"}} - writer.add_schema(schema, "data is so cool") - writer.write_raw_data(data) - writer.commit() - - # change data value to list - data_value_to_list = [] - for item in data: - new_data = {} - new_data['file_name'] = np.asarray(item["file_name"], dtype='S') - new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32) - new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8) - new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8) - new_data['image3'] = np.asarray(list(item["image3"]), dtype=np.uint8) - new_data['image4'] = np.asarray(list(item["image4"]), dtype=np.uint8) - new_data['image5'] = np.asarray(list(item["image5"]), dtype=np.uint8) - new_data['source_sos_ids'] = item["source_sos_ids"] - new_data['source_sos_mask'] = item["source_sos_mask"] - new_data['target_sos_ids'] = item["target_sos_ids"] - new_data['target_sos_mask'] = item["target_sos_mask"] - new_data['target_eos_ids'] = item["target_eos_ids"] - new_data['target_eos_mask'] = item["target_eos_mask"] - data_value_to_list.append(new_data) - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 13 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["source_sos_ids", - "source_sos_mask", "target_sos_ids"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 3 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == data[num_iter][field]).all() - else: - assert item[field] == data[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 1 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=[ - "image2", "source_sos_mask", "image3", "target_sos_ids"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in 
data_set.create_dict_iterator(): - assert len(item) == 4 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 3 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["target_sos_ids", - "image4", "source_sos_ids"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 3 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 3 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["target_sos_ids", "image5", - "image4", "image3", "source_sos_ids"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 5 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 1 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["target_eos_mask", "image5", - "image2", "source_sos_mask", "label"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 5 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["label", "target_eos_mask", "image1", "target_eos_ids", "source_sos_mask", - "image2", "image4", "image3", "source_sos_ids", "image5", "file_name"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 11 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - os.remove("{}".format(mindrecord_file_name)) - os.remove("{}.db".format(mindrecord_file_name)) def test_write_with_multi_bytes_and_MindDataset(): mindrecord_file_name = "test.mindrecord" - data = [{"file_name": "001.jpg", "label": 43, - "image1": bytes("image1 bytes abc", encoding='UTF-8'), - "image2": bytes("image1 bytes def", encoding='UTF-8'), - "image3": bytes("image1 bytes ghi", encoding='UTF-8'), - "image4": bytes("image1 bytes jkl", encoding='UTF-8'), - "image5": bytes("image1 bytes mno", encoding='UTF-8')}, - {"file_name": "002.jpg", "label": 91, - "image1": bytes("image2 bytes abc", encoding='UTF-8'), - "image2": bytes("image2 bytes def", encoding='UTF-8'), - "image3": bytes("image2 bytes ghi", encoding='UTF-8'), - "image4": bytes("image2 bytes jkl", encoding='UTF-8'), - "image5": bytes("image2 bytes mno", 
encoding='UTF-8')}, - {"file_name": "003.jpg", "label": 61, - "image1": bytes("image3 bytes abc", encoding='UTF-8'), - "image2": bytes("image3 bytes def", encoding='UTF-8'), - "image3": bytes("image3 bytes ghi", encoding='UTF-8'), - "image4": bytes("image3 bytes jkl", encoding='UTF-8'), - "image5": bytes("image3 bytes mno", encoding='UTF-8')}, - {"file_name": "004.jpg", "label": 29, - "image1": bytes("image4 bytes abc", encoding='UTF-8'), - "image2": bytes("image4 bytes def", encoding='UTF-8'), - "image3": bytes("image4 bytes ghi", encoding='UTF-8'), - "image4": bytes("image4 bytes jkl", encoding='UTF-8'), - "image5": bytes("image4 bytes mno", encoding='UTF-8')}, - {"file_name": "005.jpg", "label": 78, - "image1": bytes("image5 bytes abc", encoding='UTF-8'), - "image2": bytes("image5 bytes def", encoding='UTF-8'), - "image3": bytes("image5 bytes ghi", encoding='UTF-8'), - "image4": bytes("image5 bytes jkl", encoding='UTF-8'), - "image5": bytes("image5 bytes mno", encoding='UTF-8')}, - {"file_name": "006.jpg", "label": 37, - "image1": bytes("image6 bytes abc", encoding='UTF-8'), - "image2": bytes("image6 bytes def", encoding='UTF-8'), - "image3": bytes("image6 bytes ghi", encoding='UTF-8'), - "image4": bytes("image6 bytes jkl", encoding='UTF-8'), - "image5": bytes("image6 bytes mno", encoding='UTF-8')} - ] - writer = FileWriter(mindrecord_file_name) - schema = {"file_name": {"type": "string"}, - "image1": {"type": "bytes"}, - "image2": {"type": "bytes"}, - "image3": {"type": "bytes"}, - "label": {"type": "int32"}, - "image4": {"type": "bytes"}, - "image5": {"type": "bytes"}} - writer.add_schema(schema, "data is so cool") - writer.write_raw_data(data) - writer.commit() - - # change data value to list - data_value_to_list = [] - for item in data: - new_data = {} - new_data['file_name'] = np.asarray(item["file_name"], dtype='S') - new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32) - new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8) - new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8) - new_data['image3'] = np.asarray(list(item["image3"]), dtype=np.uint8) - new_data['image4'] = np.asarray(list(item["image4"]), dtype=np.uint8) - new_data['image5'] = np.asarray(list(item["image5"]), dtype=np.uint8) - data_value_to_list.append(new_data) - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 7 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["image1", "image2", "image5"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 3 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["image2", "image4"], - num_parallel_workers=num_readers, - shuffle=False) 
- assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 2 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["image5", "image2"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 2 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["image5", "image2", "label"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 3 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["image4", "image5", - "image2", "image3", "file_name"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 5 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - os.remove("{}".format(mindrecord_file_name)) - os.remove("{}.db".format(mindrecord_file_name)) - + try: + data = [{"file_name": "001.jpg", "label": 43, + "image1": bytes("image1 bytes abc", encoding='UTF-8'), + "image2": bytes("image1 bytes def", encoding='UTF-8'), + "image3": bytes("image1 bytes ghi", encoding='UTF-8'), + "image4": bytes("image1 bytes jkl", encoding='UTF-8'), + "image5": bytes("image1 bytes mno", encoding='UTF-8')}, + {"file_name": "002.jpg", "label": 91, + "image1": bytes("image2 bytes abc", encoding='UTF-8'), + "image2": bytes("image2 bytes def", encoding='UTF-8'), + "image3": bytes("image2 bytes ghi", encoding='UTF-8'), + "image4": bytes("image2 bytes jkl", encoding='UTF-8'), + "image5": bytes("image2 bytes mno", encoding='UTF-8')}, + {"file_name": "003.jpg", "label": 61, + "image1": bytes("image3 bytes abc", encoding='UTF-8'), + "image2": bytes("image3 bytes def", encoding='UTF-8'), + "image3": bytes("image3 bytes ghi", encoding='UTF-8'), + "image4": bytes("image3 bytes jkl", encoding='UTF-8'), + "image5": bytes("image3 bytes mno", encoding='UTF-8')}, + {"file_name": "004.jpg", "label": 29, + "image1": bytes("image4 bytes abc", encoding='UTF-8'), + "image2": bytes("image4 bytes def", encoding='UTF-8'), + "image3": bytes("image4 bytes ghi", encoding='UTF-8'), + "image4": bytes("image4 bytes jkl", encoding='UTF-8'), + "image5": bytes("image4 bytes mno", encoding='UTF-8')}, + {"file_name": "005.jpg", "label": 78, + "image1": bytes("image5 bytes abc", encoding='UTF-8'), + 
"image2": bytes("image5 bytes def", encoding='UTF-8'), + "image3": bytes("image5 bytes ghi", encoding='UTF-8'), + "image4": bytes("image5 bytes jkl", encoding='UTF-8'), + "image5": bytes("image5 bytes mno", encoding='UTF-8')}, + {"file_name": "006.jpg", "label": 37, + "image1": bytes("image6 bytes abc", encoding='UTF-8'), + "image2": bytes("image6 bytes def", encoding='UTF-8'), + "image3": bytes("image6 bytes ghi", encoding='UTF-8'), + "image4": bytes("image6 bytes jkl", encoding='UTF-8'), + "image5": bytes("image6 bytes mno", encoding='UTF-8')} + ] + writer = FileWriter(mindrecord_file_name) + schema = {"file_name": {"type": "string"}, + "image1": {"type": "bytes"}, + "image2": {"type": "bytes"}, + "image3": {"type": "bytes"}, + "label": {"type": "int32"}, + "image4": {"type": "bytes"}, + "image5": {"type": "bytes"}} + writer.add_schema(schema, "data is so cool") + writer.write_raw_data(data) + writer.commit() + + # change data value to list + data_value_to_list = [] + for item in data: + new_data = {} + new_data['file_name'] = np.asarray(item["file_name"], dtype='S') + new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32) + new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8) + new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8) + new_data['image3'] = np.asarray(list(item["image3"]), dtype=np.uint8) + new_data['image4'] = np.asarray(list(item["image4"]), dtype=np.uint8) + new_data['image5'] = np.asarray(list(item["image5"]), dtype=np.uint8) + data_value_to_list.append(new_data) + + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 7 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["image1", "image2", "image5"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 3 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["image2", "image4"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 2 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["image5", "image2"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 2 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + 
data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["image5", "image2", "label"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 3 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["image4", "image5", + "image2", "image3", "file_name"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 5 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + except Exception as error: + os.remove("{}".format(mindrecord_file_name)) + os.remove("{}.db".format(mindrecord_file_name)) + raise error + else: + os.remove("{}".format(mindrecord_file_name)) + os.remove("{}.db".format(mindrecord_file_name)) def test_write_with_multi_array_and_MindDataset(): mindrecord_file_name = "test.mindrecord" - data = [{"source_sos_ids": np.array([1, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "source_eos_ids": np.array([13, 14, 15, 16, 17, 18], dtype=np.int64), - "source_eos_mask": np.array([19, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), - "target_sos_ids": np.array([28, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([33, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([39, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([48, 49, 50, 51], dtype=np.int64)}, - {"source_sos_ids": np.array([11, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([16, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "source_eos_ids": np.array([113, 14, 15, 16, 17, 18], dtype=np.int64), - "source_eos_mask": np.array([119, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), - "target_sos_ids": np.array([128, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([133, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([139, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([148, 49, 50, 51], dtype=np.int64)}, - {"source_sos_ids": np.array([21, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "source_eos_ids": np.array([213, 14, 15, 16, 17, 18], dtype=np.int64), - "source_eos_mask": np.array([219, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), - "target_sos_ids": np.array([228, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([233, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([239, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([248, 49, 50, 51], dtype=np.int64)}, - {"source_sos_ids": np.array([31, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "source_eos_ids": 
np.array([313, 14, 15, 16, 17, 18], dtype=np.int64), - "source_eos_mask": np.array([319, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), - "target_sos_ids": np.array([328, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([333, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([339, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([348, 49, 50, 51], dtype=np.int64)}, - {"source_sos_ids": np.array([41, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "source_eos_ids": np.array([413, 14, 15, 16, 17, 18], dtype=np.int64), - "source_eos_mask": np.array([419, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), - "target_sos_ids": np.array([428, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([433, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([439, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([448, 49, 50, 51], dtype=np.int64)}, - {"source_sos_ids": np.array([51, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([56, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "source_eos_ids": np.array([513, 14, 15, 16, 17, 18], dtype=np.int64), - "source_eos_mask": np.array([519, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), - "target_sos_ids": np.array([528, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([533, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([539, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([548, 49, 50, 51], dtype=np.int64)} - ] - writer = FileWriter(mindrecord_file_name) - schema = {"source_sos_ids": {"type": "int64", "shape": [-1]}, - "source_sos_mask": {"type": "int64", "shape": [-1]}, - "source_eos_ids": {"type": "int64", "shape": [-1]}, - "source_eos_mask": {"type": "int64", "shape": [-1]}, - "target_sos_ids": {"type": "int64", "shape": [-1]}, - "target_sos_mask": {"type": "int64", "shape": [-1]}, - "target_eos_ids": {"type": "int64", "shape": [-1]}, - "target_eos_mask": {"type": "int64", "shape": [-1]}} - writer.add_schema(schema, "data is so cool") - writer.write_raw_data(data) - writer.commit() - - # change data value to list - do none - data_value_to_list = [] - for item in data: - new_data = {} - new_data['source_sos_ids'] = item["source_sos_ids"] - new_data['source_sos_mask'] = item["source_sos_mask"] - new_data['source_eos_ids'] = item["source_eos_ids"] - new_data['source_eos_mask'] = item["source_eos_mask"] - new_data['target_sos_ids'] = item["target_sos_ids"] - new_data['target_sos_mask'] = item["target_sos_mask"] - new_data['target_eos_ids'] = item["target_eos_ids"] - new_data['target_eos_mask'] = item["target_eos_mask"] - data_value_to_list.append(new_data) - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 8 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["source_eos_ids", "source_eos_mask", - "target_sos_ids", "target_sos_mask", - "target_eos_ids", "target_eos_mask"], - num_parallel_workers=num_readers, - 
shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 6 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["source_sos_ids", - "target_sos_ids", - "target_eos_mask"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 3 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["target_eos_mask", - "source_eos_mask", - "source_sos_mask"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 3 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["target_eos_ids"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 1 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 1 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["target_eos_mask", "target_eos_ids", - "target_sos_mask", "target_sos_ids", - "source_eos_mask", "source_eos_ids", - "source_sos_mask", "source_sos_ids"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 8 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - os.remove("{}".format(mindrecord_file_name)) - os.remove("{}.db".format(mindrecord_file_name)) - -def test_write_with_float32_float64_float32_array_float64_array_and_MindDataset(): - mindrecord_file_name = "test.mindrecord" - data = [{"float32_array": np.array([1.2, 2.78, 3.1234, 4.9871, 5.12341], dtype=np.float32), - "float64_array": np.array([48.1234556789, 49.3251241431, 50.13514312414, 51.8971298471, - 123414314.2141243, 87.1212122], dtype=np.float64), - "float32": 3456.12345, - "float64": 1987654321.123456785, - "int32_array": np.array([1, 2, 3, 4, 5], dtype=np.int32), - "int64_array": np.array([48, 49, 50, 51, 123414314, 87], dtype=np.int64), - "int32": 3456, - "int64": 947654321123}, - {"float32_array": np.array([1.2, 2.78, 4.1234, 4.9871, 5.12341], dtype=np.float32), - 
"float64_array": np.array([48.1234556789, 49.3251241431, 60.13514312414, 51.8971298471, - 123414314.2141243, 87.1212122], dtype=np.float64), - "float32": 3456.12445, - "float64": 1987654321.123456786, - "int32_array": np.array([11, 21, 31, 41, 51], dtype=np.int32), - "int64_array": np.array([481, 491, 501, 511, 1234143141, 871], dtype=np.int64), - "int32": 3466, - "int64": 957654321123}, - {"float32_array": np.array([1.2, 2.78, 5.1234, 4.9871, 5.12341], dtype=np.float32), - "float64_array": np.array([48.1234556789, 49.3251241431, 70.13514312414, 51.8971298471, - 123414314.2141243, 87.1212122], dtype=np.float64), - "float32": 3456.12545, - "float64": 1987654321.123456787, - "int32_array": np.array([12, 22, 32, 42, 52], dtype=np.int32), - "int64_array": np.array([482, 492, 502, 512, 1234143142, 872], dtype=np.int64), - "int32": 3476, - "int64": 967654321123}, - {"float32_array": np.array([1.2, 2.78, 6.1234, 4.9871, 5.12341], dtype=np.float32), - "float64_array": np.array([48.1234556789, 49.3251241431, 80.13514312414, 51.8971298471, - 123414314.2141243, 87.1212122], dtype=np.float64), - "float32": 3456.12645, - "float64": 1987654321.123456788, - "int32_array": np.array([13, 23, 33, 43, 53], dtype=np.int32), - "int64_array": np.array([483, 493, 503, 513, 1234143143, 873], dtype=np.int64), - "int32": 3486, - "int64": 977654321123}, - {"float32_array": np.array([1.2, 2.78, 7.1234, 4.9871, 5.12341], dtype=np.float32), - "float64_array": np.array([48.1234556789, 49.3251241431, 90.13514312414, 51.8971298471, - 123414314.2141243, 87.1212122], dtype=np.float64), - "float32": 3456.12745, - "float64": 1987654321.123456789, - "int32_array": np.array([14, 24, 34, 44, 54], dtype=np.int32), - "int64_array": np.array([484, 494, 504, 514, 1234143144, 874], dtype=np.int64), - "int32": 3496, - "int64": 987654321123}, - ] - writer = FileWriter(mindrecord_file_name) - schema = {"float32_array": {"type": "float32", "shape": [-1]}, - "float64_array": {"type": "float64", "shape": [-1]}, - "float32": {"type": "float32"}, - "float64": {"type": "float64"}, - "int32_array": {"type": "int32", "shape": [-1]}, - "int64_array": {"type": "int64", "shape": [-1]}, - "int32": {"type": "int32"}, - "int64": {"type": "int64"}} - writer.add_schema(schema, "data is so cool") - writer.write_raw_data(data) - writer.commit() - - # change data value to list - do none - data_value_to_list = [] - for item in data: - new_data = {} - new_data['float32_array'] = item["float32_array"] - new_data['float64_array'] = item["float64_array"] - new_data['float32'] = item["float32"] - new_data['float64'] = item["float64"] - new_data['int32_array'] = item["int32_array"] - new_data['int64_array'] = item["int64_array"] - new_data['int32'] = item["int32"] - new_data['int64'] = item["int64"] - data_value_to_list.append(new_data) - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 5 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 8 - for field in item: - if isinstance(item[field], np.ndarray): - if item[field].dtype == np.float32: + try: + data = [{"source_sos_ids": np.array([1, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "source_eos_ids": np.array([13, 14, 15, 16, 17, 18], dtype=np.int64), + "source_eos_mask": np.array([19, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), + "target_sos_ids": np.array([28, 29, 30, 31, 32], 
dtype=np.int64), + "target_sos_mask": np.array([33, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([39, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([48, 49, 50, 51], dtype=np.int64)}, + {"source_sos_ids": np.array([11, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([16, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "source_eos_ids": np.array([113, 14, 15, 16, 17, 18], dtype=np.int64), + "source_eos_mask": np.array([119, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), + "target_sos_ids": np.array([128, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([133, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([139, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([148, 49, 50, 51], dtype=np.int64)}, + {"source_sos_ids": np.array([21, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "source_eos_ids": np.array([213, 14, 15, 16, 17, 18], dtype=np.int64), + "source_eos_mask": np.array([219, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), + "target_sos_ids": np.array([228, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([233, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([239, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([248, 49, 50, 51], dtype=np.int64)}, + {"source_sos_ids": np.array([31, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "source_eos_ids": np.array([313, 14, 15, 16, 17, 18], dtype=np.int64), + "source_eos_mask": np.array([319, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), + "target_sos_ids": np.array([328, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([333, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([339, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([348, 49, 50, 51], dtype=np.int64)}, + {"source_sos_ids": np.array([41, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "source_eos_ids": np.array([413, 14, 15, 16, 17, 18], dtype=np.int64), + "source_eos_mask": np.array([419, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), + "target_sos_ids": np.array([428, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([433, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([439, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([448, 49, 50, 51], dtype=np.int64)}, + {"source_sos_ids": np.array([51, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([56, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "source_eos_ids": np.array([513, 14, 15, 16, 17, 18], dtype=np.int64), + "source_eos_mask": np.array([519, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), + "target_sos_ids": np.array([528, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([533, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([539, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([548, 49, 50, 51], dtype=np.int64)} + ] + writer = FileWriter(mindrecord_file_name) + schema = {"source_sos_ids": {"type": "int64", "shape": [-1]}, + "source_sos_mask": {"type": "int64", "shape": [-1]}, + "source_eos_ids": {"type": "int64", "shape": [-1]}, + "source_eos_mask": {"type": "int64", "shape": [-1]}, + "target_sos_ids": {"type": "int64", "shape": [-1]}, + "target_sos_mask": 
{"type": "int64", "shape": [-1]}, + "target_eos_ids": {"type": "int64", "shape": [-1]}, + "target_eos_mask": {"type": "int64", "shape": [-1]}} + writer.add_schema(schema, "data is so cool") + writer.write_raw_data(data) + writer.commit() + + # change data value to list - do none + data_value_to_list = [] + for item in data: + new_data = {} + new_data['source_sos_ids'] = item["source_sos_ids"] + new_data['source_sos_mask'] = item["source_sos_mask"] + new_data['source_eos_ids'] = item["source_eos_ids"] + new_data['source_eos_mask'] = item["source_eos_mask"] + new_data['target_sos_ids'] = item["target_sos_ids"] + new_data['target_sos_mask'] = item["target_sos_mask"] + new_data['target_eos_ids'] = item["target_eos_ids"] + new_data['target_eos_mask'] = item["target_eos_mask"] + data_value_to_list.append(new_data) + + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 8 + for field in item: + if isinstance(item[field], np.ndarray): assert (item[field] == - np.array(data_value_to_list[num_iter][field], np.float32)).all() + data_value_to_list[num_iter][field]).all() else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["source_eos_ids", "source_eos_mask", + "target_sos_ids", "target_sos_mask", + "target_eos_ids", "target_eos_mask"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 6 + for field in item: + if isinstance(item[field], np.ndarray): assert (item[field] == data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 5 - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["float32", "int32"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 5 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 2 - for field in item: - if isinstance(item[field], np.ndarray): - if item[field].dtype == np.float32: + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["source_sos_ids", + "target_sos_ids", + "target_eos_mask"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 3 + for field in item: + if isinstance(item[field], np.ndarray): assert (item[field] == - np.array(data_value_to_list[num_iter][field], np.float32)).all() + data_value_to_list[num_iter][field]).all() else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["target_eos_mask", + "source_eos_mask", + "source_sos_mask"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 3 + for field in item: + if 
isinstance(item[field], np.ndarray): assert (item[field] == data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 5 - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["float64", "int64"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 5 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 2 - for field in item: - if isinstance(item[field], np.ndarray): - if item[field].dtype == np.float32: + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["target_eos_ids"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 1 + for field in item: + if isinstance(item[field], np.ndarray): assert (item[field] == - np.array(data_value_to_list[num_iter][field], np.float32)).all() - elif item[field].dtype == np.float64: - assert math.isclose(item[field], - np.array(data_value_to_list[num_iter][field], np.float64), rel_tol=1e-14) + data_value_to_list[num_iter][field]).all() else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 1 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["target_eos_mask", "target_eos_ids", + "target_sos_mask", "target_sos_ids", + "source_eos_mask", "source_eos_ids", + "source_sos_mask", "source_sos_ids"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 8 + for field in item: + if isinstance(item[field], np.ndarray): assert (item[field] == data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 5 + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + except Exception as error: + os.remove("{}".format(mindrecord_file_name)) + os.remove("{}.db".format(mindrecord_file_name)) + raise error + else: + os.remove("{}".format(mindrecord_file_name)) + os.remove("{}.db".format(mindrecord_file_name)) - os.remove("{}".format(mindrecord_file_name)) - os.remove("{}.db".format(mindrecord_file_name)) def test_numpy_generic(): - paths = ["{}{}".format(CV_FILE_NAME, str(x).rjust(1, '0')) for x in range(FILES_NUM)] - for x in paths: - if os.path.exists("{}".format(x)): + try: + for x in paths: + if os.path.exists("{}".format(x)): + os.remove("{}".format(x)) + if os.path.exists("{}.db".format(x)): + os.remove("{}.db".format(x)) + writer = FileWriter(CV_FILE_NAME, FILES_NUM) + cv_schema_json = {"label1": {"type": "int32"}, "label2": {"type": "int64"}, + "label3": {"type": "float32"}, "label4": {"type": "float64"}} + data = [] + for idx in range(10): + row = {} + row['label1'] = np.int32(idx) + row['label2'] = np.int64(idx*10) + row['label3'] = np.float32(idx+0.12345) + row['label4'] = np.float64(idx+0.12345789) + data.append(row) + writer.add_schema(cv_schema_json, "img_schema") + writer.write_raw_data(data) + writer.commit() + + num_readers = 4 + data_set = ds.MindDataset(CV_FILE_NAME + "0", None, num_readers, shuffle=False) + assert 
data_set.get_dataset_size() == 10 + idx = 0 + for item in data_set.create_dict_iterator(): + assert item['label1'] == item['label1'] + assert item['label2'] == item['label2'] + assert item['label3'] == item['label3'] + assert item['label4'] == item['label4'] + idx += 1 + assert idx == 10 + except Exception as error: + for x in paths: + os.remove("{}".format(x)) + os.remove("{}.db".format(x)) + raise error + else: + for x in paths: os.remove("{}".format(x)) - if os.path.exists("{}.db".format(x)): os.remove("{}.db".format(x)) - writer = FileWriter(CV_FILE_NAME, FILES_NUM) - cv_schema_json = {"label1": {"type": "int32"}, "label2": {"type": "int64"}, - "label3": {"type": "float32"}, "label4": {"type": "float64"}} - data = [] - for idx in range(10): - row = {} - row['label1'] = np.int32(idx) - row['label2'] = np.int64(idx*10) - row['label3'] = np.float32(idx+0.12345) - row['label4'] = np.float64(idx+0.12345789) - data.append(row) - writer.add_schema(cv_schema_json, "img_schema") - writer.write_raw_data(data) - writer.commit() - num_readers = 4 - data_set = ds.MindDataset(CV_FILE_NAME + "0", None, num_readers, shuffle=False) - assert data_set.get_dataset_size() == 10 - idx = 0 - for item in data_set.create_dict_iterator(): - assert item['label1'] == item['label1'] - assert item['label2'] == item['label2'] - assert item['label3'] == item['label3'] - assert item['label4'] == item['label4'] - idx += 1 - assert idx == 10 - for x in paths: - os.remove("{}".format(x)) - os.remove("{}.db".format(x)) + +def test_write_with_float32_float64_float32_array_float64_array_and_MindDataset(): + mindrecord_file_name = "test.mindrecord" + try: + data = [{"float32_array": np.array([1.2, 2.78, 3.1234, 4.9871, 5.12341], dtype=np.float32), + "float64_array": np.array([48.1234556789, 49.3251241431, 50.13514312414, 51.8971298471, + 123414314.2141243, 87.1212122], dtype=np.float64), + "float32": 3456.12345, + "float64": 1987654321.123456785, + "int32_array": np.array([1, 2, 3, 4, 5], dtype=np.int32), + "int64_array": np.array([48, 49, 50, 51, 123414314, 87], dtype=np.int64), + "int32": 3456, + "int64": 947654321123}, + {"float32_array": np.array([1.2, 2.78, 4.1234, 4.9871, 5.12341], dtype=np.float32), + "float64_array": np.array([48.1234556789, 49.3251241431, 60.13514312414, 51.8971298471, + 123414314.2141243, 87.1212122], dtype=np.float64), + "float32": 3456.12445, + "float64": 1987654321.123456786, + "int32_array": np.array([11, 21, 31, 41, 51], dtype=np.int32), + "int64_array": np.array([481, 491, 501, 511, 1234143141, 871], dtype=np.int64), + "int32": 3466, + "int64": 957654321123}, + {"float32_array": np.array([1.2, 2.78, 5.1234, 4.9871, 5.12341], dtype=np.float32), + "float64_array": np.array([48.1234556789, 49.3251241431, 70.13514312414, 51.8971298471, + 123414314.2141243, 87.1212122], dtype=np.float64), + "float32": 3456.12545, + "float64": 1987654321.123456787, + "int32_array": np.array([12, 22, 32, 42, 52], dtype=np.int32), + "int64_array": np.array([482, 492, 502, 512, 1234143142, 872], dtype=np.int64), + "int32": 3476, + "int64": 967654321123}, + {"float32_array": np.array([1.2, 2.78, 6.1234, 4.9871, 5.12341], dtype=np.float32), + "float64_array": np.array([48.1234556789, 49.3251241431, 80.13514312414, 51.8971298471, + 123414314.2141243, 87.1212122], dtype=np.float64), + "float32": 3456.12645, + "float64": 1987654321.123456788, + "int32_array": np.array([13, 23, 33, 43, 53], dtype=np.int32), + "int64_array": np.array([483, 493, 503, 513, 1234143143, 873], dtype=np.int64), + "int32": 3486, + "int64": 
977654321123}, + {"float32_array": np.array([1.2, 2.78, 7.1234, 4.9871, 5.12341], dtype=np.float32), + "float64_array": np.array([48.1234556789, 49.3251241431, 90.13514312414, 51.8971298471, + 123414314.2141243, 87.1212122], dtype=np.float64), + "float32": 3456.12745, + "float64": 1987654321.123456789, + "int32_array": np.array([14, 24, 34, 44, 54], dtype=np.int32), + "int64_array": np.array([484, 494, 504, 514, 1234143144, 874], dtype=np.int64), + "int32": 3496, + "int64": 987654321123}, + ] + writer = FileWriter(mindrecord_file_name) + schema = {"float32_array": {"type": "float32", "shape": [-1]}, + "float64_array": {"type": "float64", "shape": [-1]}, + "float32": {"type": "float32"}, + "float64": {"type": "float64"}, + "int32_array": {"type": "int32", "shape": [-1]}, + "int64_array": {"type": "int64", "shape": [-1]}, + "int32": {"type": "int32"}, + "int64": {"type": "int64"}} + writer.add_schema(schema, "data is so cool") + writer.write_raw_data(data) + writer.commit() + + # change data value to list - do none + data_value_to_list = [] + for item in data: + new_data = {} + new_data['float32_array'] = item["float32_array"] + new_data['float64_array'] = item["float64_array"] + new_data['float32'] = item["float32"] + new_data['float64'] = item["float64"] + new_data['int32_array'] = item["int32_array"] + new_data['int64_array'] = item["int64_array"] + new_data['int32'] = item["int32"] + new_data['int64'] = item["int64"] + data_value_to_list.append(new_data) + + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 5 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 8 + for field in item: + if isinstance(item[field], np.ndarray): + if item[field].dtype == np.float32: + assert (item[field] == + np.array(data_value_to_list[num_iter][field], np.float32)).all() + else: + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 5 + + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["float32", "int32"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 5 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 2 + for field in item: + if isinstance(item[field], np.ndarray): + if item[field].dtype == np.float32: + assert (item[field] == + np.array(data_value_to_list[num_iter][field], np.float32)).all() + else: + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 5 + + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["float64", "int64"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 5 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 2 + for field in item: + if isinstance(item[field], np.ndarray): + if item[field].dtype == np.float32: + assert (item[field] == + np.array(data_value_to_list[num_iter][field], np.float32)).all() + elif item[field].dtype == np.float64: + assert math.isclose(item[field], + np.array(data_value_to_list[num_iter][field], np.float64), + rel_tol=1e-14) + else: + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert 
item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 5 + except Exception as error: + os.remove("{}".format(mindrecord_file_name)) + os.remove("{}.db".format(mindrecord_file_name)) + raise error + else: + os.remove("{}".format(mindrecord_file_name)) + os.remove("{}.db".format(mindrecord_file_name)) + +if __name__ == '__main__': + test_nlp_compress_data(add_and_remove_nlp_compress_file) + test_nlp_compress_data_old_version(add_and_remove_nlp_compress_file) + test_cv_minddataset_writer_tutorial() + test_cv_minddataset_partition_tutorial(add_and_remove_cv_file) + test_cv_minddataset_partition_num_samples_0(add_and_remove_cv_file) + test_cv_minddataset_partition_num_samples_1(add_and_remove_cv_file) + test_cv_minddataset_partition_num_samples_2(add_and_remove_cv_file) + test_cv_minddataset_partition_tutorial_check_shuffle_result(add_and_remove_cv_file) + test_cv_minddataset_partition_tutorial_check_whole_reshuffle_result_per_epoch(add_and_remove_cv_file) + test_cv_minddataset_check_shuffle_result(add_and_remove_cv_file) + test_cv_minddataset_dataset_size(add_and_remove_cv_file) + test_cv_minddataset_repeat_reshuffle(add_and_remove_cv_file) + test_cv_minddataset_batch_size_larger_than_records(add_and_remove_cv_file) + test_cv_minddataset_issue_888(add_and_remove_cv_file) + test_cv_minddataset_blockreader_tutorial(add_and_remove_cv_file) + test_cv_minddataset_blockreader_some_field_not_in_index_tutorial(add_and_remove_cv_file) + test_cv_minddataset_reader_file_list(add_and_remove_cv_file) + test_cv_minddataset_reader_one_partition(add_and_remove_cv_file) + test_cv_minddataset_reader_two_dataset(add_and_remove_cv_file) + test_cv_minddataset_reader_two_dataset_partition(add_and_remove_cv_file) + test_cv_minddataset_reader_basic_tutorial(add_and_remove_cv_file) + test_nlp_minddataset_reader_basic_tutorial(add_and_remove_cv_file) + test_cv_minddataset_reader_basic_tutorial_5_epoch(add_and_remove_cv_file) + test_cv_minddataset_reader_basic_tutorial_5_epoch_with_batch(add_and_remove_cv_file) + test_cv_minddataset_reader_no_columns(add_and_remove_cv_file) + test_cv_minddataset_reader_repeat_tutorial(add_and_remove_cv_file) + test_write_with_multi_bytes_and_array_and_read_by_MindDataset() + test_write_with_multi_bytes_and_MindDataset() + test_write_with_multi_array_and_MindDataset() + test_numpy_generic() + test_write_with_float32_float64_float32_array_float64_array_and_MindDataset() diff --git a/tests/ut/python/dataset/test_minddataset_exception.py b/tests/ut/python/dataset/test_minddataset_exception.py index 619dff1962..51621750c8 100644 --- a/tests/ut/python/dataset/test_minddataset_exception.py +++ b/tests/ut/python/dataset/test_minddataset_exception.py @@ -99,8 +99,13 @@ def test_invalid_mindrecord(): num_iter = 0 for _ in data_set.create_dict_iterator(): num_iter += 1 - assert num_iter == 0 - os.remove('dummy.mindrecord') + try: + assert num_iter == 0 + except Exception as error: + os.remove('dummy.mindrecord') + raise error + else: + os.remove('dummy.mindrecord') def test_minddataset_lack_db(): @@ -113,8 +118,13 @@ def test_minddataset_lack_db(): num_iter = 0 for _ in data_set.create_dict_iterator(): num_iter += 1 - assert num_iter == 0 - os.remove(CV_FILE_NAME) + try: + assert num_iter == 0 + except Exception as error: + os.remove(CV_FILE_NAME) + raise error + else: + os.remove(CV_FILE_NAME) def test_cv_minddataset_pk_sample_error_class_column(): @@ -189,10 +199,16 @@ def test_minddataset_invalidate_num_shards(): num_iter = 0 for _ in 
data_set.create_dict_iterator(): num_iter += 1 - assert 'Input shard_id is not within the required interval of (0 to 0).' in str(error_info.value) + try: + assert 'Input shard_id is not within the required interval of (0 to 0).' in str(error_info.value) + except Exception as error: + os.remove(CV_FILE_NAME) + os.remove("{}.db".format(CV_FILE_NAME)) + raise error + else: + os.remove(CV_FILE_NAME) + os.remove("{}.db".format(CV_FILE_NAME)) - os.remove(CV_FILE_NAME) - os.remove("{}.db".format(CV_FILE_NAME)) def test_minddataset_invalidate_shard_id(): create_cv_mindrecord(1) @@ -203,9 +219,15 @@ def test_minddataset_invalidate_shard_id(): num_iter = 0 for _ in data_set.create_dict_iterator(): num_iter += 1 - assert 'Input shard_id is not within the required interval of (0 to 0).' in str(error_info.value) - os.remove(CV_FILE_NAME) - os.remove("{}.db".format(CV_FILE_NAME)) + try: + assert 'Input shard_id is not within the required interval of (0 to 0).' in str(error_info.value) + except Exception as error: + os.remove(CV_FILE_NAME) + os.remove("{}.db".format(CV_FILE_NAME)) + raise error + else: + os.remove(CV_FILE_NAME) + os.remove("{}.db".format(CV_FILE_NAME)) def test_minddataset_shard_id_bigger_than_num_shard(): @@ -217,17 +239,28 @@ def test_minddataset_shard_id_bigger_than_num_shard(): num_iter = 0 for _ in data_set.create_dict_iterator(): num_iter += 1 - assert 'Input shard_id is not within the required interval of (0 to 1).' in str(error_info.value) + try: + assert 'Input shard_id is not within the required interval of (0 to 1).' in str(error_info.value) + except Exception as error: + os.remove(CV_FILE_NAME) + os.remove("{}.db".format(CV_FILE_NAME)) + raise error with pytest.raises(Exception) as error_info: data_set = ds.MindDataset(CV_FILE_NAME, columns_list, num_readers, True, 2, 5) num_iter = 0 for _ in data_set.create_dict_iterator(): num_iter += 1 - assert 'Input shard_id is not within the required interval of (0 to 1).' in str(error_info.value) + try: + assert 'Input shard_id is not within the required interval of (0 to 1).' 
in str(error_info.value) + except Exception as error: + os.remove(CV_FILE_NAME) + os.remove("{}.db".format(CV_FILE_NAME)) + raise error + else: + os.remove(CV_FILE_NAME) + os.remove("{}.db".format(CV_FILE_NAME)) - os.remove(CV_FILE_NAME) - os.remove("{}.db".format(CV_FILE_NAME)) def test_cv_minddataset_partition_num_samples_equals_0(): """tutorial for cv minddataset.""" @@ -245,7 +278,26 @@ def test_cv_minddataset_partition_num_samples_equals_0(): num_iter += 1 with pytest.raises(Exception) as error_info: partitions(5) - assert 'num_samples should be a positive integer value, but got num_samples=0' in str(error_info.value) + try: + assert 'num_samples should be a positive integer value, but got num_samples=0' in str(error_info.value) + except Exception as error: + os.remove(CV_FILE_NAME) + os.remove("{}.db".format(CV_FILE_NAME)) + raise error + else: + os.remove(CV_FILE_NAME) + os.remove("{}.db".format(CV_FILE_NAME)) - os.remove(CV_FILE_NAME) - os.remove("{}.db".format(CV_FILE_NAME)) +if __name__ == '__main__': + test_cv_lack_json() + test_cv_lack_mindrecord() + test_invalid_mindrecord() + test_minddataset_lack_db() + test_cv_minddataset_pk_sample_error_class_column() + test_cv_minddataset_pk_sample_exclusive_shuffle() + test_cv_minddataset_reader_different_schema() + test_cv_minddataset_reader_different_page_size() + test_minddataset_invalidate_num_shards() + test_minddataset_invalidate_shard_id() + test_minddataset_shard_id_bigger_than_num_shard() + test_cv_minddataset_partition_num_samples_equals_0() diff --git a/tests/ut/python/dataset/test_minddataset_multi_images_and_ndarray.py b/tests/ut/python/dataset/test_minddataset_multi_images_and_ndarray.py index c9c9388e65..5ef3a7adcb 100644 --- a/tests/ut/python/dataset/test_minddataset_multi_images_and_ndarray.py +++ b/tests/ut/python/dataset/test_minddataset_multi_images_and_ndarray.py @@ -27,54 +27,64 @@ CV_FILE_NAME = "./complex.mindrecord" def test_cv_minddataset_reader_multi_image_and_ndarray_tutorial(): - writer = FileWriter(CV_FILE_NAME, FILES_NUM) - cv_schema_json = {"id": {"type": "int32"}, - "image_0": {"type": "bytes"}, - "image_2": {"type": "bytes"}, - "image_3": {"type": "bytes"}, - "image_4": {"type": "bytes"}, - "input_mask": {"type": "int32", "shape": [-1]}, - "segments": {"type": "float32", "shape": [2, 3]}} - writer.add_schema(cv_schema_json, "two_images_schema") - with open("../data/mindrecord/testImageNetData/images/image_00010.jpg", "rb") as file_reader: - img_data = file_reader.read() - ndarray_1 = np.array([1, 2, 3, 4, 5], np.int32) - ndarray_2 = np.array(([2, 3, 1], [7, 9, 0]), np.float32) - data = [] - for i in range(5): - item = {"id": i, "image_0": img_data, "image_2": img_data, "image_3": img_data, "image_4": img_data, - "input_mask": ndarray_1, "segments": ndarray_2} - data.append(item) - writer.write_raw_data(data) - writer.commit() - assert os.path.exists(CV_FILE_NAME) - assert os.path.exists(CV_FILE_NAME + ".db") + try: + writer = FileWriter(CV_FILE_NAME, FILES_NUM) + cv_schema_json = {"id": {"type": "int32"}, + "image_0": {"type": "bytes"}, + "image_2": {"type": "bytes"}, + "image_3": {"type": "bytes"}, + "image_4": {"type": "bytes"}, + "input_mask": {"type": "int32", "shape": [-1]}, + "segments": {"type": "float32", "shape": [2, 3]}} + writer.add_schema(cv_schema_json, "two_images_schema") + with open("../data/mindrecord/testImageNetData/images/image_00010.jpg", "rb") as file_reader: + img_data = file_reader.read() + ndarray_1 = np.array([1, 2, 3, 4, 5], np.int32) + ndarray_2 = np.array(([2, 3, 1], [7, 9, 
0]), np.float32) + data = [] + for i in range(5): + item = {"id": i, "image_0": img_data, "image_2": img_data, "image_3": img_data, "image_4": img_data, + "input_mask": ndarray_1, "segments": ndarray_2} + data.append(item) + writer.write_raw_data(data) + writer.commit() + assert os.path.exists(CV_FILE_NAME) + assert os.path.exists(CV_FILE_NAME + ".db") - # tutorial for minderdataset. - columns_list = ["id", "image_0", "image_2", "image_3", "image_4", "input_mask", "segments"] - num_readers = 1 - data_set = ds.MindDataset(CV_FILE_NAME, columns_list, num_readers) - assert data_set.get_dataset_size() == 5 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 7 - logger.info("item: {}".format(item)) - assert item["image_0"].dtype == np.uint8 - assert (item["image_0"] == item["image_2"]).all() - assert (item["image_3"] == item["image_4"]).all() - assert (item["image_0"] == item["image_4"]).all() - assert item["image_2"].dtype == np.uint8 - assert item["image_3"].dtype == np.uint8 - assert item["image_4"].dtype == np.uint8 - assert item["id"].dtype == np.int32 - assert item["input_mask"].shape == (5,) - assert item["input_mask"].dtype == np.int32 - assert item["segments"].shape == (2, 3) - assert item["segments"].dtype == np.float32 - num_iter += 1 - assert num_iter == 5 + # tutorial for minderdataset. + columns_list = ["id", "image_0", "image_2", "image_3", "image_4", "input_mask", "segments"] + num_readers = 1 + data_set = ds.MindDataset(CV_FILE_NAME, columns_list, num_readers) + assert data_set.get_dataset_size() == 5 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 7 + logger.info("item: {}".format(item)) + assert item["image_0"].dtype == np.uint8 + assert (item["image_0"] == item["image_2"]).all() + assert (item["image_3"] == item["image_4"]).all() + assert (item["image_0"] == item["image_4"]).all() + assert item["image_2"].dtype == np.uint8 + assert item["image_3"].dtype == np.uint8 + assert item["image_4"].dtype == np.uint8 + assert item["id"].dtype == np.int32 + assert item["input_mask"].shape == (5,) + assert item["input_mask"].dtype == np.int32 + assert item["segments"].shape == (2, 3) + assert item["segments"].dtype == np.float32 + num_iter += 1 + assert num_iter == 5 + except Exception as error: + if os.path.exists("{}".format(CV_FILE_NAME + ".db")): + os.remove(CV_FILE_NAME + ".db") + if os.path.exists("{}".format(CV_FILE_NAME)): + os.remove(CV_FILE_NAME) + raise error + else: + if os.path.exists("{}".format(CV_FILE_NAME + ".db")): + os.remove(CV_FILE_NAME + ".db") + if os.path.exists("{}".format(CV_FILE_NAME)): + os.remove(CV_FILE_NAME) - if os.path.exists("{}".format(CV_FILE_NAME + ".db")): - os.remove(CV_FILE_NAME + ".db") - if os.path.exists("{}".format(CV_FILE_NAME)): - os.remove(CV_FILE_NAME) +if __name__ == '__main__': + test_cv_minddataset_reader_multi_image_and_ndarray_tutorial() diff --git a/tests/ut/python/dataset/test_minddataset_padded.py b/tests/ut/python/dataset/test_minddataset_padded.py index c0724e3236..a05879ab01 100644 --- a/tests/ut/python/dataset/test_minddataset_padded.py +++ b/tests/ut/python/dataset/test_minddataset_padded.py @@ -44,24 +44,31 @@ def add_and_remove_cv_file(): """add/remove cv file""" paths = ["{}{}".format(CV_FILE_NAME, str(x).rjust(1, '0')) for x in range(FILES_NUM)] - for x in paths: - os.remove("{}".format(x)) if os.path.exists("{}".format(x)) else None - os.remove("{}.db".format(x)) if os.path.exists( - "{}.db".format(x)) else None - writer = FileWriter(CV_FILE_NAME, 
FILES_NUM) - data = get_data(CV_DIR_NAME) - cv_schema_json = {"id": {"type": "int32"}, - "file_name": {"type": "string"}, - "label": {"type": "int32"}, - "data": {"type": "bytes"}} - writer.add_schema(cv_schema_json, "img_schema") - writer.add_index(["file_name", "label"]) - writer.write_raw_data(data) - writer.commit() - yield "yield_cv_data" - for x in paths: - os.remove("{}".format(x)) - os.remove("{}.db".format(x)) + try: + for x in paths: + os.remove("{}".format(x)) if os.path.exists("{}".format(x)) else None + os.remove("{}.db".format(x)) if os.path.exists( + "{}.db".format(x)) else None + writer = FileWriter(CV_FILE_NAME, FILES_NUM) + data = get_data(CV_DIR_NAME) + cv_schema_json = {"id": {"type": "int32"}, + "file_name": {"type": "string"}, + "label": {"type": "int32"}, + "data": {"type": "bytes"}} + writer.add_schema(cv_schema_json, "img_schema") + writer.add_index(["file_name", "label"]) + writer.write_raw_data(data) + writer.commit() + yield "yield_cv_data" + except Exception as error: + for x in paths: + os.remove("{}".format(x)) + os.remove("{}.db".format(x)) + raise error + else: + for x in paths: + os.remove("{}".format(x)) + os.remove("{}.db".format(x)) @pytest.fixture @@ -69,32 +76,39 @@ def add_and_remove_nlp_file(): """add/remove nlp file""" paths = ["{}{}".format(NLP_FILE_NAME, str(x).rjust(1, '0')) for x in range(FILES_NUM)] - for x in paths: - if os.path.exists("{}".format(x)): + try: + for x in paths: + if os.path.exists("{}".format(x)): + os.remove("{}".format(x)) + if os.path.exists("{}.db".format(x)): + os.remove("{}.db".format(x)) + writer = FileWriter(NLP_FILE_NAME, FILES_NUM) + data = [x for x in get_nlp_data(NLP_FILE_POS, NLP_FILE_VOCAB, 10)] + nlp_schema_json = {"id": {"type": "string"}, "label": {"type": "int32"}, + "rating": {"type": "float32"}, + "input_ids": {"type": "int64", + "shape": [-1]}, + "input_mask": {"type": "int64", + "shape": [1, -1]}, + "segment_ids": {"type": "int64", + "shape": [2, -1]} + } + writer.set_header_size(1 << 14) + writer.set_page_size(1 << 15) + writer.add_schema(nlp_schema_json, "nlp_schema") + writer.add_index(["id", "rating"]) + writer.write_raw_data(data) + writer.commit() + yield "yield_nlp_data" + except Exception as error: + for x in paths: + os.remove("{}".format(x)) + os.remove("{}.db".format(x)) + raise error + else: + for x in paths: os.remove("{}".format(x)) - if os.path.exists("{}.db".format(x)): os.remove("{}.db".format(x)) - writer = FileWriter(NLP_FILE_NAME, FILES_NUM) - data = [x for x in get_nlp_data(NLP_FILE_POS, NLP_FILE_VOCAB, 10)] - nlp_schema_json = {"id": {"type": "string"}, "label": {"type": "int32"}, - "rating": {"type": "float32"}, - "input_ids": {"type": "int64", - "shape": [-1]}, - "input_mask": {"type": "int64", - "shape": [1, -1]}, - "segment_ids": {"type": "int64", - "shape": [2, -1]} - } - writer.set_header_size(1 << 14) - writer.set_page_size(1 << 15) - writer.add_schema(nlp_schema_json, "nlp_schema") - writer.add_index(["id", "rating"]) - writer.write_raw_data(data) - writer.commit() - yield "yield_nlp_data" - for x in paths: - os.remove("{}".format(x)) - os.remove("{}.db".format(x)) def test_cv_minddataset_reader_basic_padded_samples(add_and_remove_cv_file): """tutorial for cv minderdataset.""" @@ -119,7 +133,7 @@ def test_cv_minddataset_reader_basic_padded_samples(add_and_remove_cv_file): encoding='utf8') assert item['label'] == padded_sample['label'] assert (item['data'] == np.array(list(padded_sample['data']))).all() - num_iter += 1 + num_iter += 1 assert num_padded_iter == 5 assert 
num_iter == 15 @@ -636,3 +650,17 @@ def inputs(vectors, maxlen=50): mask = [1] * length + [0] * (maxlen - length) segment = [0] * maxlen return input_, mask, segment + +if __name__ == '__main__': + test_cv_minddataset_reader_basic_padded_samples(add_and_remove_cv_file) + test_cv_minddataset_partition_padded_samples(add_and_remove_cv_file) + test_cv_minddataset_partition_padded_samples_multi_epoch(add_and_remove_cv_file) + test_cv_minddataset_partition_padded_samples_no_dividsible(add_and_remove_cv_file) + test_cv_minddataset_partition_padded_samples_dataset_size_no_divisible(add_and_remove_cv_file) + test_cv_minddataset_partition_padded_samples_no_equal_column_list(add_and_remove_cv_file) + test_cv_minddataset_partition_padded_samples_no_column_list(add_and_remove_cv_file) + test_cv_minddataset_partition_padded_samples_no_num_padded(add_and_remove_cv_file) + test_cv_minddataset_partition_padded_samples_no_padded_samples(add_and_remove_cv_file) + test_nlp_minddataset_reader_basic_padded_samples(add_and_remove_nlp_file) + test_nlp_minddataset_reader_basic_padded_samples_multi_epoch(add_and_remove_nlp_file) + test_nlp_minddataset_reader_basic_padded_samples_check_whole_reshuffle_result_per_epoch(add_and_remove_nlp_file) diff --git a/tests/ut/python/dataset/test_minddataset_sampler.py b/tests/ut/python/dataset/test_minddataset_sampler.py index 8d099f1af2..9c110c0e1f 100644 --- a/tests/ut/python/dataset/test_minddataset_sampler.py +++ b/tests/ut/python/dataset/test_minddataset_sampler.py @@ -34,26 +34,32 @@ def add_and_remove_cv_file(): """add/remove cv file""" paths = ["{}{}".format(CV_FILE_NAME, str(x).rjust(1, '0')) for x in range(FILES_NUM)] - for x in paths: - if os.path.exists("{}".format(x)): + try: + for x in paths: + if os.path.exists("{}".format(x)): + os.remove("{}".format(x)) + if os.path.exists("{}.db".format(x)): + os.remove("{}.db".format(x)) + writer = FileWriter(CV_FILE_NAME, FILES_NUM) + data = get_data(CV_DIR_NAME, True) + cv_schema_json = {"id": {"type": "int32"}, + "file_name": {"type": "string"}, + "label": {"type": "int32"}, + "data": {"type": "bytes"}} + writer.add_schema(cv_schema_json, "img_schema") + writer.add_index(["file_name", "label"]) + writer.write_raw_data(data) + writer.commit() + yield "yield_cv_data" + except Exception as error: + for x in paths: + os.remove("{}".format(x)) + os.remove("{}.db".format(x)) + raise error + else: + for x in paths: os.remove("{}".format(x)) - if os.path.exists("{}.db".format(x)): os.remove("{}.db".format(x)) - writer = FileWriter(CV_FILE_NAME, FILES_NUM) - data = get_data(CV_DIR_NAME, True) - cv_schema_json = {"id": {"type": "int32"}, - "file_name": {"type": "string"}, - "label": {"type": "int32"}, - "data": {"type": "bytes"}} - writer.add_schema(cv_schema_json, "img_schema") - writer.add_index(["file_name", "label"]) - writer.write_raw_data(data) - writer.commit() - yield "yield_cv_data" - for x in paths: - os.remove("{}".format(x)) - os.remove("{}.db".format(x)) - def test_cv_minddataset_pk_sample_no_column(add_and_remove_cv_file): """tutorial for cv minderdataset.""" @@ -626,3 +632,24 @@ def get_data(dir_name, sampler=False): except FileNotFoundError: continue return data_list + +if __name__ == '__main__': + test_cv_minddataset_pk_sample_no_column(add_and_remove_cv_file) + test_cv_minddataset_pk_sample_basic(add_and_remove_cv_file) + test_cv_minddataset_pk_sample_shuffle(add_and_remove_cv_file) + test_cv_minddataset_pk_sample_out_of_range(add_and_remove_cv_file) + 
test_cv_minddataset_subset_random_sample_basic(add_and_remove_cv_file)
+    test_cv_minddataset_subset_random_sample_replica(add_and_remove_cv_file)
+    test_cv_minddataset_subset_random_sample_empty(add_and_remove_cv_file)
+    test_cv_minddataset_subset_random_sample_out_of_range(add_and_remove_cv_file)
+    test_cv_minddataset_subset_random_sample_negative(add_and_remove_cv_file)
+    test_cv_minddataset_random_sampler_basic(add_and_remove_cv_file)
+    test_cv_minddataset_random_sampler_repeat(add_and_remove_cv_file)
+    test_cv_minddataset_random_sampler_replacement(add_and_remove_cv_file)
+    test_cv_minddataset_sequential_sampler_basic(add_and_remove_cv_file)
+    test_cv_minddataset_sequential_sampler_exceed_size(add_and_remove_cv_file)
+    test_cv_minddataset_split_basic(add_and_remove_cv_file)
+    test_cv_minddataset_split_exact_percent(add_and_remove_cv_file)
+    test_cv_minddataset_split_fuzzy_percent(add_and_remove_cv_file)
+    test_cv_minddataset_split_deterministic(add_and_remove_cv_file)
+    test_cv_minddataset_split_sharding(add_and_remove_cv_file)
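
Note: every fixture and test touched above applies the same cleanup shape. As an illustrative sketch only (the fixture name and file path below are hypothetical, not part of this patch): in a pytest generator fixture, an exception raised during setup (before the yield) is caught by the except branch, which removes whatever was written and re-raises; after the test runs, pytest resumes the generator at the yield, the try block ends, and the else branch removes the files on the normal path as well.

import os

import pytest


@pytest.fixture
def add_and_remove_example_file():
    """Hypothetical fixture showing the try/except/else cleanup pattern."""
    path = "example.mindrecord"  # hypothetical file name, not used by the real tests
    try:
        with open(path, "w") as f:  # stand-in for the FileWriter setup steps
            f.write("data")
        yield path  # pytest resumes here during teardown, after the test body has run
    except Exception as error:
        # setup failed before the yield: remove any partially written file, then re-raise
        if os.path.exists(path):
            os.remove(path)
        raise error
    else:
        # normal teardown: the try block completed, so remove the file here too
        os.remove(path)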