|
|
|
@ -16,17 +16,15 @@
|
|
|
|
|
This is the test module for mindrecord
|
|
|
|
|
"""
|
|
|
|
|
import collections
|
|
|
|
|
import json
|
|
|
|
|
import numpy as np
|
|
|
|
|
import os
|
|
|
|
|
import pytest
|
|
|
|
|
import re
|
|
|
|
|
import string
|
|
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
import pytest
|
|
|
|
|
|
|
|
|
|
import mindspore.dataset as ds
|
|
|
|
|
import mindspore.dataset.transforms.vision.c_transforms as vision
|
|
|
|
|
from mindspore import log as logger
|
|
|
|
|
from mindspore.dataset.transforms.vision import Inter
|
|
|
|
|
from mindspore.mindrecord import FileWriter
|
|
|
|
|
|
|
|
|
|
FILES_NUM = 4
|
|
|
|
@ -110,6 +108,7 @@ def add_and_remove_nlp_file():
|
|
|
|
|
os.remove("{}".format(x))
|
|
|
|
|
os.remove("{}.db".format(x))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_cv_minddataset_reader_basic_padded_samples(add_and_remove_cv_file):
|
|
|
|
|
"""tutorial for cv minddataset."""
|
|
|
|
|
columns_list = ["label", "file_name", "data"]
|
|
|
|
@ -177,6 +176,7 @@ def test_cv_minddataset_partition_padded_samples(add_and_remove_cv_file):
|
|
|
|
|
partitions(5, 5, 3)
|
|
|
|
|
partitions(9, 8, 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_cv_minddataset_partition_padded_samples_multi_epoch(add_and_remove_cv_file):
|
|
|
|
|
"""tutorial for cv minddataset."""
|
|
|
|
|
columns_list = ["data", "file_name", "label"]
|
|
|
|
@ -248,6 +248,7 @@ def test_cv_minddataset_partition_padded_samples_multi_epoch(add_and_remove_cv_f
|
|
|
|
|
partitions(5, 5, 3)
|
|
|
|
|
partitions(9, 8, 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_cv_minddataset_partition_padded_samples_no_dividsible(add_and_remove_cv_file):
|
|
|
|
|
"""tutorial for cv minddataset."""
|
|
|
|
|
columns_list = ["data", "file_name", "label"]
|
|
|
|
@ -273,6 +274,7 @@ def test_cv_minddataset_partition_padded_samples_no_dividsible(add_and_remove_cv
|
|
|
|
|
with pytest.raises(RuntimeError):
|
|
|
|
|
partitions(4, 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_cv_minddataset_partition_padded_samples_dataset_size_no_divisible(add_and_remove_cv_file):
|
|
|
|
|
columns_list = ["data", "file_name", "label"]
|
|
|
|
|
|
|
|
|
@ -291,8 +293,10 @@ def test_cv_minddataset_partition_padded_samples_dataset_size_no_divisible(add_a
|
|
|
|
|
num_padded=num_padded)
|
|
|
|
|
with pytest.raises(RuntimeError):
|
|
|
|
|
data_set.get_dataset_size() == 3
|
|
|
|
|
|
|
|
|
|
partitions(4, 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_cv_minddataset_partition_padded_samples_no_equal_column_list(add_and_remove_cv_file):
|
|
|
|
|
columns_list = ["data", "file_name", "label"]
|
|
|
|
|
|
|
|
|
@ -314,9 +318,11 @@ def test_cv_minddataset_partition_padded_samples_no_equal_column_list(add_and_re
|
|
|
|
|
logger.info("-------------- len(item[data]): {} ------------------------".format(len(item["data"])))
|
|
|
|
|
logger.info("-------------- item[data]: {} -----------------------------".format(item["data"]))
|
|
|
|
|
logger.info("-------------- item[file_name]: {} ------------------------".format(item["file_name"]))
|
|
|
|
|
|
|
|
|
|
with pytest.raises(Exception, match="padded_sample cannot match columns_list."):
|
|
|
|
|
partitions(4, 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_cv_minddataset_partition_padded_samples_no_column_list(add_and_remove_cv_file):
|
|
|
|
|
data = get_data(CV_DIR_NAME)
|
|
|
|
|
padded_sample = data[0]
|
|
|
|
@ -336,9 +342,11 @@ def test_cv_minddataset_partition_padded_samples_no_column_list(add_and_remove_c
|
|
|
|
|
logger.info("-------------- len(item[data]): {} ------------------------".format(len(item["data"])))
|
|
|
|
|
logger.info("-------------- item[data]: {} -----------------------------".format(item["data"]))
|
|
|
|
|
logger.info("-------------- item[file_name]: {} ------------------------".format(item["file_name"]))
|
|
|
|
|
|
|
|
|
|
with pytest.raises(Exception, match="padded_sample is specified and requires columns_list as well."):
|
|
|
|
|
partitions(4, 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_cv_minddataset_partition_padded_samples_no_num_padded(add_and_remove_cv_file):
|
|
|
|
|
columns_list = ["data", "file_name", "label"]
|
|
|
|
|
data = get_data(CV_DIR_NAME)
|
|
|
|
@ -357,9 +365,11 @@ def test_cv_minddataset_partition_padded_samples_no_num_padded(add_and_remove_cv
|
|
|
|
|
logger.info("-------------- len(item[data]): {} ------------------------".format(len(item["data"])))
|
|
|
|
|
logger.info("-------------- item[data]: {} -----------------------------".format(item["data"]))
|
|
|
|
|
logger.info("-------------- item[file_name]: {} ------------------------".format(item["file_name"]))
|
|
|
|
|
|
|
|
|
|
with pytest.raises(Exception, match="padded_sample is specified and requires num_padded as well."):
|
|
|
|
|
partitions(4, 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_cv_minddataset_partition_padded_samples_no_padded_samples(add_and_remove_cv_file):
|
|
|
|
|
columns_list = ["data", "file_name", "label"]
|
|
|
|
|
data = get_data(CV_DIR_NAME)
|
|
|
|
@ -378,11 +388,11 @@ def test_cv_minddataset_partition_padded_samples_no_padded_samples(add_and_remov
|
|
|
|
|
logger.info("-------------- len(item[data]): {} ------------------------".format(len(item["data"])))
|
|
|
|
|
logger.info("-------------- item[data]: {} -----------------------------".format(item["data"]))
|
|
|
|
|
logger.info("-------------- item[file_name]: {} ------------------------".format(item["file_name"]))
|
|
|
|
|
|
|
|
|
|
with pytest.raises(Exception, match="num_padded is specified but padded_sample is not."):
|
|
|
|
|
partitions(4, 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_nlp_minddataset_reader_basic_padded_samples(add_and_remove_nlp_file):
|
|
|
|
|
columns_list = ["input_ids", "id", "rating"]
|
|
|
|
|
|
|
|
|
@ -406,7 +416,9 @@ def test_nlp_minddataset_reader_basic_padded_samples(add_and_remove_nlp_file):
|
|
|
|
|
for item in data_set.create_dict_iterator(num_epochs=1):
|
|
|
|
|
logger.info("-------------- item[id]: {} ------------------------".format(item["id"]))
|
|
|
|
|
logger.info("-------------- item[rating]: {} --------------------".format(item["rating"]))
|
|
|
|
|
logger.info("-------------- item[input_ids]: {}, shape: {} -----------------".format(item["input_ids"], item["input_ids"].shape))
|
|
|
|
|
logger.info("-------------- item[input_ids]: {}, shape: {} -----------------".format(
|
|
|
|
|
item["input_ids"],
|
|
|
|
|
item["input_ids"].shape))
|
|
|
|
|
if item['id'] == bytes('-1', encoding='utf-8'):
|
|
|
|
|
num_padded_iter += 1
|
|
|
|
|
assert item['id'] == bytes(padded_sample['id'], encoding='utf-8')
|
|
|
|
@ -420,6 +432,7 @@ def test_nlp_minddataset_reader_basic_padded_samples(add_and_remove_nlp_file):
|
|
|
|
|
partitions(5, 5, 3)
|
|
|
|
|
partitions(9, 8, 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_nlp_minddataset_reader_basic_padded_samples_multi_epoch(add_and_remove_nlp_file):
|
|
|
|
|
columns_list = ["input_ids", "id", "rating"]
|
|
|
|
|
|
|
|
|
@ -451,7 +464,9 @@ def test_nlp_minddataset_reader_basic_padded_samples_multi_epoch(add_and_remove_
|
|
|
|
|
for item in data_set.create_dict_iterator(num_epochs=1):
|
|
|
|
|
logger.info("-------------- item[id]: {} ------------------------".format(item["id"]))
|
|
|
|
|
logger.info("-------------- item[rating]: {} --------------------".format(item["rating"]))
|
|
|
|
|
logger.info("-------------- item[input_ids]: {}, shape: {} -----------------".format(item["input_ids"], item["input_ids"].shape))
|
|
|
|
|
logger.info("-------------- item[input_ids]: {}, shape: {} -----------------".format(
|
|
|
|
|
item["input_ids"],
|
|
|
|
|
item["input_ids"].shape))
|
|
|
|
|
if item['id'] == bytes('-1', encoding='utf-8'):
|
|
|
|
|
num_padded_iter += 1
|
|
|
|
|
assert item['id'] == bytes(padded_sample['id'], encoding='utf-8')
|
|
|
|
@ -519,7 +534,8 @@ def test_nlp_minddataset_reader_basic_padded_samples_check_whole_reshuffle_resul
|
|
|
|
|
assert (item['input_ids'] == padded_sample['input_ids']).all()
|
|
|
|
|
assert (item['rating'] == padded_sample['rating']).all()
|
|
|
|
|
# save epoch result
|
|
|
|
|
epoch_result[partition_id][int(inner_num_iter / dataset_size)][inner_num_iter % dataset_size] = item["id"]
|
|
|
|
|
epoch_result[partition_id][int(inner_num_iter / dataset_size)][inner_num_iter % dataset_size] = item[
|
|
|
|
|
"id"]
|
|
|
|
|
num_iter += 1
|
|
|
|
|
inner_num_iter += 1
|
|
|
|
|
assert epoch_result[partition_id][0] not in (epoch_result[partition_id][1], epoch_result[partition_id][2])
|
|
|
|
@ -651,6 +667,7 @@ def inputs(vectors, maxlen=50):
|
|
|
|
|
segment = [0] * maxlen
|
|
|
|
|
return input_, mask, segment
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
|
test_cv_minddataset_reader_basic_padded_samples(add_and_remove_cv_file)
|
|
|
|
|
test_cv_minddataset_partition_padded_samples(add_and_remove_cv_file)
|
|
|
|
|