From 26cb3e8a5fab42f8c18b91d54ebb83173b08933f Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 28 Apr 2020 21:40:50 -0400 Subject: [PATCH] Added test function to show that seed doesn't work. Added testcase to show that c image aug don't use seed properly Added passing test cases Added working testcases for using seed Added additional test cases to show seed use Added test case for seed --- mindspore/dataset/core/configuration.py | 9 +- tests/ut/python/dataset/test_config.py | 228 +++++++++++++++++- .../dataset/test_datasets_textfileop.py | 1 + .../dataset/test_random_color_adjust.py | 42 ++-- tests/ut/python/dataset/test_random_crop.py | 7 +- tests/ut/python/dataset/test_rename.py | 5 +- tests/ut/python/dataset/test_shuffle.py | 23 ++ 7 files changed, 282 insertions(+), 33 deletions(-) diff --git a/mindspore/dataset/core/configuration.py b/mindspore/dataset/core/configuration.py index d052c138d8..c08f47526e 100644 --- a/mindspore/dataset/core/configuration.py +++ b/mindspore/dataset/core/configuration.py @@ -15,7 +15,7 @@ """ The configuration manager. """ - +import random import mindspore._c_dataengine as cde INT32_MAX = 2147483647 @@ -32,6 +32,12 @@ class ConfigurationManager: """ Set the seed to be used in any random generator. This is used to produce deterministic results. + Note: + This set_seed function sets the seed in the python random library function for deterministic + python augmentations using randomness. This set_seed function should be called with every + iterator created to reset the random seed. In our pipeline this does not guarantee + deterministic results with num_parallel_workers > 1. + Args: seed(int): seed to be set @@ -47,6 +53,7 @@ class ConfigurationManager: if seed < 0 or seed > UINT32_MAX: raise ValueError("Seed given is not within the required range") self.config.set_seed(seed) + random.seed(seed) def get_seed(self): """ diff --git a/tests/ut/python/dataset/test_config.py b/tests/ut/python/dataset/test_config.py index 0c1e0073af..8a7c0f2911 100644 --- a/tests/ut/python/dataset/test_config.py +++ b/tests/ut/python/dataset/test_config.py @@ -13,14 +13,19 @@ # limitations under the License. # ============================================================================== """ -Testing configuration manager +Testing configuration manager """ import filecmp import glob +import numpy as np import os +from mindspore import log as logger + import mindspore.dataset as ds import mindspore.dataset.transforms.vision.c_transforms as vision +import mindspore.dataset.transforms.vision.py_transforms as py_vision + DATA_DIR = ["../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"] SCHEMA_DIR = "../data/dataset/test_tf_file_3_images/datasetSchema.json" @@ -46,9 +51,17 @@ def test_basic(): assert ds.config.get_prefetch_size() == 4 assert ds.config.get_seed() == 5 + +def test_get_seed(): + """ + This gets the seed value without explicitly setting a default, expect int. + """ + assert isinstance(ds.config.get_seed(), int) + + def test_pipeline(): - """ - Test that our configuration pipeline works when we set parameters at dataset interval + """ + Test that our configuration pipeline works when we set parameters at different locations in dataset code """ data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False) ds.config.set_num_parallel_workers(2) @@ -60,12 +73,12 @@ def test_pipeline(): data2 = data2.map(input_columns=["image"], operations=[vision.Decode(True)]) ds.serialize(data2, "testpipeline2.json") - # check that the generated output is different + # check that the generated output is different assert (filecmp.cmp('testpipeline.json', 'testpipeline2.json')) - # this test passes currently because our num_parallel_workers don't get updated. + # this test passes currently because our num_parallel_workers don't get updated. - # remove generated jason files + # remove generated jason files file_list = glob.glob('*.json') for f in file_list: try: @@ -74,6 +87,209 @@ def test_pipeline(): logger.info("Error while deleting: {}".format(f)) +def test_deterministic_run_fail(): + """ + Test RandomCrop with seed, expected to fail + """ + logger.info("test_deterministic_run_fail") + + # when we set the seed all operations within our dataset should be deterministic + ds.config.set_seed(0) + ds.config.set_num_parallel_workers(1) + # First dataset + data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False) + # Assuming we get the same seed on calling constructor, if this op is re-used then result won't be + # the same in between the two datasets. For example, RandomCrop constructor takes seed (0) + # outputs a deterministic series of numbers, e,g "a" = [1, 2, 3, 4, 5, 6] <- pretend these are random + random_crop_op = vision.RandomCrop([512, 512], [200, 200, 200, 200]) + decode_op = vision.Decode() + data1 = data1.map(input_columns=["image"], operations=decode_op) + data1 = data1.map(input_columns=["image"], operations=random_crop_op) + + # Second dataset + data2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False) + data2 = data2.map(input_columns=["image"], operations=decode_op) + # If seed is set up on constructor + data2 = data2.map(input_columns=["image"], operations=random_crop_op) + + try: + for item1, item2 in zip(data1.create_dict_iterator(), data2.create_dict_iterator()): + np.testing.assert_equal (item1["image"], item2["image"]) + + except BaseException as e: + # two datasets split the number out of the sequence a + logger.info("Got an exception in DE: {}".format(str(e))) + assert "Array" in str(e) + + +def test_deterministic_run_pass(): + """ + Test deterministic run with with setting the seed + """ + logger.info("test_deterministic_run_pass") + ds.config.set_seed(0) + ds.config.set_num_parallel_workers(1) + + # First dataset + data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False) + # We get the seed when constructor is called + random_crop_op = vision.RandomCrop([512, 512], [200, 200, 200, 200]) + decode_op = vision.Decode() + data1 = data1.map(input_columns=["image"], operations=decode_op) + data1 = data1.map(input_columns=["image"], operations=random_crop_op) + + # Second dataset + data2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False) + data2 = data2.map(input_columns=["image"], operations=decode_op) + # Since seed is set up on constructor, so the two ops output deterministic sequence. + # Assume the generated random sequence "a" = [1, 2, 3, 4, 5, 6] <- pretend these are random + random_crop_op2 = vision.RandomCrop([512, 512], [200, 200, 200, 200]) + data2 = data2.map(input_columns=["image"], operations=random_crop_op2) + try: + for item1, item2 in zip(data1.create_dict_iterator(), data2.create_dict_iterator()): + np.testing.assert_equal (item1["image"], item2["image"]) + except BaseException as e: + # two datasets both use numbers from the generated sequence "a" + logger.info("Got an exception in DE: {}".format(str(e))) + assert "Array" in str(e) + + +def test_seed_undeterministic(): + """ + Test seed with num parallel workers in c, this test is expected to fail some of the time + """ + logger.info("test_seed_undeterministic") + ds.config.set_seed(0) + + # First dataset + data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False) + # seed will be read in during constructor call + random_crop_op = vision.RandomCrop([512, 512], [200, 200, 200, 200]) + decode_op = vision.Decode() + data1 = data1.map(input_columns=["image"], operations=decode_op) + data1 = data1.map(input_columns=["image"], operations=random_crop_op) + + # Second dataset + data2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False) + data2 = data2.map(input_columns=["image"], operations=decode_op) + # If seed is set up on constructor, so the two ops output deterministic sequence + random_crop_op2 = vision.RandomCrop([512, 512], [200, 200, 200, 200]) + data2 = data2.map(input_columns=["image"], operations=random_crop_op2) + + for item1, item2 in zip(data1.create_dict_iterator(), data2.create_dict_iterator()): + np.testing.assert_equal (item1["image"], item2["image"]) + + +def test_deterministic_run_distribution(): + """ + Test deterministic run with with setting the seed being used in a distribution + """ + logger.info("test_deterministic_run_distribution") + + # when we set the seed all operations within our dataset should be deterministic + ds.config.set_seed(0) + ds.config.set_num_parallel_workers(1) + + # First dataset + data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False) + random_crop_op = vision.RandomHorizontalFlip(0.1) + decode_op = vision.Decode() + data1 = data1.map(input_columns=["image"], operations=decode_op) + data1 = data1.map(input_columns=["image"], operations=random_crop_op) + + # Second dataset + data2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False) + data2 = data2.map(input_columns=["image"], operations=decode_op) + # If seed is set up on constructor, so the two ops output deterministic sequence + random_crop_op2 = vision.RandomHorizontalFlip(0.1) + data2 = data2.map(input_columns=["image"], operations=random_crop_op2) + + for item1, item2 in zip(data1.create_dict_iterator(), data2.create_dict_iterator()): + np.testing.assert_equal (item1["image"], item2["image"]) + + +def test_deterministic_python_seed(): + """ + Test deterministic execution with seed in python + """ + logger.info("deterministic_random_crop_op_python_2") + ds.config.set_seed(0) + ds.config.set_num_parallel_workers(1) + + # First dataset + data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False) + + transforms = [ + py_vision.Decode(), + py_vision.RandomCrop([512, 512], [200, 200, 200, 200]), + py_vision.ToTensor(), + ] + transform = py_vision.ComposeOp(transforms) + data1 = data1.map(input_columns=["image"], operations=transform()) + data1_output = [] + # config.set_seed() calls random.seed() + for data_one in data1.create_dict_iterator(): + data1_output.append(data_one["image"]) + + # Second dataset + data2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False) + data2 = data2.map(input_columns=["image"], operations=transform()) + # config.set_seed() calls random.seed(), resets seed for next dataset iterator + ds.config.set_seed(0) + + data2_output = [] + for data_two in data2.create_dict_iterator(): + data2_output.append(data_two["image"]) + + np.testing.assert_equal (data1_output, data2_output) + + +def test_deterministic_python_seed_multi_thread(): + """ + Test deterministic execution with seed in python, this fails with multi-thread pyfunc run + """ + logger.info("deterministic_random_crop_op_python_2") + ds.config.set_seed(0) + # when we set the seed all operations within our dataset should be deterministic + # First dataset + data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False) + transforms = [ + py_vision.Decode(), + py_vision.RandomCrop([512, 512], [200, 200, 200, 200]), + py_vision.ToTensor(), + ] + transform = py_vision.ComposeOp(transforms) + data1 = data1.map(input_columns=["image"], operations=transform(), python_multiprocessing=True) + data1_output = [] + # config.set_seed() calls random.seed() + for data_one in data1.create_dict_iterator(): + data1_output.append(data_one["image"]) + + # Second dataset + data2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False) + # If seed is set up on constructor + data2 = data2.map(input_columns=["image"], operations=transform(), python_multiprocessing=True) + # config.set_seed() calls random.seed() + ds.config.set_seed(0) + + data2_output = [] + for data_two in data2.create_dict_iterator(): + data2_output.append(data_two["image"]) + + try: + np.testing.assert_equal (data1_output, data2_output) + except BaseException as e: + # expect output to not match during multi-threaded excution + logger.info("Got an exception in DE: {}".format(str(e))) + assert "Array" in str(e) + + if __name__ == '__main__': test_basic() test_pipeline() + test_deterministic_run_pass() + test_deterministic_run_distribution() + test_deterministic_run_fail() + test_deterministic_python_seed() + test_seed_undeterministic() + test_get_seed() diff --git a/tests/ut/python/dataset/test_datasets_textfileop.py b/tests/ut/python/dataset/test_datasets_textfileop.py index 720fcdcce0..fdf4907404 100644 --- a/tests/ut/python/dataset/test_datasets_textfileop.py +++ b/tests/ut/python/dataset/test_datasets_textfileop.py @@ -36,6 +36,7 @@ def test_textline_dataset_all_file(): assert(count == 5) def test_textline_dataset_totext(): + ds.config.set_num_parallel_workers(4) data = ds.TextFileDataset(DATA_ALL_FILE, shuffle=False) count = 0 line = ["This is a text file.", "Another file.", "Be happy every day.", "End of file.", "Good luck to everyone."] diff --git a/tests/ut/python/dataset/test_random_color_adjust.py b/tests/ut/python/dataset/test_random_color_adjust.py index dcb7cd48ac..c3e7bd3d7c 100644 --- a/tests/ut/python/dataset/test_random_color_adjust.py +++ b/tests/ut/python/dataset/test_random_color_adjust.py @@ -37,7 +37,7 @@ def visualize(first, mse, second): plt.subplot(142) plt.imshow(second) - plt.title("py random_color_jitter image") + plt.title("py random_color_adjust image") plt.subplot(143) plt.imshow(first - second) @@ -50,20 +50,20 @@ def diff_mse(in1, in2): return mse * 100 -def test_random_color_jitter_op_brightness(): +def test_random_color_adjust_op_brightness(): """ Test RandomColorAdjust op """ - logger.info("test_random_color_jitter_op") + logger.info("test_random_color_adjust_op") # First dataset data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False) decode_op = c_vision.Decode() - random_jitter_op = c_vision.RandomColorAdjust((0.8, 0.8), (1, 1), (1, 1), (0, 0)) + random_adjust_op = c_vision.RandomColorAdjust((0.8, 0.8), (1, 1), (1, 1), (0, 0)) ctrans = [decode_op, - random_jitter_op, + random_adjust_op, ] data1 = data1.map(input_columns=["image"], operations=ctrans) @@ -100,20 +100,20 @@ def test_random_color_jitter_op_brightness(): # visualize(c_image, mse, py_image) -def test_random_color_jitter_op_contrast(): +def test_random_color_adjust_op_contrast(): """ Test RandomColorAdjust op """ - logger.info("test_random_color_jitter_op") + logger.info("test_random_color_adjust_op") # First dataset data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False) decode_op = c_vision.Decode() - random_jitter_op = c_vision.RandomColorAdjust((1, 1), (0.5, 0.5), (1, 1), (0, 0)) + random_adjust_op = c_vision.RandomColorAdjust((1, 1), (0.5, 0.5), (1, 1), (0, 0)) ctrans = [decode_op, - random_jitter_op + random_adjust_op ] data1 = data1.map(input_columns=["image"], operations=ctrans) @@ -156,20 +156,20 @@ def test_random_color_jitter_op_contrast(): # visualize(c_image, mse, py_image) -def test_random_color_jitter_op_saturation(): +def test_random_color_adjust_op_saturation(): """ Test RandomColorAdjust op """ - logger.info("test_random_color_jitter_op") + logger.info("test_random_color_adjust_op") # First dataset data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False) decode_op = c_vision.Decode() - random_jitter_op = c_vision.RandomColorAdjust((1, 1), (1, 1), (0.5, 0.5), (0, 0)) + random_adjust_op = c_vision.RandomColorAdjust((1, 1), (1, 1), (0.5, 0.5), (0, 0)) ctrans = [decode_op, - random_jitter_op + random_adjust_op ] data1 = data1.map(input_columns=["image"], operations=ctrans) @@ -209,20 +209,20 @@ def test_random_color_jitter_op_saturation(): # visualize(c_image, mse, py_image) -def test_random_color_jitter_op_hue(): +def test_random_color_adjust_op_hue(): """ Test RandomColorAdjust op """ - logger.info("test_random_color_jitter_op") + logger.info("test_random_color_adjust_op") # First dataset data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False) decode_op = c_vision.Decode() - random_jitter_op = c_vision.RandomColorAdjust((1, 1), (1, 1), (1, 1), (0.2, 0.2)) + random_adjust_op = c_vision.RandomColorAdjust((1, 1), (1, 1), (1, 1), (0.2, 0.2)) ctrans = [decode_op, - random_jitter_op, + random_adjust_op, ] data1 = data1.map(input_columns=["image"], operations=ctrans) @@ -264,7 +264,7 @@ def test_random_color_jitter_op_hue(): if __name__ == "__main__": - test_random_color_jitter_op_brightness() - test_random_color_jitter_op_contrast() - test_random_color_jitter_op_saturation() - test_random_color_jitter_op_hue() + test_random_color_adjust_op_brightness() + test_random_color_adjust_op_contrast() + test_random_color_adjust_op_saturation() + test_random_color_adjust_op_hue() diff --git a/tests/ut/python/dataset/test_random_crop.py b/tests/ut/python/dataset/test_random_crop.py index 81e779e9f2..2ef3a17dcc 100644 --- a/tests/ut/python/dataset/test_random_crop.py +++ b/tests/ut/python/dataset/test_random_crop.py @@ -17,8 +17,8 @@ Testing RandomCropAndResize op in DE """ import matplotlib.pyplot as plt import mindspore.dataset.transforms.vision.c_transforms as vision -from mindspore import log as logger +from mindspore import log as logger import mindspore.dataset as ds DATA_DIR = ["../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"] @@ -45,9 +45,9 @@ def visualize(a, mse, original): def test_random_crop_op(): """ - Test RandomCropAndResize op + Test RandomCrop Op """ - logger.info("test_random_crop_and_resize_op") + logger.info("test_random_crop_op") # First dataset data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False) @@ -67,3 +67,4 @@ def test_random_crop_op(): if __name__ == "__main__": test_random_crop_op() + diff --git a/tests/ut/python/dataset/test_rename.py b/tests/ut/python/dataset/test_rename.py index a1d207b116..5e7b28ed7e 100644 --- a/tests/ut/python/dataset/test_rename.py +++ b/tests/ut/python/dataset/test_rename.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +import numpy as np import mindspore.dataset as ds from mindspore import log as logger @@ -34,9 +35,9 @@ def test_rename(): for i, item in enumerate(data.create_dict_iterator()): logger.info("item[mask] is {}".format(item["masks"])) - assert item["masks"].all() == item["input_ids"].all() + np.testing.assert_equal (item["masks"], item["input_ids"]) logger.info("item[seg_ids] is {}".format(item["seg_ids"])) - assert item["segment_ids"].all() == item["seg_ids"].all() + np.testing.assert_equal (item["segment_ids"], item["seg_ids"]) # need to consume the data in the buffer num_iter += 1 logger.info("Number of data in data: {}".format(num_iter)) diff --git a/tests/ut/python/dataset/test_shuffle.py b/tests/ut/python/dataset/test_shuffle.py index 4a823c5fb7..359bdea648 100644 --- a/tests/ut/python/dataset/test_shuffle.py +++ b/tests/ut/python/dataset/test_shuffle.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +import numpy as np from util import save_and_check import mindspore.dataset as ds @@ -117,6 +118,27 @@ def test_shuffle_05(): save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN) +def test_shuffle_06(): + """ + Test shuffle: with set seed, both datasets + """ + logger.info("test_shuffle_06") + # define parameters + buffer_size = 13 + seed = 1 + + # apply dataset operations + data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES) + ds.config.set_seed(seed) + data1 = data1.shuffle(buffer_size=buffer_size) + + data2 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES) + data2 = data2.shuffle(buffer_size=buffer_size) + + for item1, item2 in zip(data1.create_dict_iterator(), data2.create_dict_iterator()): + np.testing.assert_equal (item1, item2) + + def test_shuffle_exception_01(): """ Test shuffle exception: buffer_size<0 @@ -231,6 +253,7 @@ if __name__ == '__main__': test_shuffle_03() test_shuffle_04() test_shuffle_05() + test_shuffle_06() test_shuffle_exception_01() test_shuffle_exception_02() test_shuffle_exception_03()