Change default value of sampler to separate behaviour of default sampler

& null sampler, add check for duplicate column name
5 years ago · 419478b410
parent 8b5c35210f
commit 419478b410
11 changed files with 356 additions and 127 deletions
--- a/mindspore/ccsrc/minddata/dataset/api/datasets.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/datasets.cc
--- a/mindspore/ccsrc/minddata/dataset/include/datasets.h
+++ b/mindspore/ccsrc/minddata/dataset/include/datasets.h
@ -87,44 +87,44 @@ std::shared_ptr<SchemaObj> Schema(const std::string &schema_file = "");
 /// \param[in] column_names Column names used to specify columns to load, if empty, will read all columns.
 ///     (default = {})
 /// \param[in] decode the option to decode the images in dataset (default = false)
-/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`,
-///     A `RandomSampler` will be used to randomly iterate the entire dataset (default = nullptr)
+/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
+///     a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
 /// \return Shared pointer to the current Dataset
 std::shared_ptr<AlbumDataset> Album(const std::string &dataset_dir, const std::string &data_schema,
                                    const std::vector<std::string> &column_names = {}, bool decode = false,
-                                    const std::shared_ptr<SamplerObj> &sampler = nullptr);
+                                    const std::shared_ptr<SamplerObj> &sampler = RandomSampler());

 /// \brief Function to create a CelebADataset
 /// \notes The generated dataset has two columns ['image', 'attr'].
 //      The type of the image tensor is uint8. The attr tensor is uint32 and one hot type.
 /// \param[in] dataset_dir Path to the root directory that contains the dataset.
 /// \param[in] dataset_type One of 'all', 'train', 'valid' or 'test'.
-/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler`
-///     will be used to randomly iterate the entire dataset
+/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
+///     a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
 /// \param[in] decode Decode the images after reading (default=false).
 /// \param[in] extensions Set of file extensions to be included in the dataset (default={}).
 /// \return Shared pointer to the current Dataset
 std::shared_ptr<CelebADataset> CelebA(const std::string &dataset_dir, const std::string &dataset_type = "all",
-                                      const std::shared_ptr<SamplerObj> &sampler = nullptr, bool decode = false,
+                                      const std::shared_ptr<SamplerObj> &sampler = RandomSampler(), bool decode = false,
                                      const std::set<std::string> &extensions = {});

 /// \brief Function to create a Cifar10 Dataset
 /// \notes The generated dataset has two columns ['image', 'label']
 /// \param[in] dataset_dir Path to the root directory that contains the dataset
-/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler`
-///     will be used to randomly iterate the entire dataset
+/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
+///     a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
 /// \return Shared pointer to the current Dataset
 std::shared_ptr<Cifar10Dataset> Cifar10(const std::string &dataset_dir,
-                                        const std::shared_ptr<SamplerObj> &sampler = nullptr);
+                                        const std::shared_ptr<SamplerObj> &sampler = RandomSampler());

 /// \brief Function to create a Cifar100 Dataset
 /// \notes The generated dataset has three columns ['image', 'coarse_label', 'fine_label']
 /// \param[in] dataset_dir Path to the root directory that contains the dataset
-/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler`
-///     will be used to randomly iterate the entire dataset
+/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
+///     a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
 /// \return Shared pointer to the current Dataset
 std::shared_ptr<Cifar100Dataset> Cifar100(const std::string &dataset_dir,
-                                          const std::shared_ptr<SamplerObj> &sampler = nullptr);
+                                          const std::shared_ptr<SamplerObj> &sampler = RandomSampler());

 /// \brief Function to create a CLUEDataset
 /// \notes The generated dataset has a variable number of columns depending on the task and usage
@ -161,12 +161,12 @@ std::shared_ptr<CLUEDataset> CLUE(const std::vector<std::string> &dataset_files,
 /// \param[in] annotation_file Path to the annotation json
 /// \param[in] task Set the task type of reading coco data, now support 'Detection'/'Stuff'/'Panoptic'/'Keypoint'
 /// \param[in] decode Decode the images after reading
-/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler`
-///     will be used to randomly iterate the entire dataset
+/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
+///     a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
 /// \return Shared pointer to the current Dataset
 std::shared_ptr<CocoDataset> Coco(const std::string &dataset_dir, const std::string &annotation_file,
                                  const std::string &task = "Detection", const bool &decode = false,
-                                  const std::shared_ptr<SamplerObj> &sampler = nullptr);
+                                  const std::shared_ptr<SamplerObj> &sampler = RandomSampler());

 /// \brief Function to create a CSVDataset
 /// \notes The generated dataset has a variable number of columns
@ -200,13 +200,13 @@ std::shared_ptr<CSVDataset> CSV(const std::vector<std::string> &dataset_files, c
 ///     The generated dataset has two columns ['image', 'label']
 /// \param[in] dataset_dir Path to the root directory that contains the dataset
 /// \param[in] decode A flag to decode in ImageFolder
-/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`,
-///     A `RandomSampler` will be used to randomly iterate the entire dataset
+/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
+///     a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
 /// \param[in] extensions File extensions to be read
 /// \param[in] class_indexing a class name to label map
 /// \return Shared pointer to the current ImageFolderDataset
 std::shared_ptr<ImageFolderDataset> ImageFolder(const std::string &dataset_dir, bool decode = false,
-                                                const std::shared_ptr<SamplerObj> &sampler = nullptr,
+                                                const std::shared_ptr<SamplerObj> &sampler = RandomSampler(),
                                                const std::set<std::string> &extensions = {},
                                                const std::map<std::string, int32_t> &class_indexing = {});

@ -214,25 +214,25 @@ std::shared_ptr<ImageFolderDataset> ImageFolder(const std::string &dataset_dir,
 /// \notes The generated dataset has two columns ['image', 'label']
 /// \param[in] dataset_file The dataset file to be read
 /// \param[in] usage Need "train", "eval" or "inference" data (default="train")
-/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`,
-///     A `RandomSampler` will be used to randomly iterate the entire dataset
+/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
+///     a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
 /// \param[in] class_indexing A str-to-int mapping from label name to index (default={}, the folder
 ///     names will be sorted alphabetically and each class will be given a unique index starting from 0).
 /// \param[in] decode Decode the images after reading (default=false).
 /// \return Shared pointer to the current ManifestDataset
-std::shared_ptr<ManifestDataset> Manifest(std::string dataset_file, std::string usage = "train",
-                                          std::shared_ptr<SamplerObj> sampler = nullptr,
+std::shared_ptr<ManifestDataset> Manifest(const std::string &dataset_file, const std::string &usage = "train",
+                                          const std::shared_ptr<SamplerObj> &sampler = RandomSampler(),
                                          const std::map<std::string, int32_t> &class_indexing = {},
                                          bool decode = false);

 /// \brief Function to create a MnistDataset
 /// \notes The generated dataset has two columns ['image', 'label']
 /// \param[in] dataset_dir Path to the root directory that contains the dataset
-/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`,
-///     A `RandomSampler` will be used to randomly iterate the entire dataset
+/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
+///     a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
 /// \return Shared pointer to the current MnistDataset
 std::shared_ptr<MnistDataset> Mnist(const std::string &dataset_dir,
-                                    const std::shared_ptr<SamplerObj> &sampler = nullptr);
+                                    const std::shared_ptr<SamplerObj> &sampler = RandomSampler());

 /// \brief Function to create a ConcatDataset
 /// \notes Reload "+" operator to concat two datasets
@ -246,14 +246,14 @@ std::shared_ptr<ConcatDataset> operator+(const std::shared_ptr<Dataset> &dataset
 /// \param[in] total_rows Number of rows for the dataset to generate (default=0, number of rows is random)
 /// \param[in] schema SchemaObj to set column type, data type and data shape
 /// \param[in] columns_list List of columns to be read (default={}, read all columns)
-/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler`
-///    will be used to randomly iterate the entire dataset
+/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
+///     a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
 /// \return Shared pointer to the current Dataset
 template <typename T = std::shared_ptr<SchemaObj>>
 std::shared_ptr<RandomDataset> RandomData(const int32_t &total_rows = 0, T schema = nullptr,
                                           const std::vector<std::string> &columns_list = {},
-                                          std::shared_ptr<SamplerObj> sampler = nullptr) {
-  auto ds = std::make_shared<RandomDataset>(total_rows, schema, std::move(columns_list), std::move(sampler));
+                                          const std::shared_ptr<SamplerObj> &sampler = RandomSampler()) {
+  auto ds = std::make_shared<RandomDataset>(total_rows, schema, columns_list, sampler);  // const refs: copied
   return ds->ValidateParams() ? ds : nullptr;
 }

@ -286,13 +286,13 @@ std::shared_ptr<TextFileDataset> TextFile(const std::vector<std::string> &datase
 /// \param[in] mode Set the data list txt file to be readed
 /// \param[in] class_indexing A str-to-int mapping from label name to index
 /// \param[in] decode Decode the images after reading
-/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler`
-///     will be used to randomly iterate the entire dataset
+/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
+///     a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
 /// \return Shared pointer to the current Dataset
 std::shared_ptr<VOCDataset> VOC(const std::string &dataset_dir, const std::string &task = "Segmentation",
                                const std::string &mode = "train",
                                const std::map<std::string, int32_t> &class_indexing = {}, bool decode = false,
-                                const std::shared_ptr<SamplerObj> &sampler = nullptr);
+                                const std::shared_ptr<SamplerObj> &sampler = RandomSampler());

 /// \brief Function to create a ZipDataset
 /// \notes Applies zip to the dataset
@ -756,7 +756,7 @@ class ImageFolderDataset : public Dataset {
 class ManifestDataset : public Dataset {
 public:
  /// \brief Constructor
-  ManifestDataset(std::string dataset_file, std::string usage, std::shared_ptr<SamplerObj> sampler,
+  ManifestDataset(const std::string &dataset_file, const std::string &usage, const std::shared_ptr<SamplerObj> &sampler,
                  const std::map<std::string, int32_t> &class_indexing, bool decode);

  /// \brief Destructor
@ -808,7 +808,7 @@ class RandomDataset : public Dataset {

  /// \brief Constructor
  RandomDataset(const int32_t &total_rows, std::shared_ptr<SchemaObj> schema,
-                const std::vector<std::string> &columns_list, std::shared_ptr<SamplerObj> sampler)
+                const std::vector<std::string> &columns_list, const std::shared_ptr<SamplerObj> &sampler)
      : total_rows_(total_rows),
        schema_path_(""),
        schema_(std::move(schema)),
@ -816,8 +816,8 @@ class RandomDataset : public Dataset {
        sampler_(std::move(sampler)) {}

  /// \brief Constructor
-  RandomDataset(const int32_t &total_rows, std::string schema_path, std::vector<std::string> columns_list,
-                std::shared_ptr<SamplerObj> sampler)
+  RandomDataset(const int32_t &total_rows, std::string schema_path, const std::vector<std::string> &columns_list,
+                const std::shared_ptr<SamplerObj> &sampler)
      : total_rows_(total_rows), schema_path_(schema_path), columns_list_(columns_list), sampler_(std::move(sampler)) {}

  /// \brief Destructor
--- a/tests/ut/cpp/dataset/c_api_dataset_album_test.cc
+++ b/tests/ut/cpp/dataset/c_api_dataset_album_test.cc
@ -93,7 +93,7 @@ TEST_F(MindDataTestPipeline, TestAlbumDecode) {

 TEST_F(MindDataTestPipeline, TestAlbumNumSamplers) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumNumSamplers.";
-  
+
  std::string folder_path = datasets_root_path_ + "/testAlbum/images";
  std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
  std::vector<std::string> column_names = {"image", "label", "id"};
@ -134,3 +134,25 @@ TEST_F(MindDataTestPipeline, TestAlbumError) {

  EXPECT_EQ(ds, nullptr);
 }
+
+TEST_F(MindDataTestPipeline, TestAlbumWithNullSampler) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumWithNullSampler.";
+  std::string folder_path = datasets_root_path_ + "/testAlbum/images";
+  std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
+  std::vector<std::string> column_names = {"image", "label", "id"};
+  // Create an Album Dataset, explicitly passing nullptr as the sampler
+  std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names, true, nullptr);
+  // Expect failure: sampler can not be nullptr
+  EXPECT_EQ(ds, nullptr);
+}
+
+TEST_F(MindDataTestPipeline, TestAlbumDuplicateColumnName) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumDuplicateColumnName.";
+  std::string folder_path = datasets_root_path_ + "/testAlbum/images";
+  std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
+  std::vector<std::string> column_names = {"image", "image", "id"};
+  // Create an Album Dataset; "image" appears twice in column_names
+  std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names, true);
+  // Expect failure: duplicate column names
+  EXPECT_EQ(ds, nullptr);
+}
--- a/tests/ut/cpp/dataset/c_api_dataset_cifar_test.cc
+++ b/tests/ut/cpp/dataset/c_api_dataset_cifar_test.cc
@ -107,3 +107,33 @@ TEST_F(MindDataTestPipeline, TestCifar10DatasetFail1) {
  std::shared_ptr<Dataset> ds = Cifar10("", RandomSampler(false, 10));
  EXPECT_EQ(ds, nullptr);
 }
+
+TEST_F(MindDataTestPipeline, TestCifar10DatasetWithNullSampler) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCifar10DatasetWithNullSampler.";
+
+  // Create a Cifar10 Dataset
+  std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
+  std::shared_ptr<Dataset> ds = Cifar10(folder_path, nullptr);
+  // Expect failure: sampler can not be nullptr
+  EXPECT_EQ(ds, nullptr);
+}
+
+TEST_F(MindDataTestPipeline, TestCifar100DatasetWithNullSampler) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCifar100DatasetWithNullSampler.";
+
+  // Create a Cifar100 Dataset
+  std::string folder_path = datasets_root_path_ + "/testCifar100Data/";
+  std::shared_ptr<Dataset> ds = Cifar100(folder_path, nullptr);
+  // Expect failure: sampler can not be nullptr
+  EXPECT_EQ(ds, nullptr);
+}
+
+TEST_F(MindDataTestPipeline, TestCifar100DatasetWithWrongSampler) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCifar100DatasetWithWrongSampler.";
+
+  // Create a Cifar100 Dataset
+  std::string folder_path = datasets_root_path_ + "/testCifar100Data/";
+  std::shared_ptr<Dataset> ds = Cifar100(folder_path, RandomSampler(false, -10));
+  // Expect failure: sampler is not constructed correctly
+  EXPECT_EQ(ds, nullptr);
+}
--- a/tests/ut/cpp/dataset/c_api_dataset_coco_test.cc
+++ b/tests/ut/cpp/dataset/c_api_dataset_coco_test.cc
@ -290,3 +290,14 @@ TEST_F(MindDataTestPipeline, TestCocoStuff) {
  // Manually terminate the pipeline
  iter->Stop();
 }
+
+TEST_F(MindDataTestPipeline, TestCocoWithNullSampler) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCocoWithNullSampler.";
+  // Create a Coco Dataset
+  std::string folder_path = datasets_root_path_ + "/testCOCO/train";
+  std::string annotation_file = datasets_root_path_ + "/testCOCO/annotations/train.json";
+
+  std::shared_ptr<Dataset> ds = Coco(folder_path, annotation_file, "Detection", false, nullptr);
+  // Expect failure: sampler can not be nullptr
+  EXPECT_EQ(ds, nullptr);
+}
--- a/tests/ut/cpp/dataset/c_api_dataset_csv_test.cc
+++ b/tests/ut/cpp/dataset/c_api_dataset_csv_test.cc
@ -533,3 +533,14 @@ TEST_F(MindDataTestPipeline, TestCSVDatasetShuffleGlobal) {
  GlobalContext::config_manager()->set_seed(original_seed);
  GlobalContext::config_manager()->set_num_parallel_workers(original_num_parallel_workers);
 }
+
+TEST_F(MindDataTestPipeline, TestCSVDatasetDuplicateColumnName) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCSVDatasetDuplicateColumnName.";
+
+  // Create a CSVDataset, with single CSV file
+  std::string train_file = datasets_root_path_ + "/testCSV/1.csv";
+  std::vector<std::string> column_names = {"col1", "col1", "col3", "col4"};
+  std::shared_ptr<Dataset> ds = CSV({train_file}, ',', {}, column_names, -1, ShuffleMode::kFalse);
+  // Expect failure: duplicate column names
+  EXPECT_EQ(ds, nullptr);
+}
--- a/tests/ut/cpp/dataset/c_api_dataset_manifest_test.cc
+++ b/tests/ut/cpp/dataset/c_api_dataset_manifest_test.cc
@ -59,7 +59,7 @@ TEST_F(MindDataTestPipeline, TestManifestDecode) {

  std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
  // Create a Manifest Dataset
-  std::shared_ptr<Dataset> ds = Manifest(file_path, "train", nullptr, {}, true);
+  std::shared_ptr<Dataset> ds = Manifest(file_path, "train", RandomSampler(), {}, true);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
@ -130,7 +130,7 @@ TEST_F(MindDataTestPipeline, TestManifestClassIndex) {
  std::vector<int> expected_label = {111, 222};

  // Create a Manifest Dataset
-  std::shared_ptr<Dataset> ds = Manifest(file_path, "train", nullptr, map, true);
+  std::shared_ptr<Dataset> ds = Manifest(file_path, "train", RandomSampler(), map, true);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
@ -204,3 +204,12 @@ TEST_F(MindDataTestPipeline, TestManifestError) {
  std::shared_ptr<Dataset> ds1 = Manifest(file_path, "invalid_usage");
  EXPECT_EQ(ds1, nullptr);
 }
+
+TEST_F(MindDataTestPipeline, TestManifestWithNullSampler) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestWithNullSampler.";
+  std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
+  // Create a Manifest Dataset
+  std::shared_ptr<Dataset> ds = Manifest(file_path, "train", nullptr);
+  // Expect failure: sampler can not be nullptr
+  EXPECT_EQ(ds, nullptr);
+}
--- a/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc
+++ b/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc
@ -311,6 +311,34 @@ TEST_F(MindDataTestPipeline, TestProjectMap) {
  iter->Stop();
 }

+TEST_F(MindDataTestPipeline, TestMapDuplicateColumn) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMapDuplicateColumn.";
+
+  // Create an ImageFolder Dataset
+  std::string folder_path = datasets_root_path_ + "/testPK/data/";
+  std::shared_ptr<Dataset> ds = ImageFolder(folder_path, true, RandomSampler(false, 10));
+  EXPECT_NE(ds, nullptr);
+
+  // Create objects for the tensor ops
+  std::shared_ptr<TensorOperation> random_vertical_flip_op = vision::RandomVerticalFlip(0.5);
+  EXPECT_NE(random_vertical_flip_op, nullptr);
+
+  // Create a Map operation on ds
+  auto ds1 = ds->Map({random_vertical_flip_op}, {"image", "image"}, {}, {});
+  // Expect failure: duplicate input column name
+  EXPECT_EQ(ds1, nullptr);
+
+  // Create a Map operation on ds
+  auto ds2 = ds->Map({random_vertical_flip_op}, {}, {"label", "label"}, {});
+  // Expect failure: duplicate output column name
+  EXPECT_EQ(ds2, nullptr);
+
+  // Create a Map operation on ds
+  auto ds3 = ds->Map({random_vertical_flip_op}, {}, {}, {"image", "image"});
+  // Expect failure: duplicate project column name
+  EXPECT_EQ(ds3, nullptr);
+}
+
 TEST_F(MindDataTestPipeline, TestProjectMapAutoInjection) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline.TestProjectMapAutoInjection";

@ -395,6 +423,24 @@ TEST_F(MindDataTestPipeline, TestRenameFail2) {
  EXPECT_EQ(ds, nullptr);
 }

+TEST_F(MindDataTestPipeline, TestRenameFail3) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRenameFail3.";
+  // We expect this test to fail because of duplicate column names
+
+  // Create an ImageFolder Dataset
+  std::string folder_path = datasets_root_path_ + "/testPK/data/";
+  std::shared_ptr<Dataset> ds = ImageFolder(folder_path, true, RandomSampler(false, 10));
+  EXPECT_NE(ds, nullptr);
+
+  // Create a Rename operation on ds; "image" is repeated in the first column list
+  auto ds1 = ds->Rename({"image", "image"}, {"col1", "col2"});
+  EXPECT_EQ(ds1, nullptr);
+
+  // Create a Rename operation on ds; "col1" is repeated in the second column list
+  auto ds2 = ds->Rename({"image", "label"}, {"col1", "col1"});
+  EXPECT_EQ(ds2, nullptr);
+}
+
 TEST_F(MindDataTestPipeline, TestRenameSuccess) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRenameSuccess.";

--- a/tests/ut/cpp/dataset/c_api_dataset_randomdata_test.cc
+++ b/tests/ut/cpp/dataset/c_api_dataset_randomdata_test.cc
@ -265,4 +265,28 @@ TEST_F(MindDataTestPipeline, TestRandomDatasetBasic4) {
  // Manually terminate the pipeline
  iter->Stop();
  GlobalContext::config_manager()->set_seed(curr_seed);
-}
+}
+
+TEST_F(MindDataTestPipeline, TestRandomDatasetWithNullSampler) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetWithNullSampler.";
+
+  // Create a RandomDataset
+  std::shared_ptr<SchemaObj> schema = Schema();
+  schema->add_column("image", mindspore::TypeId::kNumberTypeUInt8, {2});
+  schema->add_column("label", mindspore::TypeId::kNumberTypeUInt8, {1});
+  std::shared_ptr<Dataset> ds = RandomData(50, schema, {}, nullptr);
+  // Expect failure: sampler can not be nullptr
+  EXPECT_EQ(ds, nullptr);
+}
+
+TEST_F(MindDataTestPipeline, TestRandomDatasetDuplicateColumnName) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetDuplicateColumnName.";
+
+  // Create a RandomDataset
+  std::shared_ptr<SchemaObj> schema = Schema();
+  schema->add_column("image", mindspore::TypeId::kNumberTypeUInt8, {2});
+  schema->add_column("label", mindspore::TypeId::kNumberTypeUInt8, {1});
+  std::shared_ptr<Dataset> ds = RandomData(50, schema, {"image", "image"});
+  // Expect failure: duplicate column names
+  EXPECT_EQ(ds, nullptr);
+}
--- a/tests/ut/cpp/dataset/c_api_dataset_voc_test.cc
+++ b/tests/ut/cpp/dataset/c_api_dataset_voc_test.cc
@ -194,3 +194,13 @@ TEST_F(MindDataTestPipeline, TestVOCSegmentationError1) {
  // Expect nullptr for segmentation task with class_index
  EXPECT_EQ(ds, nullptr);
 }
+
+TEST_F(MindDataTestPipeline, TestVOCWithNullSampler) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVOCWithNullSampler.";
+
+  // Create a VOC Dataset
+  std::string folder_path = datasets_root_path_ + "/testVOC2012_2";
+  std::shared_ptr<Dataset> ds = VOC(folder_path, "Segmentation", "train", {}, false, nullptr);
+  // Expect failure: sampler can not be nullptr
+  EXPECT_EQ(ds, nullptr);
+}
--- a/tests/ut/cpp/dataset/c_api_datasets_test.cc
+++ b/tests/ut/cpp/dataset/c_api_datasets_test.cc
@ -118,24 +118,44 @@ TEST_F(MindDataTestPipeline, TestCelebAException) {
  EXPECT_EQ(ds1, nullptr);
 }

-TEST_F(MindDataTestPipeline, TestImageFolderFail1) {
-  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderFail1.";
+TEST_F(MindDataTestPipeline, TestCelebADatasetWithNullSampler) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCelebADatasetWithNullSampler.";

-  // Create an ImageFolder Dataset
-  std::shared_ptr<Dataset> ds = ImageFolder("", true, nullptr);
+  // Create a CelebA Dataset, explicitly passing nullptr as the sampler
+  std::string folder_path = datasets_root_path_ + "/testCelebAData/";
+  std::shared_ptr<Dataset> ds = CelebA(folder_path, "all", nullptr, false, {});
+  // Expect failure: sampler can not be nullptr
   EXPECT_EQ(ds, nullptr);
 }

-TEST_F(MindDataTestPipeline, TestMnistFail1) {
-  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMnistFail1.";
+TEST_F(MindDataTestPipeline, TestMnistFailWithWrongDatasetDir) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMnistFailWithWrongDatasetDir.";

  // Create a Mnist Dataset
  std::shared_ptr<Dataset> ds = Mnist("", RandomSampler(false, 10));
  EXPECT_EQ(ds, nullptr);
 }

-TEST_F(MindDataTestPipeline, TestImageFolderFail2) {
-  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderFail2.";
+TEST_F(MindDataTestPipeline, TestMnistFailWithNullSampler) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMnistFailWithNullSampler.";
+
+  // Create a Mnist Dataset
+  std::string folder_path = datasets_root_path_ + "/testMnistData/";
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, nullptr);
+  // Expect failure: sampler can not be nullptr
+  EXPECT_EQ(ds, nullptr);
+}
+
+TEST_F(MindDataTestPipeline, TestImageFolderWithWrongDatasetDir) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderWithWrongDatasetDir.";
+
+  // Empty dataset_dir with a valid sampler, so the failure can only come from the path
+  std::shared_ptr<Dataset> ds = ImageFolder("", true, RandomSampler());
+  EXPECT_EQ(ds, nullptr);
+}
+
+TEST_F(MindDataTestPipeline, TestImageFolderFailWithWrongExtension) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderFailWithWrongExtension.";

  // Create an ImageFolder Dataset
  std::string folder_path = datasets_root_path_ + "/testPK/data/";
@ -150,8 +170,29 @@ TEST_F(MindDataTestPipeline, TestImageFolderFail2) {
  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);
+  // Expect no data: can not find files with specified extension
  EXPECT_EQ(row.size(), 0);

  // Manually terminate the pipeline
  iter->Stop();
 }
+
+TEST_F(MindDataTestPipeline, TestImageFolderFailWithNullSampler) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderFailWithNullSampler.";
+
+  // Create an ImageFolder Dataset
+  std::string folder_path = datasets_root_path_ + "/testPK/data/";
+  std::shared_ptr<Dataset> ds = ImageFolder(folder_path, true, nullptr);
+  // Expect failure: sampler can not be nullptr
+  EXPECT_EQ(ds, nullptr);
+}
+
+TEST_F(MindDataTestPipeline, TestImageFolderFailWithWrongSampler) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderFailWithWrongSampler.";
+
+  // Create an ImageFolder Dataset on the same folder the other ImageFolder tests use
+  std::string folder_path = datasets_root_path_ + "/testPK/data/";
+  std::shared_ptr<Dataset> ds = ImageFolder(folder_path, true, SequentialSampler(-2, 5));
+  // Expect failure: sampler is not constructed correctly
+  EXPECT_EQ(ds, nullptr);
+}