diff --git a/mindspore/ccsrc/minddata/dataset/api/datasets.cc b/mindspore/ccsrc/minddata/dataset/api/datasets.cc
index 2a31f30015..f5ceed8e5a 100644
--- a/mindspore/ccsrc/minddata/dataset/api/datasets.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/datasets.cc
@@ -324,7 +324,8 @@ std::shared_ptr Dataset::BuildVocab(const std::vector &colum
   // Finish building vocab by triggering GetNextRow
   std::unordered_map> row;
   iter->GetNextRow(&row);
-  if (vocab == nullptr) {
+  // vocab is filled while GetNextRow drives the pipeline; an empty map means nothing was built
+  if (vocab == nullptr || vocab->vocab().empty()) {
     MS_LOG(ERROR) << "Fail to build vocab.";
     return nullptr;
   }
diff --git a/mindspore/ccsrc/minddata/dataset/text/vocab.h b/mindspore/ccsrc/minddata/dataset/text/vocab.h
index cd59c24132..121543b421 100644
--- a/mindspore/ccsrc/minddata/dataset/text/vocab.h
+++ b/mindspore/ccsrc/minddata/dataset/text/vocab.h
@@ -107,6 +107,10 @@ class Vocab {
   // @param std::string & word - word to be added will skip if word already exists
   void append_word(const std::string &word);
 
+  // read-only access to word2id_
+  // @return const reference to word2id_ (no copy is made)
+  const std::unordered_map<WordType, WordIdType> &vocab() const { return word2id_; }
+
   // destructor
   ~Vocab() = default;
 
diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py
index f609a5f099..7eb77480ea 100644
--- a/mindspore/dataset/engine/datasets.py
+++ b/mindspore/dataset/engine/datasets.py
@@ -4457,8 +4457,8 @@ class VOCDataset(MappableDataset):
         task (str): Set the task type of reading voc data, now only support "Segmentation" or "Detection"
             (default="Segmentation").
         mode (str): Set the data list txt file to be readed (default="train").
-        class_indexing (dict, optional): A str-to-int mapping from label name to index
-            (default=None, the folder names will be sorted alphabetically and each
+        class_indexing (dict, optional): A str-to-int mapping from label name to index, only valid in
+            "Detection" task (default=None, the folder names will be sorted alphabetically and each
             class will be given a unique index starting from 0).
         num_samples (int, optional): The number of images to be included in the dataset
             (default=None, all images).
diff --git a/tests/ut/cpp/dataset/c_api_dataset_vocab.cc b/tests/ut/cpp/dataset/c_api_dataset_vocab.cc
index 87d5046c44..78276702df 100644
--- a/tests/ut/cpp/dataset/c_api_dataset_vocab.cc
+++ b/tests/ut/cpp/dataset/c_api_dataset_vocab.cc
@@ -252,3 +252,18 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetFail2) {
                                                 std::numeric_limits::max(), {"", ""}, true);
   EXPECT_EQ(vocab, nullptr);
 }
+
+TEST_F(MindDataTestPipeline, TestVocabFromDatasetFail3) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDatasetFail3.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  // ASSERT rather than EXPECT: ds is dereferenced below, so the test must stop here if it is null
+  ASSERT_NE(ds, nullptr);
+
+  // Create vocab from dataset
+  // Expected failure: column name does not exist in ds
+  std::shared_ptr<Vocab> vocab = ds->BuildVocab({"ColumnNotExist"});
+  EXPECT_EQ(vocab, nullptr);
+}