diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/include/datasets_bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/include/datasets_bindings.cc
index 996c7429d8..852ca3b9c9 100644
--- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/include/datasets_bindings.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/include/datasets_bindings.cc
@@ -96,23 +96,24 @@ PYBIND_REGISTER(DatasetNode, 1, ([](const py::module *m) {
 // PYBIND FOR LEAF NODES
 // (In alphabetical order)
 
-PYBIND_REGISTER(
-  CelebANode, 2, ([](const py::module *m) {
-    (void)py::class_<CelebANode, DatasetNode, std::shared_ptr<CelebANode>>(*m, "CelebANode", "to create a CelebANode")
-      .def(py::init([](std::string dataset_dir, std::string usage, std::optional<py::handle> sampler, bool decode,
-                       std::optional<py::list> extensions, std::optional<std::shared_ptr<CacheClient>> cc) {
-        auto celebA = std::make_shared<CelebANode>(dataset_dir, usage, toSamplerObj(sampler), decode,
-                                                   toStringSet(extensions), toDatasetCache(std::move(cc)));
-        THROW_IF_ERROR(celebA->ValidateParams());
-        return celebA;
-      }));
-  }));
+PYBIND_REGISTER(CelebANode, 2, ([](const py::module *m) {
+                  (void)py::class_<CelebANode, DatasetNode, std::shared_ptr<CelebANode>>(*m, "CelebANode",
+                                                                                         "to create a CelebANode")
+                    .def(py::init([](std::string dataset_dir, std::string usage, py::handle sampler, bool decode,
+                                     py::list extensions, std::shared_ptr<CacheClient> cc) {
+                      auto celebA =
+                        std::make_shared<CelebANode>(dataset_dir, usage, toSamplerObj(sampler), decode,
+                                                     toStringSet(extensions), toDatasetCache(std::move(cc)));
+                      THROW_IF_ERROR(celebA->ValidateParams());
+                      return celebA;
+                    }));
+                }));
 
 PYBIND_REGISTER(Cifar10Node, 2, ([](const py::module *m) {
                   (void)py::class_<Cifar10Node, DatasetNode, std::shared_ptr<Cifar10Node>>(*m, "Cifar10Node",
                                                                                            "to create a Cifar10Node")
-                    .def(py::init([](std::string dataset_dir, std::string usage, std::optional<py::handle> sampler,
-                                     std::optional<std::shared_ptr<CacheClient>> cc) {
+                    .def(py::init([](std::string dataset_dir, std::string usage, py::handle sampler,
+                                     std::shared_ptr<CacheClient> cc) {
                       auto cifar10 = std::make_shared<Cifar10Node>(dataset_dir, usage, toSamplerObj(sampler),
                                                                    toDatasetCache(std::move(cc)));
                       THROW_IF_ERROR(cifar10->ValidateParams());
@@ -123,8 +124,8 @@ PYBIND_REGISTER(Cifar10Node, 2, ([](const py::module *m) {
 PYBIND_REGISTER(Cifar100Node, 2, ([](const py::module *m) {
                   (void)py::class_<Cifar100Node, DatasetNode, std::shared_ptr<Cifar100Node>>(*m, "Cifar100Node",
                                                                                              "to create a Cifar100Node")
-                    .def(py::init([](std::string dataset_dir, std::string usage, std::optional<py::handle> sampler,
-                                     std::optional<std::shared_ptr<CacheClient>> cc) {
+                    .def(py::init([](std::string dataset_dir, std::string usage, py::handle sampler,
+                                     std::shared_ptr<CacheClient> cc) {
                       auto cifar100 = std::make_shared<Cifar100Node>(dataset_dir, usage, toSamplerObj(sampler),
                                                                      toDatasetCache(std::move(cc)));
                       THROW_IF_ERROR(cifar100->ValidateParams());
@@ -136,7 +137,7 @@ PYBIND_REGISTER(
   CLUENode, 2, ([](const py::module *m) {
     (void)py::class_<CLUENode, DatasetNode, std::shared_ptr<CLUENode>>(*m, "CLUENode", "to create a CLUENode")
       .def(py::init([](py::list files, std::string task, std::string usage, int64_t num_samples, int32_t shuffle,
-                       int32_t num_shards, int32_t shard_id, std::optional<std::shared_ptr<CacheClient>> cc) {
+                       int32_t num_shards, int32_t shard_id, std::shared_ptr<CacheClient> cc) {
         std::shared_ptr<CLUENode> clue_node =
           std::make_shared<CLUENode>(toStringVector(files), task, usage, num_samples, toShuffleMode(shuffle),
                                      num_shards, shard_id, toDatasetCache(std::move(cc)));
@@ -145,24 +146,24 @@
       }));
   }));
 
-PYBIND_REGISTER(
-  CocoNode, 2, ([](const py::module *m) {
-    (void)py::class_<CocoNode, DatasetNode, std::shared_ptr<CocoNode>>(*m, "CocoNode", "to create a CocoNode")
-      .def(py::init([](std::string dataset_dir, std::string annotation_file, std::string task, bool decode,
-                       std::optional<py::handle> sampler, std::optional<std::shared_ptr<CacheClient>> cc) {
-        std::shared_ptr<CocoNode> coco = std::make_shared<CocoNode>(
-          dataset_dir, annotation_file, task, decode, toSamplerObj(sampler), toDatasetCache(std::move(cc)));
-        THROW_IF_ERROR(coco->ValidateParams());
-        return coco;
-      }));
-  }));
+PYBIND_REGISTER(CocoNode, 2, ([](const py::module *m) {
+                  (void)py::class_<CocoNode, DatasetNode, std::shared_ptr<CocoNode>>(*m, "CocoNode",
+                                                                                     "to create a CocoNode")
+                    .def(py::init([](std::string dataset_dir, std::string annotation_file, std::string task,
+                                     bool decode, py::handle sampler, std::shared_ptr<CacheClient> cc) {
+                      std::shared_ptr<CocoNode> coco =
+                        std::make_shared<CocoNode>(dataset_dir, annotation_file, task, decode, toSamplerObj(sampler),
+                                                   toDatasetCache(std::move(cc)));
+                      THROW_IF_ERROR(coco->ValidateParams());
+                      return coco;
+                    }));
+                }));
 
 PYBIND_REGISTER(CSVNode, 2, ([](const py::module *m) {
                   (void)py::class_<CSVNode, DatasetNode, std::shared_ptr<CSVNode>>(*m, "CSVNode", "to create a CSVNode")
                     .def(py::init([](std::vector<std::string> csv_files, char field_delim, py::list column_defaults,
                                      std::vector<std::string> column_names, int64_t num_samples, int32_t shuffle,
-                                     int32_t num_shards, int32_t shard_id,
-                                     std::optional<std::shared_ptr<CacheClient>> cc) {
+                                     int32_t num_shards, int32_t shard_id, std::shared_ptr<CacheClient> cc) {
                       auto csv =
                         std::make_shared<CSVNode>(csv_files, field_delim, toCSVBase(column_defaults), column_names,
                                                   num_samples, toShuffleMode(shuffle), num_shards, shard_id,
                                                   toDatasetCache(std::move(cc)));
@@ -194,10 +195,10 @@ PYBIND_REGISTER(GeneratorNode, 2, ([](const py::module *m) {
 PYBIND_REGISTER(ImageFolderNode, 2, ([](const py::module *m) {
                   (void)py::class_<ImageFolderNode, DatasetNode, std::shared_ptr<ImageFolderNode>>(
                     *m, "ImageFolderNode", "to create an ImageFolderNode")
-                    .def(py::init([](std::string dataset_dir, bool decode, std::optional<py::handle> sampler,
-                                     std::optional<py::list> extensions, std::optional<py::dict> class_indexing,
-                                     std::optional<std::shared_ptr<CacheClient>> cc) {
-                      bool recursive = false;
+                    .def(py::init([](std::string dataset_dir, bool decode, py::handle sampler, py::list extensions,
+                                     py::dict class_indexing, std::shared_ptr<CacheClient> cc) {
+                      // Don't update recursive to true
+                      bool recursive = false;  // Will be removed in future PR
                       auto imagefolder = std::make_shared<ImageFolderNode>(
                         dataset_dir, decode, toSamplerObj(sampler), recursive, toStringSet(extensions),
                         toStringMap(class_indexing), toDatasetCache(std::move(cc)));
@@ -209,9 +210,8 @@ PYBIND_REGISTER(ImageFolderNode, 2, ([](const py::module *m) {
 PYBIND_REGISTER(ManifestNode, 2, ([](const py::module *m) {
                   (void)py::class_<ManifestNode, DatasetNode, std::shared_ptr<ManifestNode>>(*m, "ManifestNode",
                                                                                              "to create a ManifestNode")
-                    .def(py::init([](std::string dataset_file, std::string usage, std::optional<py::handle> sampler,
-                                     std::optional<py::dict> class_indexing, bool decode,
-                                     std::optional<std::shared_ptr<CacheClient>> cc) {
+                    .def(py::init([](std::string dataset_file, std::string usage, py::handle sampler,
+                                     py::dict class_indexing, bool decode, std::shared_ptr<CacheClient> cc) {
                       auto manifest = std::make_shared<ManifestNode>(dataset_file, usage, toSamplerObj(sampler),
                                                                      toStringMap(class_indexing), decode,
                                                                      toDatasetCache(std::move(cc)));
@@ -223,8 +223,8 @@ PYBIND_REGISTER(ManifestNode, 2, ([](const py::module *m) {
 PYBIND_REGISTER(MindDataNode, 2, ([](const py::module *m) {
                   (void)py::class_<MindDataNode, DatasetNode, std::shared_ptr<MindDataNode>>(*m, "MindDataNode",
                                                                                              "to create a MindDataNode")
-                    .def(py::init([](std::string dataset_file, std::optional<py::list> columns_list,
-                                     std::optional<py::handle> sampler, py::dict padded_sample, int64_t num_padded) {
+                    .def(py::init([](std::string dataset_file, py::list columns_list, py::handle sampler,
+                                     py::dict padded_sample, int64_t num_padded) {
                       nlohmann::json padded_sample_json;
                       std::map<std::string, std::string> sample_bytes;
                       THROW_IF_ERROR(ToJson(padded_sample, &padded_sample_json, &sample_bytes));
@@ -235,8 +235,8 @@ PYBIND_REGISTER(MindDataNode, 2, ([](const py::module *m) {
                       THROW_IF_ERROR(minddata->ValidateParams());
                       return minddata;
                     }))
-                    .def(py::init([](py::list dataset_file, std::optional<py::list> columns_list,
-                                     std::optional<py::handle> sampler, py::dict padded_sample, int64_t num_padded) {
+                    .def(py::init([](py::list dataset_file, py::list columns_list, py::handle sampler,
+                                     py::dict padded_sample, int64_t num_padded) {
                       nlohmann::json padded_sample_json;
                       std::map<std::string, std::string> sample_bytes;
                       THROW_IF_ERROR(ToJson(padded_sample, &padded_sample_json, &sample_bytes));
@@ -252,8 +252,8 @@ PYBIND_REGISTER(MindDataNode, 2, ([](const py::module *m) {
 PYBIND_REGISTER(MnistNode, 2, ([](const py::module *m) {
                   (void)py::class_<MnistNode, DatasetNode, std::shared_ptr<MnistNode>>(*m, "MnistNode",
                                                                                        "to create an MnistNode")
-                    .def(py::init([](std::string dataset_dir, std::string usage, std::optional<py::handle> sampler,
-                                     std::optional<std::shared_ptr<CacheClient>> cc) {
+                    .def(py::init([](std::string dataset_dir, std::string usage, py::handle sampler,
+                                     std::shared_ptr<CacheClient> cc) {
                       auto mnist = std::make_shared<MnistNode>(dataset_dir, usage, toSamplerObj(sampler),
                                                                toDatasetCache(std::move(cc)));
                       THROW_IF_ERROR(mnist->ValidateParams());
@@ -264,15 +264,14 @@ PYBIND_REGISTER(MnistNode, 2, ([](const py::module *m) {
 PYBIND_REGISTER(
   RandomNode, 2, ([](const py::module *m) {
     (void)py::class_<RandomNode, DatasetNode, std::shared_ptr<RandomNode>>(*m, "RandomNode", "to create a RandomNode")
-      .def(py::init([](int32_t total_rows, std::shared_ptr<SchemaObj> schema, std::optional<py::list> columns_list,
-                       std::optional<std::shared_ptr<CacheClient>> cc) {
+      .def(py::init([](int32_t total_rows, std::shared_ptr<SchemaObj> schema, py::list columns_list,
+                       std::shared_ptr<CacheClient> cc) {
         auto random_node = std::make_shared<RandomNode>(total_rows, schema, toStringVector(columns_list),
                                                         toDatasetCache(std::move(cc)));
         THROW_IF_ERROR(random_node->ValidateParams());
         return random_node;
       }))
-      .def(py::init([](int32_t total_rows, std::string schema, std::optional<py::list> columns_list,
-                       std::optional<std::shared_ptr<CacheClient>> cc) {
+      .def(py::init([](int32_t total_rows, std::string schema, py::list columns_list, std::shared_ptr<CacheClient> cc) {
         auto random_node = std::make_shared<RandomNode>(total_rows, schema, toStringVector(columns_list),
                                                         toDatasetCache(std::move(cc)));
         THROW_IF_ERROR(random_node->ValidateParams());
@@ -284,7 +283,7 @@ PYBIND_REGISTER(TextFileNode, 2, ([](const py::module *m) {
                   (void)py::class_<TextFileNode, DatasetNode, std::shared_ptr<TextFileNode>>(*m, "TextFileNode",
                                                                                              "to create a TextFileNode")
                     .def(py::init([](py::list dataset_files, int32_t num_samples, int32_t shuffle, int32_t num_shards,
-                                     int32_t shard_id, std::optional<std::shared_ptr<CacheClient>> cc) {
+                                     int32_t shard_id, std::shared_ptr<CacheClient> cc) {
                       std::shared_ptr<TextFileNode> textfile_node = std::make_shared<TextFileNode>(
                         toStringVector(dataset_files), num_samples, toShuffleMode(shuffle), num_shards, shard_id,
                         toDatasetCache(std::move(cc)));
@@ -293,44 +292,34 @@
                     }));
                 }));
 
-PYBIND_REGISTER(
-  TFRecordNode, 2, ([](const py::module *m) {
-    (void)py::class_<TFRecordNode, DatasetNode, std::shared_ptr<TFRecordNode>>(*m, "TFRecordNode",
-                                                                               "to create a TFRecordNode")
-      .def(py::init([](py::list dataset_files, std::shared_ptr<SchemaObj> schema, std::optional<py::list> columns_list,
-                       std::optional<int64_t> num_samples, int32_t shuffle, std::optional<int32_t> num_shards,
-                       std::optional<int32_t> shard_id, bool shard_equal_rows,
-                       std::optional<std::shared_ptr<CacheClient>> cc) {
-        if (!num_samples) {
-          *num_samples = 0;
-        }
-        std::shared_ptr<TFRecordNode> tfrecord = std::make_shared<TFRecordNode>(
-          toStringVector(dataset_files), schema, toStringVector(columns_list), *num_samples, toShuffleMode(shuffle),
-          *num_shards, *shard_id, shard_equal_rows, toDatasetCache(std::move(cc)));
-        THROW_IF_ERROR(tfrecord->ValidateParams());
-        return tfrecord;
-      }))
-      .def(py::init([](py::list dataset_files, std::string schema, std::optional<py::list> columns_list,
-                       std::optional<int64_t> num_samples, int32_t shuffle, std::optional<int32_t> num_shards,
-                       std::optional<int32_t> shard_id, bool shard_equal_rows,
-                       std::optional<std::shared_ptr<CacheClient>> cc) {
-        if (!num_samples) {
-          *num_samples = 0;
-        }
-        std::shared_ptr<TFRecordNode> tfrecord = std::make_shared<TFRecordNode>(
-          toStringVector(dataset_files), schema, toStringVector(columns_list), *num_samples, toShuffleMode(shuffle),
-          *num_shards, *shard_id, shard_equal_rows, toDatasetCache(std::move(cc)));
-        THROW_IF_ERROR(tfrecord->ValidateParams());
-        return tfrecord;
-      }));
-  }));
+PYBIND_REGISTER(TFRecordNode, 2, ([](const py::module *m) {
+                  (void)py::class_<TFRecordNode, DatasetNode, std::shared_ptr<TFRecordNode>>(*m, "TFRecordNode",
+                                                                                             "to create a TFRecordNode")
+                    .def(py::init([](py::list dataset_files, std::shared_ptr<SchemaObj> schema, py::list columns_list,
+                                     int64_t num_samples, int32_t shuffle, int32_t num_shards, int32_t shard_id,
+                                     bool shard_equal_rows, std::shared_ptr<CacheClient> cc) {
+                      std::shared_ptr<TFRecordNode> tfrecord = std::make_shared<TFRecordNode>(
+                        toStringVector(dataset_files), schema, toStringVector(columns_list), num_samples,
+                        toShuffleMode(shuffle), num_shards, shard_id, shard_equal_rows, toDatasetCache(std::move(cc)));
+                      THROW_IF_ERROR(tfrecord->ValidateParams());
+                      return tfrecord;
+                    }))
+                    .def(py::init([](py::list dataset_files, std::string schema, py::list columns_list,
+                                     int64_t num_samples, int32_t shuffle, int32_t num_shards, int32_t shard_id,
+                                     bool shard_equal_rows, std::shared_ptr<CacheClient> cc) {
+                      std::shared_ptr<TFRecordNode> tfrecord = std::make_shared<TFRecordNode>(
+                        toStringVector(dataset_files), schema, toStringVector(columns_list), num_samples,
+                        toShuffleMode(shuffle), num_shards, shard_id, shard_equal_rows, toDatasetCache(std::move(cc)));
+                      THROW_IF_ERROR(tfrecord->ValidateParams());
+                      return tfrecord;
+                    }));
+                }));
 
 PYBIND_REGISTER(VOCNode, 2, ([](const py::module *m) {
                   (void)py::class_<VOCNode, DatasetNode, std::shared_ptr<VOCNode>>(*m, "VOCNode", "to create a VOCNode")
                     .def(
-                      py::init([](std::string dataset_dir, std::string task, std::string usage,
-                                  std::optional<py::dict> class_indexing, bool decode,
-                                  std::optional<py::handle> sampler, std::optional<std::shared_ptr<CacheClient>> cc) {
+                      py::init([](std::string dataset_dir, std::string task, std::string usage, py::dict class_indexing,
+                                  bool decode, py::handle sampler, std::shared_ptr<CacheClient> cc) {
                         std::shared_ptr<VOCNode> voc =
                           std::make_shared<VOCNode>(dataset_dir, task, usage, toStringMap(class_indexing), decode,
                                                     toSamplerObj(sampler), toDatasetCache(std::move(cc)));
@@ -416,15 +405,14 @@ PYBIND_REGISTER(BuildVocabNode, 2, ([](const py::module *m) {
 PYBIND_REGISTER(ConcatNode, 2, ([](const py::module *m) {
                   (void)py::class_<ConcatNode, DatasetNode, std::shared_ptr<ConcatNode>>(*m, "ConcatNode",
                                                                                          "to create a ConcatNode")
-                    .def(
-                      py::init([](std::vector<std::shared_ptr<DatasetNode>> datasets, std::optional<py::handle> sampler,
-                                  py::list children_flag_and_nums, py::list children_start_end_index) {
-                        auto concat = std::make_shared<ConcatNode>(datasets, toSamplerObj(sampler),
-                                                                   toPairVector(children_flag_and_nums),
-                                                                   toPairVector(children_start_end_index));
-                        THROW_IF_ERROR(concat->ValidateParams());
-                        return concat;
-                      }));
+                    .def(py::init([](std::vector<std::shared_ptr<DatasetNode>> datasets, py::handle sampler,
+                                     py::list children_flag_and_nums, py::list children_start_end_index) {
+                      auto concat = std::make_shared<ConcatNode>(datasets, toSamplerObj(sampler),
+                                                                 toPairVector(children_flag_and_nums),
+                                                                 toPairVector(children_start_end_index));
+                      THROW_IF_ERROR(concat->ValidateParams());
+                      return concat;
+                    }));
                 }));
 
 PYBIND_REGISTER(FilterNode, 2, ([](const py::module *m) {
@@ -441,10 +429,8 @@ PYBIND_REGISTER(FilterNode, 2, ([](const py::module *m) {
 PYBIND_REGISTER(MapNode, 2, ([](const py::module *m) {
                   (void)py::class_<MapNode, DatasetNode, std::shared_ptr<MapNode>>(*m, "MapNode", "to create a MapNode")
-                    .def(py::init([](std::shared_ptr<DatasetNode> self, std::optional<py::list> operations,
-                                     std::optional<py::list> input_columns, std::optional<py::list> output_columns,
-                                     std::optional<py::list> project_columns,
-                                     std::optional<std::shared_ptr<CacheClient>> cc,
+                    .def(py::init([](std::shared_ptr<DatasetNode> self, py::list operations, py::list input_columns,
+                                     py::list output_columns, py::list project_columns, std::shared_ptr<CacheClient> cc,
                                      std::vector<std::shared_ptr<PyDSCallback>> py_callbacks) {
                       auto map = std::make_shared<MapNode>(
                         self, std::move(toTensorOperations(operations)), toStringVector(input_columns),
@@ -465,17 +451,15 @@ PYBIND_REGISTER(ProjectNode, 2, ([](const py::module *m) {
                     }));
                 }));
 
-PYBIND_REGISTER(RenameNode, 2, ([](const py::module *m) {
-                  (void)py::class_<RenameNode, DatasetNode, std::shared_ptr<RenameNode>>(*m, "RenameNode",
-                                                                                         "to create a RenameNode")
-                    .def(py::init([](std::shared_ptr<DatasetNode> self, std::optional<py::list> input_columns,
-                                     std::optional<py::list> output_columns) {
-                      auto rename = std::make_shared<RenameNode>(self, toStringVector(input_columns),
-                                                                 toStringVector(output_columns));
-                      THROW_IF_ERROR(rename->ValidateParams());
-                      return rename;
-                    }));
-                }));
+PYBIND_REGISTER(
+  RenameNode, 2, ([](const py::module *m) {
+    (void)py::class_<RenameNode, DatasetNode, std::shared_ptr<RenameNode>>(*m, "RenameNode", "to create a RenameNode")
+      .def(py::init([](std::shared_ptr<DatasetNode> self, py::list input_columns, py::list output_columns) {
+        auto rename = std::make_shared<RenameNode>(self, toStringVector(input_columns), toStringVector(output_columns));
+        THROW_IF_ERROR(rename->ValidateParams());
+        return rename;
+      }));
+  }));
 
 PYBIND_REGISTER(RepeatNode, 2, ([](const py::module *m) {
                   (void)py::class_<RepeatNode, DatasetNode, std::shared_ptr<RepeatNode>>(*m, "RepeatNode",
diff --git a/mindspore/ccsrc/minddata/dataset/api/python/pybind_conversion.cc b/mindspore/ccsrc/minddata/dataset/api/python/pybind_conversion.cc
index 10d941b844..1ccb1ad877 100644
--- a/mindspore/ccsrc/minddata/dataset/api/python/pybind_conversion.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/python/pybind_conversion.cc
@@ -28,10 +28,10 @@ bool toBool(const py::handle &handle) { return py::reinterpret_borrow<py::bool_
 
 std::string toString(const py::handle &handle) { return py::reinterpret_borrow<py::str>(handle); }
 
-std::set<std::string> toStringSet(const std::optional<py::list> list) {
+std::set<std::string> toStringSet(const py::list list) {
   std::set<std::string> set;
-  if (list) {
-    for (auto l : *list) {
+  if (!list.empty()) {
+    for (auto l : list) {
       if (!l.is_none()) {
         (void)set.insert(py::str(l));
       }
@@ -40,20 +40,20 @@ std::set<std::string> toStringSet(const std::optional<py::list> list) {
   return set;
 }
 
-std::map<std::string, int32_t> toStringMap(const std::optional<py::dict> dict) {
+std::map<std::string, int32_t> toStringMap(const py::dict dict) {
   std::map<std::string, int32_t> map;
-  if (dict) {
-    for (auto p : *dict) {
+  if (!dict.empty()) {
+    for (auto p : dict) {
       (void)map.emplace(toString(p.first), toInt(p.second));
     }
   }
   return map;
 }
 
-std::vector<std::string> toStringVector(const std::optional<py::list> list) {
+std::vector<std::string> toStringVector(const py::list list) {
   std::vector<std::string> vector;
-  if (list) {
-    for (auto l : *list) {
+  if (!list.empty()) {
+    for (auto l : list) {
       if (l.is_none())
         vector.emplace_back("");
       else
@@ -63,10 +63,10 @@ std::vector<std::string> toStringVector(const std::optional<py::list> list) {
   return vector;
 }
 
-std::pair<int64_t, int64_t> toIntPair(const std::optional<py::tuple> tuple) {
+std::pair<int64_t, int64_t> toIntPair(const py::tuple tuple) {
   std::pair<int64_t, int64_t> pair;
-  if (tuple) {
-    pair = std::make_pair(toInt64((*tuple)[0]), toInt64((*tuple)[1]));
+  if (!tuple.empty()) {
+    pair = std::make_pair(toInt64((tuple)[0]), toInt64((tuple)[1]));
   }
   return pair;
 }
@@ -85,10 +85,10 @@ std::vector<std::pair<int, int>> toPairVector(const py::list list) {
   return vector;
 }
 
-std::vector<std::shared_ptr<TensorOperation>> toTensorOperations(std::optional<py::list> operations) {
+std::vector<std::shared_ptr<TensorOperation>> toTensorOperations(py::list operations) {
   std::vector<std::shared_ptr<TensorOperation>> vector;
-  if (operations) {
-    for (auto op : *operations) {
+  if (!operations.empty()) {
+    for (auto op : operations) {
       std::shared_ptr<TensorOperation> tensor_op;
       if (py::isinstance<TensorOperation>(op)) {
         tensor_op = op.cast<std::shared_ptr<TensorOperation>>();
@@ -132,19 +132,19 @@ std::vector<std::shared_ptr<DatasetNode>> toDatasetNode(std::shared_ptr<DatasetN
 
-std::shared_ptr<SamplerObj> toSamplerObj(std::optional<py::handle> py_sampler, bool isMindDataset) {
+std::shared_ptr<SamplerObj> toSamplerObj(py::handle py_sampler, bool isMindDataset) {
   if (py_sampler) {
     std::shared_ptr<SamplerObj> sampler_obj;
     if (!isMindDataset) {
       // Common Sampler
       std::shared_ptr<SamplerRT> sampler;
-      auto create = py::reinterpret_borrow<py::object>(py_sampler.value()).attr("create");
+      auto create = py::reinterpret_borrow<py::object>(py_sampler).attr("create");
       sampler = create().cast<std::shared_ptr<SamplerRT>>();
       sampler_obj = std::make_shared<PreBuiltSamplerObj>(std::move(sampler));
     } else {
       // Mindrecord Sampler
      std::shared_ptr<mindrecord::ShardOperator> sampler;
-      auto create = py::reinterpret_borrow<py::object>(py_sampler.value()).attr("create_for_minddataset");
+      auto create = py::reinterpret_borrow<py::object>(py_sampler).attr("create_for_minddataset");
       sampler = create().cast<std::shared_ptr<mindrecord::ShardOperator>>();
       sampler_obj = std::make_shared<PreBuiltSamplerObj>(std::move(sampler));
     }
@@ -156,10 +156,10 @@ std::shared_ptr<SamplerObj> toSamplerObj(std::optional<py::handle> py_sampler, b
 }
 
 // Here we take in a python object, that holds a reference to a C++ object
-std::shared_ptr<DatasetCache> toDatasetCache(std::optional<std::shared_ptr<CacheClient>> cc) {
+std::shared_ptr<DatasetCache> toDatasetCache(std::shared_ptr<CacheClient> cc) {
   if (cc) {
     std::shared_ptr<DatasetCache> built_cache;
-    built_cache = std::make_shared<PreBuiltDatasetCache>(std::move(cc.value()));
+    built_cache = std::make_shared<PreBuiltDatasetCache>(std::move(cc));
     return built_cache;
   } else {
     // don't need to check here as cache is not enabled.
diff --git a/mindspore/ccsrc/minddata/dataset/api/python/pybind_conversion.h b/mindspore/ccsrc/minddata/dataset/api/python/pybind_conversion.h
index 2b6788affa..49c5e6c425 100644
--- a/mindspore/ccsrc/minddata/dataset/api/python/pybind_conversion.h
+++ b/mindspore/ccsrc/minddata/dataset/api/python/pybind_conversion.h
@@ -47,25 +47,25 @@ bool toBool(const py::handle &handle);
 
 std::string toString(const py::handle &handle);
 
-std::set<std::string> toStringSet(const std::optional<py::list> list);
+std::set<std::string> toStringSet(const py::list list);
 
-std::map<std::string, int32_t> toStringMap(const std::optional<py::dict> dict);
+std::map<std::string, int32_t> toStringMap(const py::dict dict);
 
-std::vector<std::string> toStringVector(const std::optional<py::list> list);
+std::vector<std::string> toStringVector(const py::list list);
 
-std::pair<int64_t, int64_t> toIntPair(const std::optional<py::tuple> tuple);
+std::pair<int64_t, int64_t> toIntPair(const py::tuple tuple);
 
 std::vector<std::pair<int, int>> toPairVector(const py::list list);
 
-std::vector<std::shared_ptr<TensorOperation>> toTensorOperations(std::optional<py::list> operations);
+std::vector<std::shared_ptr<TensorOperation>> toTensorOperations(py::list operations);
 
 std::shared_ptr<TensorOperation> toTensorOperation(py::handle operation);
 
 std::vector<std::shared_ptr<DatasetNode>> toDatasetNode(std::shared_ptr<DatasetNode> self, py::list datasets);
 
-std::shared_ptr<SamplerObj> toSamplerObj(std::optional<py::handle> py_sampler, bool isMindDataset = false);
+std::shared_ptr<SamplerObj> toSamplerObj(py::handle py_sampler, bool isMindDataset = false);
 
-std::shared_ptr<DatasetCache> toDatasetCache(std::optional<std::shared_ptr<CacheClient>> cc);
+std::shared_ptr<DatasetCache> toDatasetCache(std::shared_ptr<CacheClient> cc);
 
 ShuffleMode toShuffleMode(const int32_t shuffle);
diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py
index db6f1e3c1a..92bc811168 100644
--- a/mindspore/dataset/engine/datasets.py
+++ b/mindspore/dataset/engine/datasets.py
@@ -2266,13 +2266,13 @@ class MapDataset(Dataset):
         if start_ind != end_ind:
             new_ops.append(py_transforms.Compose(operations[start_ind:end_ind]))
         operations = new_ops
-        self.operations = operations
+        self.operations = replace_none(operations, [])
         if input_columns is not None and not isinstance(input_columns, list):
             input_columns = [input_columns]
         self.input_columns = replace_none(input_columns, [])
         if output_columns is not None and not isinstance(output_columns, list):
             output_columns = [output_columns]
-        self.output_columns = replace_none(output_columns, input_columns)
+        self.output_columns = replace_none(output_columns, self.input_columns)
 
         self.cache = cache
         self.column_order = column_order
@@ -3025,8 +3025,9 @@ class ImageFolderDataset(MappableDataset):
             cc = self.cache.cache_client
         else:
             cc = None
+        class_indexing = replace_none(self.class_indexing, {})
         return cde.ImageFolderNode(self.dataset_dir, self.decode, self.sampler, self.extensions,
-                                   self.class_indexing, cc).SetNumWorkers(self.num_parallel_workers)
+                                   class_indexing, cc).SetNumWorkers(self.num_parallel_workers)
 
     def get_args(self):
         args = super().get_args()
@@ -4043,7 +4044,8 @@ class ManifestDataset(MappableDataset):
             cc = self.cache.cache_client
         else:
             cc = None
-        return cde.ManifestNode(self.dataset_file, self.usage, self.sampler, self.class_indexing,
+        class_indexing = replace_none(self.class_indexing, {})
+        return cde.ManifestNode(self.dataset_file, self.usage, self.sampler, class_indexing,
                                 self.decode, cc).SetNumWorkers(self.num_parallel_workers)
 
     @check_manifestdataset
@@ -4701,7 +4703,8 @@ class VOCDataset(MappableDataset):
             cc = self.cache.cache_client
         else:
            cc = None
-        return cde.VOCNode(self.dataset_dir, self.task, self.usage, self.class_indexing, self.decode,
+        class_indexing = replace_none(self.class_indexing, {})
+        return cde.VOCNode(self.dataset_dir, self.task, self.usage, class_indexing, self.decode,
                            self.sampler, cc).SetNumWorkers(self.num_parallel_workers)
 
     @check_vocdataset
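
Note on the contract this patch establishes: with std::optional removed from the binding signatures, "not provided" is now encoded as an empty container (or a null py::handle / CacheClient pointer), so the Python layer must normalize None into a concrete value before every call into cde, which is what the added replace_none(...) lines do. The helper itself is not shown in this diff; the sketch below is an assumption of its conventional behavior, inferred from how it is used above:

    # Sketch (assumption): replace_none is defined elsewhere in the dataset
    # Python layer; as relied on in the hunks above, it is equivalent to:
    def replace_none(value, default):
        """Return `default` when `value` is None, otherwise `value` unchanged."""
        return value if value is not None else default

    # Example: a VOCDataset constructed with class_indexing=None now hands
    # the C++ binding an empty dict rather than an absent optional.
    class_indexing = replace_none(None, {})  # -> {}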